In [5]:
import re
import tweepy
import config as cf
from json import loads
from datetime import datetime
import pandas as pd
from os import path
from pymongo import MongoClient,UpdateOne
import config as cf
import emoji
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from string import punctuation 

# VADER analyzer; pull_popular_tweets uses it to compute each tweet's compound score.
eval_obj = SentimentIntensityAnalyzer()

# Twitter credentials are read from the local `config` module (imported as `cf`).
api_key = cf.twitter_apikey
api_key_secret = cf.twitter_secretapikey
access_token = cf.twitter_accesstoken
access_token_secret = cf.twitter_accesstokensecret
auth = tweepy.OAuthHandler(api_key, api_key_secret)

# set access to user's access key and access secret
auth.set_access_token(access_token, access_token_secret)

# Build the shared API client. With wait_on_rate_limit=True the client sleeps
# when the rate limit is hit (see the "Rate limit reached. Sleeping for: ..."
# lines in the cell output) instead of failing the request.
api = tweepy.API(auth,wait_on_rate_limit=True)
# NOTE(review): `stats` is not read anywhere in the visible source.
stats = api.rate_limit_status()



# For each supported collection: (fields forming the upsert filter,
# fields written through ``$set``).
_UPSERT_FIELDS = {
    "Trends": (
        ('Trend_Name', 'WOEID'),
        ('Search_Query', 'Total_Tweets_Count', 'Pulled_At'),
    ),
    "Tweets": (
        ('tweet_id', 'WOEID'),
        (
            'tweet_created_at',
            'tweet_raw_text',
            'tweet_cleaned_text',
            'tweet_emojis',
            'tweet_hashtags',
            'tweet_compound_score',
            'tweet_lang',
            'tweet_source',
            'tweet_retweet_count',
            'tweet_like_count',
            'tweet_search_query',
            'tweet_scrapped_datetime_utc',
        ),
    ),
}


def load_db(df,table_name):
    """Upsert every row of ``df`` into the MongoDB collection ``table_name``.

    One ``UpdateOne(..., upsert=True)`` is built per row. The filter is the
    collection's key fields, the remaining fields are written with ``$set``,
    and ``lastModified`` is stamped via ``$currentDate``. All operations are
    sent in a single ``bulk_write`` call.

    Args:
        df: DataFrame holding the rows to store; columns missing from a row
            are written as ``None`` (``row.get`` semantics).
        table_name: ``"Trends"`` or ``"Tweets"``.

    Raises:
        ValueError: if ``table_name`` is not a supported collection.
    """
    try:
        key_fields, value_fields = _UPSERT_FIELDS[table_name]
    except KeyError:
        raise ValueError(
            "Unsupported table_name {!r}; expected one of {}".format(
                table_name, sorted(_UPSERT_FIELDS))
        ) from None

    updates = [
        UpdateOne(
            {field: row.get(field) for field in key_fields},
            {
                '$set': {field: row.get(field) for field in value_fields},
                "$currentDate": {"lastModified": True},
            },
            upsert=True,
        )
        for _, row in df.iterrows()
    ]
    if not updates:
        # bulk_write rejects an empty operation list, so an empty frame is a no-op.
        return

    db = MongoClient(cf.mdb_string)
    try:
        # The batch is written exactly once.
        db[cf.mdb_db][table_name].bulk_write(updates)
    finally:
        # Release the connection even when the write fails.
        db.close()

def cleanTweet(text):
    """Return ``text`` with Twitter markup and non-letter noise removed.

    Steps, applied in this order: strip ``@mentions``, drop ``#`` characters
    (the hashtag word itself is kept), remove the retweet marker ``RT``,
    remove http(s) links, turn newlines into spaces, delete digits, delete
    ASCII punctuation, and finally trim surrounding whitespace.

    Args:
        text: raw tweet text.

    Returns:
        The cleaned string. Inner runs of spaces are not collapsed.
    """
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # remove @mentions
    text = re.sub(r'\#', '', text)  # remove hash tag
    # \b anchors the marker to a word start so that words that merely end in
    # "RT" (e.g. "START now") are not truncated.
    text = re.sub(r'\bRT[\s]+', '', text)  # remove RT
    text = re.sub(r'https?:\/\/\S+', '', text)  # remove hyperlink
    text = re.sub(r'\n', ' ', text)  # remove next line character
    text = re.sub(r'[0-9]', '', text)  # remove numbers
    text = text.translate(str.maketrans('', '', punctuation))  # remove punctuations
    return text.strip()  # remove whitespaces


def read_location(filepath):
    """Load and return the parsed JSON content of ``filepath``.

    Args:
        filepath: path to a JSON file (``build_dataset`` passes
            ``"Locations.json"``).

    Returns:
        The decoded JSON value.

    Raises:
        FileExistsError: if ``filepath`` does not exist.
    """
    if not path.exists(filepath):
        # NOTE(review): FileNotFoundError would be the conventional type; the
        # original exception type is kept so existing handlers keep working.
        raise FileExistsError("File Doesnot exists!! Please check filepath")
    # The context manager closes the handle as soon as the content is read.
    with open(filepath, "r") as locations_file:
        return loads(locations_file.read())


def pull_trends(woeid,max_trends=50):
    """Fetch the trends for ``woeid`` that report a tweet volume.

    Only the first ``max_trends`` entries of the API response are examined;
    entries with no ``tweet_volume`` are skipped, so fewer than
    ``max_trends`` rows may be returned.

    Args:
        woeid: place identifier passed to ``api.get_place_trends``.
        max_trends: how many leading trends of the response to consider.

    Returns:
        A list of ``[name, query, tweet_volume, woeid, pulled_at]`` rows,
        where ``pulled_at`` is a UTC ``"%Y-%m-%d %H:%M:%S"`` string shared by
        all rows of the same pull.
    """
    response = api.get_place_trends(woeid)
    if not response:
        # Nothing came back for this place; avoid indexing an empty payload.
        return []

    pulled_at = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
    # max(..., 0) keeps a negative limit meaning "no trends", as range() did.
    candidates = response[0]['trends'][:max(max_trends, 0)]
    return [
        [trend['name'], trend['query'], trend['tweet_volume'], woeid, pulled_at]
        for trend in candidates
        # if no tweets counts return by API skip it
        if trend['tweet_volume']
    ]


def pull_popular_tweets(topics_list, woeid, max_tweet_topic=20,total_max_tweets=100):
    """Pull popular English tweets for each trend and return them as one frame.

    Args:
        topics_list: rows as produced by ``pull_trends``; index 0 is the trend
            name (stored as ``tweet_search_query``) and index 1 is the query
            string sent to the search endpoint.
        woeid: place identifier stored on every row as ``WOEID``.
        max_tweet_topic: maximum number of tweets requested per topic.
        total_max_tweets: upper bound on the number of rows returned.

    Returns:
        A DataFrame with one row per tweet (empty, with no columns, when
        nothing was pulled).
    """
    rows = []

    for topic in topics_list:
        # Enforce the overall cap: stop once it is reached and never request
        # more than what is still allowed.
        remaining = total_max_tweets - len(rows)
        if remaining <= 0:
            break
        tweets = tweepy.Cursor(api.search_tweets, q=topic[1],                         # search for each trending topic
                               lang="en", result_type='popular',                      # tweets in english , type is "recent"/"popular"
                               tweet_mode='extended',count=50).items(min(max_tweet_topic, remaining))
        tweets_list = list(tweets)
        print("Topic: "+topic[0]+" Pulled_Tweets: "+str(len(tweets_list)))
        for tweet in tweets_list:
            clean_text = cleanTweet(tweet.full_text)
            rows.append({
                'tweet_id': tweet.id,
                'WOEID': woeid,
                'tweet_created_at': tweet.created_at.strftime("%Y-%m-%d %H:%M:%S"),
                'tweet_raw_text': tweet.full_text.replace("'", "''"),
                'tweet_cleaned_text': clean_text,
                # emoji characters that survive cleaning, comma separated
                'tweet_emojis': ', '.join(c for c in clean_text if c in emoji.EMOJI_DATA),
                # hashtag words taken from the raw text (cleaning strips the '#')
                'tweet_hashtags': ", ".join(re.findall(r"#(\w+)", tweet.full_text)),
                'tweet_compound_score': eval_obj.polarity_scores(clean_text)['compound'],
                'tweet_lang': tweet.lang,
                'tweet_source': tweet.source,
                'tweet_retweet_count': tweet.retweet_count,
                'tweet_like_count': tweet.favorite_count,
                'tweet_search_query': topic[0],
                'tweet_scrapped_datetime_utc': datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
            })

    # Build the frame once instead of concatenating a one-row frame per tweet.
    df_return = pd.DataFrame(rows)
    print("Total Tweets Pulled: "+str(len(df_return.index)))
    return df_return


def build_dataset():
    """Scrape trends and popular tweets for every location and store them.

    For each entry of ``Locations.json`` the trends are pulled, popular tweets
    are pulled for those trends, and both are upserted with ``load_db`` into
    the ``"Tweets"`` and ``"Trends"`` collections. A location for which no
    tweets were pulled is skipped entirely (neither tweets nor trends are
    stored). Totals of stored rows are printed at the end.
    """
    trend_columns = ['Trend_Name', 'Search_Query', 'Total_Tweets_Count', 'WOEID', 'Pulled_At']
    records_trend = 0
    records_tweets = 0

    for location in read_location("Locations.json"):
        print("\nScraping Tweets for Location: "+location['name'])
        trend_list = pull_trends(location['woeid'], max_trends=20)
        tweets_df = pull_popular_tweets(
            trend_list, location['woeid'], max_tweet_topic=30, total_max_tweets=10000)
        if tweets_df.empty:
            continue

        # NOTE(review): the previous version also merged the trends with the
        # per-trend tweet counts and dropped unmatched rows, but never used the
        # result; the unfiltered trends are what has always been stored.
        trends_df = pd.DataFrame(trend_list, columns=trend_columns).fillna(0)

        records_trend += len(trends_df.index)
        records_tweets += len(tweets_df.index)
        load_db(tweets_df, "Tweets")
        load_db(trends_df, "Trends")

    print("Total Records Pulled\nTweets: {} Trends: {}".format(records_tweets,records_trend))
    # return final_tweets_df,final_trends_df
    # return final_tweets_df,final_trends_df

In [6]:
# Run the scrape-and-load pipeline; progress is printed per location and topic.
build_dataset()
# temp_tweet_df,trend_temp_df=build_dataset()
# load_db(temp_tweet_df,"Tweets")
# load_db(trend_temp_df,"Trends")


Scraping Tweets for Location: United States
Topic: Cowboys Pulled_Tweets: 30
Topic: Texans Pulled_Tweets: 30
Topic: Bears Pulled_Tweets: 30
Topic: Brazil Pulled_Tweets: 30
Topic: Dolphins Pulled_Tweets: 30
Topic: Steelers Pulled_Tweets: 30
Topic: Jets Pulled_Tweets: 30
Topic: Lovie Smith Pulled_Tweets: 20
Topic: Colts Pulled_Tweets: 30
Topic: #FinsUp Pulled_Tweets: 21
Topic: Rams Pulled_Tweets: 30
Topic: Browns Pulled_Tweets: 30
Topic: #HereWeGo Pulled_Tweets: 30
Topic: Patriots Pulled_Tweets: 30
Topic: #FlyEaglesFly Pulled_Tweets: 23
Total Tweets Pulled: 424

Scraping Tweets for Location: Canada
Topic: Brazil Pulled_Tweets: 30
Topic: Lula Pulled_Tweets: 30
Topic: Jets Pulled_Tweets: 30
Topic: #BillsMafia Pulled_Tweets: 30
Topic: Texans Pulled_Tweets: 30
Topic: Bears Pulled_Tweets: 30
Topic: #FinsUp Pulled_Tweets: 21
Topic: Cowboys Pulled_Tweets: 30
Topic: Dolphins Pulled_Tweets: 30
Topic: #MohammadGhobadlou Pulled_Tweets: 15
Topic: Hines Pulled_Tweets: 30
Topic: Lovie Smith Pulled_Tw

Rate limit reached. Sleeping for: 806


Topic: Florida Pulled_Tweets: 30
Topic: Colts Pulled_Tweets: 30
Topic: Bears Pulled_Tweets: 30
Topic: Texans Pulled_Tweets: 30
Topic: THEY CAN'T BEAT US Pulled_Tweets: 4
Total Tweets Pulled: 276

Scraping Tweets for Location: Kenya
Topic: Chelsea Pulled_Tweets: 30
Topic: #MCICHE Pulled_Tweets: 14
Topic: Potter Pulled_Tweets: 30
Topic: Tuchel Pulled_Tweets: 29
Topic: LGBTQ Pulled_Tweets: 30
Topic: #FACup Pulled_Tweets: 29
Topic: Mahrez Pulled_Tweets: 30
Topic: Gallagher Pulled_Tweets: 30
Topic: Blessed Sunday Pulled_Tweets: 16
Topic: Todd Boehly Pulled_Tweets: 30
Topic: Barca Pulled_Tweets: 30
Total Tweets Pulled: 298

Scraping Tweets for Location: United Kingdom
Topic: #HarryTheInterview Pulled_Tweets: 14
Topic: Brazil Pulled_Tweets: 30
Topic: Oprah Pulled_Tweets: 30
Topic: Stevenage Pulled_Tweets: 29
Topic: Chelsea Pulled_Tweets: 30
Topic: Camilla Pulled_Tweets: 30
Topic: Savic Pulled_Tweets: 14
Topic: Lula Pulled_Tweets: 30
Topic: Texans Pulled_Tweets: 30
Topic: Roma Pulled_Tweets: 3

Rate limit reached. Sleeping for: 805


Topic: Dembele Pulled_Tweets: 27
Topic: Mahrez Pulled_Tweets: 30
Topic: Xavi Pulled_Tweets: 24
Topic: Barca Pulled_Tweets: 30
Topic: Ferran Pulled_Tweets: 23
Topic: Atletico Pulled_Tweets: 28
Topic: Alvarez Pulled_Tweets: 27
Total Tweets Pulled: 332

Scraping Tweets for Location: Peru
Topic: Lula Pulled_Tweets: 30
Topic: Savic Pulled_Tweets: 15
Topic: Junior Pulled_Tweets: 30
Topic: Juliaca Pulled_Tweets: 0
Topic: Andrade Pulled_Tweets: 9
Topic: Ferran Pulled_Tweets: 23
Topic: #TEMPTATION Pulled_Tweets: 10
Topic: #TXT_Daydream Pulled_Tweets: 4
Topic: Brasilia Pulled_Tweets: 30
Topic: Pedri Pulled_Tweets: 26
Total Tweets Pulled: 177

Scraping Tweets for Location: New Zealand
Topic: Brazil Pulled_Tweets: 30
Topic: Democracy Pulled_Tweets: 30
Topic: FA Cup Pulled_Tweets: 30
Topic: Florida Pulled_Tweets: 30
Topic: McCarthy Pulled_Tweets: 30
Topic: Lakers Pulled_Tweets: 30
Topic: Andrew Tate Pulled_Tweets: 30
Topic: Elliot Pulled_Tweets: 29
Topic: Potter Pulled_Tweets: 30
Topic: Michael Pul

Rate limit reached. Sleeping for: 793


Topic: Kylian Pulled_Tweets: 27
Topic: #Bresil Pulled_Tweets: 0
Topic: Araujo Pulled_Tweets: 19
Topic: Lula Pulled_Tweets: 30
Topic: Deschamps Pulled_Tweets: 17
Topic: Ferran Pulled_Tweets: 23
Topic: Benzema Pulled_Tweets: 28
Topic: Xavi Pulled_Tweets: 24
Topic: Savic Pulled_Tweets: 15
Topic: #brasilia Pulled_Tweets: 15
Topic: #AtletiBarca Pulled_Tweets: 13
Topic: Molina Pulled_Tweets: 10
Total Tweets Pulled: 262

Scraping Tweets for Location: Germany
Topic: #MohammadGhobadlou Pulled_Tweets: 19
Topic: #MohammadBroghani Pulled_Tweets: 16
Topic: Lula Pulled_Tweets: 30
Topic: Dolphins Pulled_Tweets: 30
Topic: Sauna Pulled_Tweets: 12
Topic: Texans Pulled_Tweets: 30
Topic: THEY CAN'T BEAT US Pulled_Tweets: 4
Topic: Bears Pulled_Tweets: 30
Topic: Playoffs Pulled_Tweets: 30
Total Tweets Pulled: 201

Scraping Tweets for Location: Turkey
Topic: #FBvGS Pulled_Tweets: 8
Topic: Galatasaray Pulled_Tweets: 13
Topic: Ali Koç Pulled_Tweets: 0
Topic: Fenerbahçe Pulled_Tweets: 21
Topic: #Eytseçimibekliy

In [12]:
# Scratch check: a freshly constructed DataFrame has a zero-length index.
df=pd.DataFrame()
len(df.index)

0

In [15]:
def build_dataset():
    """Scrape trends and popular tweets for every location without storing them.

    For each entry of ``Locations.json`` the trends and their popular tweets
    are pulled, and the trends that actually produced tweets are printed.

    Returns:
        ``(final_tweets_df, final_trends_df)``: all pulled tweets and all
        pulled trends across locations, each with a fresh integer index.
    """
    trend_columns = ['Trend_Name', 'Search_Query', 'Total_Tweets_Count', 'WOEID', 'Pulled_At']
    tweet_frames = []
    trend_frames = []

    for location in read_location("Locations.json"):
        print("\nScraping Tweets for Location: "+location['name'])
        trend_list = pull_trends(location['woeid'], max_trends=20)
        trend_df = pd.DataFrame(trend_list, columns=trend_columns).fillna(0)
        trend_frames.append(trend_df)

        tweets_df = pull_popular_tweets(
            trend_list, location['woeid'], max_tweet_topic=30, total_max_tweets=10000)
        if tweets_df.empty:
            # An empty pull has no columns to group by; nothing to match.
            continue
        tweet_frames.append(tweets_df)

        # Keep only the trends for which at least one tweet was pulled.
        tweet_counts = (tweets_df.groupby(['WOEID', 'tweet_search_query'])
                        .size().reset_index(name='counts'))
        matched_trends = pd.merge(
            trend_df, tweet_counts, how="left",
            left_on=['WOEID', 'Trend_Name'],
            right_on=['WOEID', 'tweet_search_query']).dropna()
        print(matched_trends)
        # load_db(final_tweets_df,"Tweets3")
        # load_db(final_trends_df,"Trends3")

    final_tweets_df = pd.concat(tweet_frames, ignore_index=True) if tweet_frames else pd.DataFrame()
    final_trends_df = pd.concat(trend_frames, ignore_index=True) if trend_frames else pd.DataFrame()

    # Totals come from the final frames, so each row is counted exactly once.
    records_tweets = len(final_tweets_df.index)
    records_trend = len(final_trends_df.index)
    print("Total Records Pulled\nTweets: {} Trends: {}".format(records_tweets,records_trend))
    return final_tweets_df,final_trends_df
temp_tweet_df,trend_temp_df=build_dataset()


Scraping Tweets for Location: Germany
Topic: #3K23 Pulled_Tweets: 0
Topic: #DieZukunftGlaubtAnUns Pulled_Tweets: 0
Topic: Feiertag Pulled_Tweets: 0
Topic: #Marder Pulled_Tweets: 11
Topic: Vornamen Pulled_Tweets: 0
Topic: Panzer Pulled_Tweets: 3
Topic: Kassiererin Pulled_Tweets: 0
Topic: #FreeNella Pulled_Tweets: 0
Topic: #MohammadBroghani Pulled_Tweets: 4
Topic: Kriegspartei Pulled_Tweets: 0
Topic: Kriegstreiber Pulled_Tweets: 0
Topic: Waffenruhe Pulled_Tweets: 0
Topic: Bert Pulled_Tweets: 16
Topic: AND MY PLEASURE Pulled_Tweets: 3
Topic: start ins wochenende Pulled_Tweets: 0
Topic: Russland Pulled_Tweets: 3
Topic: Erfrischungsstäbchen Pulled_Tweets: 0
Topic: Könige Pulled_Tweets: 0
Topic: Bewerbungen Pulled_Tweets: 0
Topic: Paula Pulled_Tweets: 14
Total Tweets Pulled: 54
           Trend_Name           Search_Query  Total_Tweets_Count     WOEID  \
3             #Marder              %23Marder                 0.0  23424829   
5              Panzer                 Panzer             153

Rate limit reached. Sleeping for: 580


In [3]:
from pymongo import MongoClient
import pandas as pd
import config

# Connect using the connection string and database name from the local config module.
client = MongoClient(config.mdb_string)
mydatabase = client[config.mdb_db]
collection = mydatabase["Tweets"]
# NOTE(review): `all_record` is not read in the visible source; the frame
# below issues its own find().
all_record = collection.find()

# Load every stored tweet document into a frame and drop the `_id` column.
df_tweets = pd.DataFrame(list(collection.find()))
df_tweets = df_tweets.drop(columns=['_id'])

# Same again for the trends collection.
collection = mydatabase["Trends"]
all_record = collection.find()

df_trends = pd.DataFrame(list(collection.find()))
df_trends = df_trends.drop(columns=['_id'])
df_trends.columns

Index(['Trend_Name', 'WOEID', 'Pulled_At', 'Search_Query',
       'Total_Tweets_Count', 'lastModified'],
      dtype='object')

In [5]:
# Left-join each trend with the number of stored tweets for the same
# (WOEID, trend name); trends without tweets get NaN in the joined columns.
temp_df = pd.merge(df_trends
                   , df_tweets.groupby(['WOEID', 'tweet_search_query']).size().reset_index(name='counts')   
                   , how="left", left_on=['WOEID', 'Trend_Name'], right_on=['WOEID', 'tweet_search_query'])
# dropna() removes every row containing a NaN, which includes the unmatched trends.
df_trends=temp_df.dropna()
# Keep only the original trend columns (the join helper columns are discarded).
df_trends=df_trends[['Trend_Name', 'WOEID', 'Pulled_At', 'Search_Query',
           'Total_Tweets_Count', 'lastModified']]
df_trends


Unnamed: 0,Trend_Name,WOEID,Pulled_At,Search_Query,Total_Tweets_Count,lastModified
0,Pistons,23424977,2023-01-05 06:37:20,Pistons,0.0,2023-01-07 03:02:02.545
1,Hannity,23424977,2023-01-05 06:37:20,Hannity,52087.0,2023-01-07 03:02:02.545
2,#LakeShow,23424977,2023-01-05 06:37:20,%23LakeShow,0.0,2023-01-07 03:02:02.545
3,Klay,23424977,2023-01-05 06:37:20,Klay,28488.0,2023-01-07 03:02:02.545
4,Poole,23424977,2023-01-05 06:37:20,Poole,14481.0,2023-01-07 03:02:02.545
...,...,...,...,...,...,...
1094,Wagner,23424976,2023-01-07 17:57:36,Wagner,34740.0,2023-01-07 17:57:52.060
1095,Earth,23424976,2023-01-07 17:57:36,Earth,185230.0,2023-01-07 17:57:52.061
1096,#Bakhmut,23424976,2023-01-07 17:57:36,%23Bakhmut,11836.0,2023-01-07 17:57:52.061
1097,Jesus,23424976,2023-01-07 17:57:36,Jesus,245974.0,2023-01-07 17:57:52.062


In [9]:
# Same left join as before, but this time display the rows that contain a NaN,
# i.e. the rows the dropna() filter would discard.
temp_df = pd.merge(df_trends
                   , df_tweets.groupby(['WOEID', 'tweet_search_query']).size().reset_index(name='counts')   
                   , how="left", left_on=['WOEID', 'Trend_Name'], right_on=['WOEID', 'tweet_search_query'])
temp_df[temp_df.isna().any(axis=1)]

Unnamed: 0,Trend_Name,WOEID,Pulled_At,Search_Query,Total_Tweets_Count,lastModified,tweet_search_query,counts
13,#dreamspace,23424977,2023-01-05 06:37:20,%23dreamspace,0.0,2023-01-05 06:37:35.670,,
32,#dreamspace,23424775,2023-01-05 06:37:35,%23dreamspace,0.0,2023-01-05 06:37:48.921,,
35,#CanadaVsUSA,23424775,2023-01-05 06:37:35,%23CanadaVsUSA,0.0,2023-01-05 06:37:48.921,,
43,Airtel5G Plus In Bhubaneswar,23424848,2023-01-05 06:37:49,%22Airtel5G+Plus+In+Bhubaneswar%22,0.0,2023-01-05 06:37:58.504,,
47,PoliticalWill Lacking InSSRCs,23424848,2023-01-05 06:37:49,%22PoliticalWill+Lacking+InSSRCs%22,21586.0,2023-01-05 06:37:58.505,,
...,...,...,...,...,...,...,...,...
1358,Prens Harry,23424969,2023-01-07 02:09:24,%22Prens+Harry%22,0.0,2023-01-07 02:09:28.772,,
1359,#Ukraina,23424976,2023-01-07 02:09:28,%23Ukraina,0.0,2023-01-07 02:09:43.267,,
1364,може,23424976,2023-01-07 02:09:28,%D0%BC%D0%BE%D0%B6%D0%B5,10595.0,2023-01-07 02:09:43.270,,
1365,#cellulite,23424976,2023-01-07 02:09:28,%23cellulite,0.0,2023-01-07 02:09:43.271,,


In [3]:
# Display the current contents of the trends frame.
df_trends


Unnamed: 0,Trend_Name,WOEID,Pulled_At,Search_Query,Total_Tweets_Count,lastModified
0,#LBCPSG,23424819,2023-01-07 02:14:06,%23LBCPSG,14907.0,2023-01-07 02:14:14.183
1,Châteauroux,23424819,2023-01-07 02:14:06,Ch%C3%A2teauroux,20900.0,2023-01-07 02:14:14.183
2,Gharbi,23424819,2023-01-07 02:14:06,Gharbi,0.0,2023-01-07 02:14:14.183
3,Sarabia,23424819,2023-01-07 02:14:06,Sarabia,0.0,2023-01-07 02:14:14.183
4,#CannesComedyShow,23424819,2023-01-07 02:14:06,%23CannesComedyShow,0.0,2023-01-07 02:14:14.183
5,#VendrediLecture,23424819,2023-01-07 02:14:06,%23VendrediLecture,0.0,2023-01-07 02:14:14.183
6,#Epiphanie,23424819,2023-01-07 02:14:06,%23Epiphanie,0.0,2023-01-07 02:14:14.183
7,#CoupeDeFrance,23424819,2023-01-07 02:14:06,%23CoupeDeFrance,0.0,2023-01-07 02:14:14.184
8,Bernat,23424819,2023-01-07 02:14:06,Bernat,0.0,2023-01-07 02:14:14.184
9,Emery,23424819,2023-01-07 02:14:06,Emery,11731.0,2023-01-07 02:14:14.184


In [None]:
# NOTE(review): `trends` is only assigned inside pull_trends in the visible
# source; this cell needs a module-level `trends` from elsewhere to run.
trends[0]['trends']

In [None]:
from snscrape.modules.twitter import TwitterTrendsScraper
import snscrape.modules.twitter as snstwitter

def pull_popular_tweets(topics_list,woeid,maximum_tweets):
    """Scrape tweets for each topic with snscrape and return them as a frame.

    Args:
        topics_list: search strings, each passed to ``TwitterSearchScraper``.
        woeid: place identifier stored on every row.
        maximum_tweets: maximum number of tweets scraped per topic
            (``None`` means no limit).
            NOTE(review): interpreted as a per-topic cap — confirm.

    Returns:
        A DataFrame with one row per scraped tweet.
    """
    from itertools import islice

    rows = []
    for topic in topics_list:
        scraper = snstwitter.TwitterSearchScraper(topic)
        # islice stops the otherwise unbounded result stream at the cap.
        for tweets in islice(scraper.get_items(), maximum_tweets):
            rows.append({
                'tweet_id': tweets.id,
                'woeid': woeid,
                'tweet_created_at': tweets.date.strftime("%Y-%m-%d %H:%M:%S"),
                'tweet_text': tweets.rawContent.replace("'", "''"),
                'tweet_lang': tweets.lang,
                'tweet_source': tweets.sourceLabel,
                'tweets_reply_count': tweets.replyCount,
                'tweets_retweet_count': tweets.retweetCount,
                'tweets_like_count': tweets.likeCount,
                # the topic being searched, not a leftover module-level `trend`
                'tweets_search_query': topic,
                'tweets_location': tweets.coordinates,
                'tweets_scrapped_datetime': datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
            })
    return pd.DataFrame(rows)
    

In [None]:
def grab_popular_tweets(topic_list, max_tweets):
    """Pull popular English tweets for each topic into a single DataFrame.

    Args:
        topic_list: search queries, one per trending topic.
        max_tweets: maximum number of tweets to pull per topic.

    Returns:
        A DataFrame with the columns listed in ``columns`` below. Within each
        topic the rows are ordered by retweet count, then favorites, highest
        first; topics appear in the order of ``topic_list``.
    """
    columns = [ 'pulled_at', 'created_at', 'username', 'user_location', 'region', 'search_type',
               'trending_topic', 'retweetcount', 'favorites', 'text', 'hashtags', 'emojis']

    topic_frames = []

    for topic in topic_list:
        # Same search endpoint the rest of this file uses (api.search_tweets).
        tweets = tweepy.Cursor(api.search_tweets, q = topic,
                         lang="en", result_type = 'popular',          # tweets in english , type is "recent"/"popular"
                          tweet_mode = 'extended').items(max_tweets)  # longer tweets, at most max_tweets of them

        topic_rows = []
        for tweet in tweets:
            try:
                text = tweet.retweeted_status.full_text    # store text if it's a retweet
            except AttributeError:
                text = tweet.full_text                     # store text if it's a regular tweet

            topic_rows.append([
                datetime.now().strftime("%Y-%m-%d %H:%M:%S"),                  # time tweet was pulled
                tweet.created_at,                                              # time tweet created
                tweet.user.screen_name,
                tweet.user.location,
                "USA",                                                         # region is fixed to USA
                'popular',                                                     # search type
                topic,
                tweet.retweet_count,
                tweet.favorite_count,
                text,
                [h['text'].lower() for h in tweet.entities['hashtags']],
                # distinct emoji characters, in order of first appearance
                list(dict.fromkeys(char for char in text if char in emoji.EMOJI_DATA)),
            ])

        # Build the per-topic frame in one go rather than appending row by row.
        tweets_topic = pd.DataFrame(topic_rows, columns = columns)
        tweets_topic.sort_values(by=['retweetcount', 'favorites'], inplace = True, ascending = False)
        topic_frames.append(tweets_topic)

    if not topic_frames:
        return pd.DataFrame(columns = columns)
    return pd.concat(topic_frames, ignore_index = True, sort = False)

In [None]:
from snscrape.modules.twitter import TwitterTrendsScraper
import snscrape.modules.twitter as snstwitter
from datetime import datetime
import pandas as pd
import os 
import mysql.connector
import config as cf

# SECURITY NOTE(review): the database password was hard-coded in this cell.
# It can now be supplied through the STAGE_DB_PASSWORD environment variable;
# the literal fallback only keeps the existing setup working and should be
# removed (and the credential rotated) once the variable is configured.
mydb = mysql.connector.connect(
  host="localhost",
  port=3306,
  user="sysdba",
  password=os.environ.get("STAGE_DB_PASSWORD", "!mK!ngP@t3"),
  database = "StageTweets_Info"
  )
mycursor = mydb.cursor()


# For each scraped trend, stage the tweets found for it in scraped_tweets_stage.
for i, trend in enumerate(TwitterTrendsScraper().get_items()):
    print(trend.json())
    print(trend.name)
    print(trend.domainContext)
    for tweet_cnt, tweets in enumerate(snstwitter.TwitterSearchScraper(trend.name).get_items()):
        temp_dict={
            'tweet_id': tweets.id,
            'woeid' : 0,  # placeholder: no place id is available in this flow
            'tweet_created_at': tweets.date.strftime("%Y-%m-%d %H:%M:%S"),
            # raw text; quoting is handled by the driver, so no manual escaping
            'tweet_text': tweets.rawContent,
            'tweet_lang': tweets.lang,
            'tweet_source': tweets.sourceLabel,
            'tweets_reply_count': tweets.replyCount,
            'tweets_retweet_count': tweets.retweetCount,
            'tweets_like_count': tweets.likeCount,
            'tweets_search_query': trend.name,
            'tweets_location': tweets.coordinates,
            'tweets_scrapped_datetime': datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
        }

        # Values are still stored as strings with '/' replaced by '_', but they
        # are bound as parameters so tweet text can never alter the statement.
        values = tuple(str(x).replace('/', '_') for x in temp_dict.values())
        placeholders = ', '.join(['%s'] * len(values))

        sql = "INSERT INTO scraped_tweets_stage VALUES ({})".format(placeholders)
        mycursor.execute(sql, values)

        # Stop after the tweet with index 1000 for this trend.
        if tweet_cnt == 1000:
            break
    # Stop after the trend with index 20.
    if i==20:
        break
mydb.commit()


In [None]:
# Commit whatever has been executed on `mydb` so far (e.g. after an interrupted run).
mydb.commit()


In [None]:
# Scratch: render the keys of the last `temp_dict` as a comma-separated,
# single-quoted list, with '/' replaced by '_'.
# NOTE(review): single quotes produce string literals in SQL, not column
# identifiers — confirm before using this in a statement.
columns = ', '.join("'" + str(x).replace('/', '_') +
                    "'" for x in temp_dict.keys())
columns


In [None]:
import json

# Read woeid.json and keep only the entries whose place type is 'Country'.
# The context manager closes the file as soon as it has been read.
with open("woeid.json","r") as file:
    json_file=json.loads(file.read())

required_content=[content for content in json_file
                  if content['placeType']['name']=='Country']

len(required_content)

In [None]:
import json
# Write the filtered entries to Locations.json (the file build_dataset reads),
# pretty-printed with a 4-space indent.
with open("Locations.json","w") as outfile:
    json.dump(required_content,outfile,indent=4)

In [None]:
import json

# Re-read Locations.json to check how many entries were written.
# The context manager closes the file as soon as it has been read.
with open("Locations.json","r") as file:
    json_file=json.loads(file.read())
len(json_file)

In [90]:
from pymongo import MongoClient
from pymongo import UpdateOne
import config
from datetime import datetime



In [88]:
# Inspect the attributes of `db`; the output below shows a pymongo Database
# named 'stage_tweets'.
# NOTE(review): `db` is not assigned at module scope in the visible source.
vars(db)

{'_BaseObject__codec_options': CodecOptions(document_class=dict, tz_aware=False, uuid_representation=UuidRepresentation.UNSPECIFIED, unicode_decode_error_handler='strict', tzinfo=None, type_registry=TypeRegistry(type_codecs=[], fallback_encoder=None), datetime_conversion=DatetimeConversion.DATETIME),
 '_BaseObject__read_preference': Primary(),
 '_BaseObject__write_concern': WriteConcern(),
 '_BaseObject__read_concern': ReadConcern(),
 '_Database__name': 'stage_tweets',
 '_Database__client': MongoClient(host=['127.0.0.1:27017'], document_class=dict, tz_aware=False, connect=True),
 '_timeout': None}