In [1]:
import GetOldTweets3 as got3
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta
import re
import nltk
import time

In [2]:
# Show every column and allow wide rows when frames are rendered in the notebook.
pd.options.display.max_columns = None
pd.options.display.width = 350

I will extract top tweets on the specified topic for the past 13 years. The query runs one day at a time, and up to 10 top tweets are extracted for each day.

In [8]:

def get_tweets(topic, end_date, years=13, max_tweets=10, pause_seconds=2.5):
    """Collect top English tweets matching ``topic``, one query per day.

    The search window ends on ``end_date`` and reaches back ``int(365 * years)``
    days. Every pair of consecutive days is sent to GetOldTweets3 as a
    (since, until) range with the "top tweets" flag enabled.

    Parameters
    ----------
    topic : str
        Search text handed to ``setQuerySearch``.
    end_date : str
        Last day of the window, formatted as ``YYYY-MM-DD``.
    years : int or float, default 13
        Length of the window; converted to days as ``int(365 * years)``.
        Zero or negative values produce no queries.
    max_tweets : int, default 10
        Upper bound passed to ``setMaxTweets`` for each daily query.
    pause_seconds : float, default 2.5
        Seconds to sleep after every query to go easy on the server.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with the columns id, url, author, retweets,
        favorites, mentions, hashtags, geo, time and text, indexed 0..n-1.

    Raises
    ------
    ValueError
        If ``end_date`` does not match ``%Y-%m-%d``.
    """
    columns = ['id', 'url', 'author', 'retweets', 'favorites', 'mentions', 'hashtags', 'geo', 'time', 'text']
    text_query = topic

    # Parse strictly so a malformed end_date fails before any request is made.
    end_date_datetime = datetime.strptime(end_date, '%Y-%m-%d')

    # Day boundaries, oldest first and ending on end_date; each consecutive
    # pair becomes the (since, until) window of one query.
    total_days = max(int(365 * years), 0)
    date_range_string = list(
        pd.date_range(end=end_date_datetime, periods=total_days + 1, freq='D').strftime('%Y-%m-%d')
    )
    print("Start querying data...")

    # Rows are accumulated in a plain list and turned into a DataFrame once at
    # the end; concatenating inside the loop copies the whole frame every day.
    rows = []

    start_time = time.perf_counter()
    batch_start_time = start_time
    day_windows = zip(date_range_string, date_range_string[1:])
    for completed, (since, until) in enumerate(day_windows, start=1):
        tweetCriteria = got3.manager\
                .TweetCriteria()\
                .setQuerySearch(text_query)\
                .setLang('en')\
                .setSince(since)\
                .setUntil(until)\
                .setTopTweets(True)\
                .setMaxTweets(max_tweets)\
                .setEmoji("unicode")

        tweets = got3.manager.TweetManager.getTweets(tweetCriteria)

        rows.extend(
            [tweet.id, tweet.permalink, tweet.username, tweet.retweets,
             tweet.favorites, tweet.mentions, tweet.hashtags, tweet.geo,
             tweet.date, tweet.text]
            for tweet in tweets
        )

        print("Query between {} and {} complete!".format(since, until))
        print("Pause the operation...")
        time.sleep(pause_seconds)  # pause between requests to avoid hammering the server
        print("Begin new iteration...")

        # Report how long each batch of 30 daily queries took, then restart the clock.
        if completed % 30 == 0:
            batch_end_time = time.perf_counter()
            print("Pause time per 30 iterations: {} seconds".format(str(round(batch_end_time - batch_start_time, 2))))
            batch_start_time = batch_end_time

    compiled_tweets_df = pd.DataFrame(rows, columns=columns)

    # Measured from a fresh clock reading so it is defined even when fewer
    # than 30 queries ran.
    print("Operation complete!")
    print("Total Runtime: {} seconds!".format(str(round(time.perf_counter() - start_time, 2))))

    return compiled_tweets_df
    

In [9]:
# Daily top tweets for the search text "immigra", covering the 13 years up to the end of 2019.
tweets = get_tweets(topic="immigra", end_date="2019-12-31", years=13)

Start querying data...
Query between 2007-01-03 and 2007-01-04 complete!
Pause the operation...
Begin new iteration...
Query between 2007-01-04 and 2007-01-05 complete!
Pause the operation...
Begin new iteration...
Query between 2007-01-05 and 2007-01-06 complete!
Pause the operation...
Begin new iteration...
Query between 2007-01-06 and 2007-01-07 complete!
Pause the operation...
Begin new iteration...
Query between 2007-01-07 and 2007-01-08 complete!
Pause the operation...
Begin new iteration...
Query between 2007-01-08 and 2007-01-09 complete!
Pause the operation...
Begin new iteration...
Query between 2007-01-09 and 2007-01-10 complete!
Pause the operation...
Begin new iteration...
Query between 2007-01-10 and 2007-01-11 complete!
Pause the operation...
Begin new iteration...
Query between 2007-01-11 and 2007-01-12 complete!
Pause the operation...
Begin new iteration...
Query between 2007-01-12 and 2007-01-13 complete!
Pause the operation...
Begin new iteration...
Query between 200

KeyboardInterrupt: 

In [None]:
# Positional lookup: returns exactly the first tweet's text even when index
# labels repeat (a label lookup of 0 would then return several rows).
tweets["text"].iloc[0]

In [None]:
# Replace the existing index with a clean 0..n-1 range, discarding the old labels.
tweets = tweets.reset_index(drop=True)

In [None]:
# Peek at the text of the row labelled 3.
tweets.loc[3, "text"]

In [None]:
def clean_text(text):
    """Normalize a tweet's text for downstream analysis.

    Steps, in order: lowercase, drop URLs, replace non-ASCII runs with a
    space, drop whole hashtag tokens, strip punctuation, and collapse
    whitespace so the result has single spaces and no leading/trailing blanks.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. None or NaN from a missing
        cell) is treated as empty.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing is left.
    """
    if not isinstance(text, str):
        return ''

    # Lowercase first so the URL pattern also catches "HTTP://..." links.
    text = text.lower()
    text = re.sub(r'http\S+', '', text)  # remove url
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # replace the non-ASCII characters with space

    # Hashtags must be dropped while '#' is still present, i.e. before
    # punctuation is stripped.
    words = [word for word in text.split() if not word.startswith("#")]
    text = re.sub(r'[^\w\s]', '', ' '.join(words))  # remove punctuations

    # Stripping punctuation can leave empty gaps ("a - b" -> "a  b"); re-split
    # to collapse them.
    return ' '.join(text.split())
    

In [None]:
# Clean every tweet's text element by element.
tweets["text"] = tweets["text"].map(clean_text)

In [None]:
# Make sure the timestamp column holds pandas datetimes.
tweets["time"] = tweets["time"].pipe(pd.to_datetime)

In [None]:
# Preview the first five rows.
tweets.head(n=5)

In [None]:
# Drop timezone info from the timestamps.
# NOTE(review): presumably needed because the Excel export below rejects tz-aware datetimes — confirm.
tweets = tweets.assign(time=tweets["time"].dt.tz_localize(None))

In [None]:
# Persist the cleaned tweets to an Excel workbook in the working directory.
tweets.to_excel(excel_writer="immigration_tweets_13_years.xlsx")

In [None]:
# tweets.to_csv("immigration_tweets_data_example.csv")