In [1]:
import GetOldTweets3 as got3
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta
import re
import nltk
import time

In [13]:
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 350)

I will extract top tweets for the past 10 years. 100 top tweets will be extracted each month on the specified topic for past 10 years

In [21]:

def get_tweets(topic, end_date, years=13):
    columns = ['id', 'url', 'author', 'retweets', 'favorites', 'mentions', 'hashtags', 'geo', 'time', 'text']
    text_query = topic
    
    # convert the end_date string to the datetime object
    end_date_datetime = datetime.strptime(end_date, '%Y-%m-%d')

    # This section is to create a list of datetime objects that are 1 month apart going backwards
    date_range = [end_date_datetime]
    total_days = int(365 * years)
    
    prior_date = end_date_datetime
    
    while total_days > 0:
        prior_date = prior_date - relativedelta(days=1)
        date_range.append(prior_date)
        total_days -= 1
        
    # Convert the elements in the list from datetime objects to string
    date_range_string = [twitter_date.strftime('%Y-%m-%d') for twitter_date in date_range][::-1]
    print("Start querying data...")
    
    # Initialize an empty list to store dataframe from each iteration
    compiled_tweets_df = pd.DataFrame(columns=columns)
    
    
    for i in range(len(date_range_string)-1):
        tweetCriteria = got3.manager\
                .TweetCriteria()\
                .setQuerySearch(text_query)\
                .setLang('en')\
                .setSince(date_range_string[i])\
                .setUntil(date_range_string[i+1])\
                .setTopTweets(True)\
                .setMaxTweets(4)\
                .setEmoji("unicode")

        tweets = got3.manager.TweetManager.getTweets(tweetCriteria)

        text_tweets = [[tweet.id, tweet.permalink, tweet.username, tweet.retweets, 
                tweet.favorites, tweet.mentions, tweet.hashtags, tweet.geo, 
                tweet.date, tweet.text] for tweet in tweets]

        sample_tweets = pd.DataFrame(text_tweets, columns=columns)
        compiled_tweets_df = pd.concat([compiled_tweets_df, sample_tweets])
        print("Query between {} and {} complete!".format(date_range_string[i], date_range_string[i+1]))
        print("Pause the operation...")
        time.sleep(2.5) # pause for 2.5 seconds to avoid server crashing
        print("Begin new iteration...")
        
    return compiled_tweets_df # concatenate the list of dataframe into a single dataframe
    print("Operation complete!")
    

In [22]:
tweets = get_tweets("immigra", "2019-12-31", 0.1)

Start querying data...
Query between 2019-11-25 and 2019-11-26 complete!
Pause the operation...
Begin new iteration...
Query between 2019-11-26 and 2019-11-27 complete!
Pause the operation...
Begin new iteration...
Query between 2019-11-27 and 2019-11-28 complete!
Pause the operation...
Begin new iteration...
Query between 2019-11-28 and 2019-11-29 complete!
Pause the operation...
Begin new iteration...
Query between 2019-11-29 and 2019-11-30 complete!
Pause the operation...
Begin new iteration...
Query between 2019-11-30 and 2019-12-01 complete!
Pause the operation...
Begin new iteration...
Query between 2019-12-01 and 2019-12-02 complete!
Pause the operation...
Begin new iteration...
Query between 2019-12-02 and 2019-12-03 complete!
Pause the operation...
Begin new iteration...
Query between 2019-12-03 and 2019-12-04 complete!
Pause the operation...
Begin new iteration...
Query between 2019-12-04 and 2019-12-05 complete!
Pause the operation...
Begin new iteration...
Query between 201

In [24]:
tweets["text"][0]

0    Some Florida prison guards to be sworn as ICE ...
0    Issued by the U.S. Immigration and Customs Enf...
0    We're honored to be awarded Immigration Attorn...
0    We are grateful to assist those whose dream it...
0    Are you scheduled for an interview at a local ...
0    Millions of petitions & applications are delay...
0    Apparently democratic voters aren’t so sure. I...
0    Are you traveling this holiday season? If so, ...
0    Immigration Attorneys, LLP attorney Sara Barto...
0    There are certain required #naturalization ele...
0    This great country is based on freedom. With s...
0    “Stephen, thought you might like to see sample...
0    Today, we remember the battle and the brave pe...
0    We have four offices nationwide to serve you. ...
0    The story of your immigration journey matters ...
0    Immigration forms and requirements evolve quic...
0    Krista Eyler, an Immigration Attorneys, LLP at...
0    We're honored to be awarded Immigration Attorn...
0    Delay

In [25]:
tweets.index = range(len(tweets))

In [29]:
tweets["text"][3]

"Issued by the U.S. Immigration and Customs Enforcement's (ICE) and Removal Operations (ERO), these announcements detail enforcement activities and resulting apprehensions. Read more here: https://www.aila.org/infonet/ice-announcements-of-enforcement-actions #2019 #immatty #immigration #immigrationresults #citizenship"

In [30]:
def clean_text(text):
    text = re.sub(r'http\S+', '', text) # remove url
    text = text.lower() # convert text to lower case
    text = text.split()
    text = [i for i in text if not i.startswith("#")]
    text = ' '.join(text)
    text = re.sub(r'[^\w\s]', '', text) # remove punctuations

    return text
    

In [31]:
tweets["text"] = tweets["text"].apply(clean_text)

In [37]:
tweets["time"] = pd.to_datetime(tweets["time"])

In [40]:
tweets.head()

Unnamed: 0,id,url,author,retweets,favorites,mentions,hashtags,geo,time,text
0,1199067809273147392,https://twitter.com/MiamiPlow/status/119906780...,MiamiPlow,0,0,,#Miami #FL,,2019-11-25 20:50:17+00:00,some florida prison guards to be sworn as ice ...
1,1199056441870618626,https://twitter.com/immigra_results/status/119...,immigra_results,1,1,,#immatty #immigration #immigrationresults #cit...,,2019-11-25 20:05:07+00:00,we are grateful to clients who tell us how we ...
2,1199038356954451968,https://twitter.com/RePEc_NEP_EUR/status/11990...,RePEc_NEP_EUR,0,0,,,,2019-11-25 18:53:16+00:00,does integration policy improve labour market ...
3,1199420095635832832,https://twitter.com/immigra_results/status/119...,immigra_results,0,0,,#immatty #immigration #immigrationresults #cit...,,2019-11-26 20:10:09+00:00,issued by the us immigration and customs enfor...
4,1199141541379497984,https://twitter.com/RePEc_NEP_MIG/status/11991...,RePEc_NEP_MIG,0,0,,,,2019-11-26 01:43:17+00:00,does integration policy improve labour market ...


In [44]:
tweets["time"][1].dt.tz_localize(None)

AttributeError: 'Timestamp' object has no attribute 'dt'

In [38]:
tweets.to_excel("demonstration.xlsx")

ValueError: Excel does not support datetimes with timezones. Please ensure that datetimes are timezone unaware before writing to Excel.

In [8]:
tweets.to_csv("immigration_tweets_data_example.csv")

In [20]:
tweets["text"][0]

'us rallies urge immigration reform protesters say bushs proposed overhaul of immigra'

In [19]:
pd.read_csv("immigration_tweets_data_example.csv").iloc[:, -1][0]

'us rallies urge immigration reform protesters say bushs proposed overhaul of immigra'