In [1]:
from twitterscraper import query_tweets
from datetime import timedelta, date
import pandas as pd
import csv

In [2]:
def scrape(query, limit_per_day, start_date, end_date):
    """Fetch tweets matching ``query`` and return them as a cleaned DataFrame.

    Parameters
    ----------
    query : str
        Search string handed to ``query_tweets``.
    limit_per_day : int
        Forwarded to ``query_tweets`` as its ``limit`` argument.
    start_date, end_date : datetime.date or datetime.datetime
        Forwarded to ``query_tweets`` as ``begindate`` / ``enddate`` and then
        re-applied as an inclusive filter on the ``date`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``date, fullname, id, likes, replies, retweets, text, url,
        user, month``; rows containing missing values and exact duplicate
        rows are removed. ``month`` is the first day of the tweet's month.
        An empty frame with the same columns is returned when the query
        yields no tweets.
    """
    # Fixed column order (the one shown in this notebook's displayed output).
    # The result is later appended to a CSV without a header row, so the
    # order must not depend on how the installed pandas orders dict keys.
    tweet_fields = ['date', 'fullname', 'id', 'likes', 'replies',
                    'retweets', 'text', 'url', 'user']

    found = query_tweets(query=query,
                         limit=limit_per_day,
                         begindate=start_date,
                         enddate=end_date,
                         poolsize=20, lang='en')
    records = [{'date': tweet.timestamp, 'text': tweet.text,
                'fullname': tweet.fullname, 'id': tweet.id,
                'likes': tweet.likes, 'replies': tweet.replies,
                'retweets': tweet.retweets, 'url': tweet.url,
                'user': tweet.user}
               for tweet in found]

    # Without this guard an empty result has no 'date' column and the
    # lookups below raise KeyError.
    if not records:
        return pd.DataFrame(columns=tweet_fields + ['month'])

    df = pd.DataFrame(records, columns=tweet_fields)
    df['date'] = pd.to_datetime(df['date'])
    df = df.dropna().drop_duplicates()

    # Compare Timestamp with Timestamp: recent pandas versions refuse to
    # order a datetime64 column against a plain datetime.date.
    lower, upper = pd.Timestamp(start_date), pd.Timestamp(end_date)
    df = df[(df['date'] >= lower) & (df['date'] <= upper)]

    # Derived after dropna so a missing timestamp is dropped, not a crash.
    return df.assign(
        month=df['date'].apply(lambda ts: date(ts.year, ts.month, 1)))

In [3]:
def merge_and_clean(df_list):
    """Stack the given DataFrames and tidy the result.

    The frames are concatenated row-wise (each row keeps the index label it
    had in its source frame), then rows containing any missing value and
    rows that exactly repeat an earlier row are removed.
    """
    combined = pd.concat(df_list)
    return combined.dropna().drop_duplicates()

In [4]:
# One (start, end) window per month of 2016: each starts on the last day of
# the preceding month and ends on the first day of the following month.
# NOTE(review): the meaning of "odd" vs "even" is not stated in this
# notebook — confirm with the author.
odd_2016 = [
    (date(2016, m, 1) - timedelta(days=1),
     date(2016 + m // 12, m % 12 + 1, 1))   # December rolls over to 2017-01-01
    for m in range(1, 13)
]

In [5]:
# One (start, end) window per month of 2016: from the first day of the month
# to the first day of the following month.
even_2016 = [
    (date(2016, m, 1),
     date(2016 + m // 12, m % 12 + 1, 1))   # December rolls over to 2017-01-01
    for m in range(1, 13)
]

In [6]:
# Split each 12-window list into four consecutive groups of three windows
# (the last slice is open-ended and takes whatever remains from index 9).
odd1, odd2, odd3, odd4 = (
    odd_2016[:3], odd_2016[3:6], odd_2016[6:9], odd_2016[9:],
)
even1, even2, even3, even4 = (
    even_2016[:3], even_2016[3:6], even_2016[6:9], even_2016[9:],
)

In [7]:
def go(date_range, query_term, limit_per_day=50):
    """Scrape ``query_term`` over every window in ``date_range`` and merge the results.

    Parameters
    ----------
    date_range : iterable of (start_date, end_date)
        Each entry's first two items are passed to ``scrape`` as the start
        and end of one window.
    query_term : str
        Search string passed to ``scrape`` for every window.
    limit_per_day : int, optional
        Limit passed to ``scrape`` for every window. Defaults to 50, the
        value this function previously hard-coded, so existing two-argument
        calls behave as before.

    Returns
    -------
    The value of ``merge_and_clean`` applied to the list of per-window
    results, in the order the windows were given.
    """
    frames = [
        scrape(query_term, limit_per_day, window[0], window[1])
        for window in date_range
    ]
    return merge_and_clean(frames)

In [8]:
# NOTE(review): the go() calls in the cells below pass only the date windows
# and query_term, so this value is not handed to the scraper by them.
limit_per_day = 20
# Search phrase passed to go() in the cells below.
query_term = 'electoral college'

In [9]:
# Rebinds even3 from the list of date windows (cell 6) to the value returned
# by go(); re-running this cell without re-running cell 6 first would pass
# that returned value, not the date windows, to go().
even3 = go(date_range=even3, query_term=query_term)

In [10]:
# Rebinds even4 from the list of date windows (cell 6) to the value returned
# by go(); re-running this cell without re-running cell 6 first would pass
# that returned value, not the date windows, to go().
even4 = go(date_range=even4, query_term=query_term)

In [11]:
# Combine the two scraped batches into one frame via merge_and_clean.
df = merge_and_clean(df_list=[even3, even4])

In [12]:
#for date in df.sort_values('date').date.unique():
   #print(date)

In [13]:
# Earliest value in the 'date' column of the merged frame.
min(df['date'])

Timestamp('2016-07-02 22:03:19')

In [14]:
# Latest value in the 'date' column of the merged frame.
max(df['date'])

Timestamp('2016-12-31 23:59:49')

In [15]:
# Appends to the file (mode='a') and writes no header row, so every run of
# this cell adds the rows again and the column order must match whatever is
# already in the file.
df.to_csv('twitter_data/andrea_scraped/electoralcollege2016', header=False, mode='a')

In [16]:
#len(df)

In [17]:
# Show the merged frame as this cell's output.
df

Unnamed: 0,date,fullname,id,likes,replies,retweets,text,url,user,month
0,2016-07-22 23:58:57,Markham Robinson,756639703625936896,3,0,1,。@jslagra The real candidates in Presidential ...,/BraveLad/status/756639703625936896,BraveLad,2016-07-01
1,2016-07-22 23:57:55,aka David Dennison,756639444376190977,0,0,0,If Kaine is picked purely because of Virginia ...,/rowast/status/756639444376190977,rowast,2016-07-01
2,2016-07-22 23:55:10,Markham Robinson,756638750621339648,5,1,5,If the AIP & the CA Reps take 5 & 50 Electoral...,/BraveLad/status/756638750621339648,BraveLad,2016-07-01
3,2016-07-22 23:55:05,Axman27 🐉,756638731264819201,0,0,0,no...its the electoral college that decides a ...,/axman_27/status/756638731264819201,axman_27,2016-07-01
4,2016-07-22 23:53:56,butterball,756638440691605504,0,1,0,haha silly goose of course i know what the ele...,/teamBasilio/status/756638440691605504,teamBasilio,2016-07-01
5,2016-07-22 23:52:54,Markham Robinson,756638180011421696,5,0,2,The AIP can by agreement with the CA Republica...,/BraveLad/status/756638180011421696,BraveLad,2016-07-01
6,2016-07-22 23:52:22,Daniel Parsons,756638047505149952,3,0,0,My fav part of The Purge: Election Year is tha...,/dandanparsons/status/756638047505149952,dandanparsons,2016-07-01
7,2016-07-22 23:48:52,Sarah Reynolds,756637163316387840,2,1,2,"Google ""electoral college,"" learn math, and th...",/Sarah__Reynolds/status/756637163316387840,Sarah__Reynolds,2016-07-01
8,2016-07-22 23:45:07,James S Russell,756636223368630272,0,0,0,Clinton’s Lead in Electoral College is Falling...,/jamesswriter/status/756636223368630272,jamesswriter,2016-07-01
9,2016-07-22 23:44:35,Meshuggenah Nitabach,756636085707571200,0,1,0,"Totally agree with the ""let's do this rhetoric...",/mnitabach/status/756636085707571200,mnitabach,2016-07-01


In [18]:
# Show one tweet's full text from the first five rows; the table display
# above truncates the 'text' column.
# NOTE(review): `[4]` on a Series with an integer index is presumably
# resolved by index label rather than position — confirm this is the
# intended row.
df['text'].head()[4]

"haha silly goose of course i know what the electoral college is but that's not what I'm talking about. I'm done haha"

In [19]:
# Number of rows in the merged frame.
df.shape[0]

1848