In [1]:
from twitterscraper import query_tweets
from datetime import timedelta, date
import pandas as pd
import csv

In [2]:
def scrape(query, limit_per_day, start_date, end_date):
    """Scrape tweets matching ``query`` and return them as a cleaned DataFrame.

    Parameters
    ----------
    query : str
        Search string handed to ``query_tweets``.
    limit_per_day : int
        Forwarded to ``query_tweets`` as ``limit``.
        NOTE(review): whether the scraper applies this per day or to the
        whole request is not visible here -- confirm against twitterscraper.
    start_date, end_date : datetime.date
        Forwarded as ``begindate`` / ``enddate`` and also used to filter the
        result: only rows whose ``date`` lies between midnight of
        ``start_date`` and midnight of ``end_date`` (both inclusive) are kept.

    Returns
    -------
    pandas.DataFrame
        One row per distinct tweet with the columns ``date``, ``text``,
        ``fullname``, ``id``, ``likes``, ``replies``, ``retweets``, ``url``,
        ``user`` and ``month`` (first day of the tweet's month as a
        ``datetime.date``). An empty frame with those columns is returned
        when the scraper yields nothing.
    """
    columns = ['date', 'text', 'fullname', 'id', 'likes', 'replies',
               'retweets', 'url', 'user']
    results = query_tweets(query=query,
                           limit=limit_per_day,
                           begindate=start_date,
                           enddate=end_date,
                           poolsize=20, lang='en')
    records = [{'date': tweet.timestamp, 'text': tweet.text,
                'fullname': tweet.fullname, 'id': tweet.id,
                'likes': tweet.likes, 'replies': tweet.replies,
                'retweets': tweet.retweets, 'url': tweet.url,
                'user': tweet.user}
               for tweet in results]
    if not records:
        # A frame built from no records has no 'date' column, so the steps
        # below would raise KeyError; hand back an empty, well-formed frame.
        return pd.DataFrame(columns=columns + ['month'])

    df = pd.DataFrame(records)
    df['month'] = df['date'].apply(lambda x: date(x.year, x.month, 1))
    df = df.dropna().drop_duplicates()

    # Convert the bounds to Timestamps: ordering comparisons between a
    # datetime64 column and plain datetime.date objects are rejected by
    # current pandas releases.
    lower = pd.Timestamp(start_date)
    upper = pd.Timestamp(end_date)
    df = df[(df['date'] >= lower) & (df['date'] <= upper)]
    return df

In [3]:
def merge_and_clean(df_list):
    """Concatenate DataFrames, then drop rows with missing values and duplicates.

    Parameters
    ----------
    df_list : iterable of pandas.DataFrame
        Frames to stack on top of each other. The original row labels are
        kept (the index is not reset).

    Returns
    -------
    pandas.DataFrame
        The stacked frame without NaN-containing rows and without duplicate
        rows. An empty DataFrame is returned when ``df_list`` is empty.
    """
    frames = list(df_list)
    if not frames:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame()
    return pd.concat(frames).dropna().drop_duplicates()

In [4]:
# One (start, end) window per month of 2016. Each window starts on the last
# day of the preceding month (one day before the month's first day) and ends
# on the first day of the following month.
odd_2016 = [
    (date(2016, month, 1) - timedelta(days=1),
     date(2016 + month // 12, month % 12 + 1, 1))
    for month in range(1, 13)
]

In [5]:
# One (start, end) window per month of 2016: from the first day of the month
# to the first day of the following month.
even_2016 = [
    (date(2016, month, 1),
     date(2016 + month // 12, month % 12 + 1, 1))
    for month in range(1, 13)
]

In [6]:
# Split each list of windows into four consecutive batches
# (items 0-2, 3-5, 6-8 and everything from item 9 on).
odd1, odd2, odd3, odd4 = (
    odd_2016[:3], odd_2016[3:6], odd_2016[6:9], odd_2016[9:]
)
even1, even2, even3, even4 = (
    even_2016[:3], even_2016[3:6], even_2016[6:9], even_2016[9:]
)

In [7]:
def go(date_range, query_term, limit_per_day=50):
    """Scrape ``query_term`` for every window in ``date_range`` and merge the results.

    Parameters
    ----------
    date_range : iterable of (start_date, end_date)
        Each item is read as ``item[0]`` (start) and ``item[1]`` (end) and
        handed to ``scrape``.
    query_term : str
        Passed to ``scrape`` as the query.
    limit_per_day : int, optional
        Passed to ``scrape`` for every window. Defaults to 50, the value that
        used to be hard-coded inside this function.

    Returns
    -------
    The result of ``merge_and_clean`` applied to the list of per-window
    frames, in the order the windows were given.
    """
    frames = [
        scrape(query_term, limit_per_day, window[0], window[1])
        for window in date_range
    ]
    return merge_and_clean(frames)

In [8]:
# Search settings for this run: the term to look up and the tweet limit.
query_term, limit_per_day = 'benghazi', 50

In [9]:
# Scrape the windows in the third `even` batch for the configured term.
even3_benghazi = go(date_range=even3, query_term=query_term)

In [10]:
# Scrape the windows in the fourth `even` batch for the configured term.
even4_benghazi = go(date_range=even4, query_term=query_term)

In [11]:
#min(even1_benghazi.date)

In [12]:
#max(even1_benghazi.date)

In [13]:
#min(odd4_benghazi.date)

In [14]:
#max(odd4_benghazi.date)

In [15]:
# Stack the two scraped batches into one frame, dropping NaN rows and duplicates.
df = merge_and_clean(df_list=[even3_benghazi, even4_benghazi])

In [16]:
#for date in df.sort_values('date').date.unique():
   #print(date)

In [17]:
# Earliest tweet timestamp. Series.min() is a vectorised reduction that skips
# missing values, unlike the builtin min(), which walks the column element by
# element and raises on an empty frame.
df['date'].min()

Timestamp('2016-07-01 19:32:39')

In [18]:
# Latest tweet timestamp. Series.max() is a vectorised reduction that skips
# missing values, unlike the builtin max(), which walks the column element by
# element and raises on an empty frame.
df['date'].max()

Timestamp('2016-12-31 23:59:55')

In [19]:
# Append the merged rows to the output file. mode='a' adds to whatever the file
# already contains and header=False writes no column-name row, so re-running
# this cell appends the same rows a second time.
df.to_csv('twitter_data/benghazi2016', mode='a', header=False)

In [20]:
#len(df)

In [21]:
# Show the merged frame as this cell's output.
df

Unnamed: 0,date,fullname,id,likes,replies,retweets,text,url,user,month
1,2016-07-02 23:58:58,John Orta,749391950575312896,0,0,0,Benghazi chair Gowdy in GOP address: “We now k...,/JohnOrta4/status/749391950575312896,JohnOrta4,2016-07-01
2,2016-07-02 23:58:55,Dr Stuart Bramhall,749391938957111297,0,0,0,Benghazi Scandal Report--Hillary Was Part of M...,/stuartbramhall/status/749391938957111297,stuartbramhall,2016-07-01
3,2016-07-02 23:58:38,BP,749391865737347072,0,0,0,Here's the KEY MOMENT in Benghazi hearings - R...,/b2646p/status/749391865737347072,b2646p,2016-07-01
4,2016-07-02 23:58:36,🇺🇸BR-549 🇺🇸,749391857805717504,0,0,0,Do you have your #BenghaziReport yet?\nYou can...,/USA___Forever/status/749391857805717504,USA___Forever,2016-07-01
5,2016-07-02 23:58:32,#impeachT***P,749391839124332544,1,0,0,"So clear of Benghazi, FBI finds no wrong doing...",/patysq73/status/749391839124332544,patysq73,2016-07-01
6,2016-07-02 23:58:20,PapaESoCo,749391790977921025,0,1,0,Michael Savage: 'Trey Gowdy Should Be Impeache...,/PapaESoCo/status/749391790977921025,PapaESoCo,2016-07-01
7,2016-07-02 23:57:56,sean hannity buzz,749391691279265792,0,0,1,Sean Hannity Gets Humiliated After Pushing Abs...,/cinnews365/status/749391691279265792,cinnews365,2016-07-01
8,2016-07-02 23:57:52,Liberty & Freedom,749391674955161600,0,0,0,"Bloomberg Radio calls Benghazi a ""Dead Issue"",...",/Oahts/status/749391674955161600,Oahts,2016-07-01
9,2016-07-02 23:57:52,Covfefe007,749391670907539456,0,0,0,TRUMP CAN NEVER WIN ITS ALREADY RIGGED KILLARY...,/Nancy007f/status/749391670907539456,Nancy007f,2016-07-01
10,2016-07-02 23:57:45,Sammy4Trump,749391644810555392,1,0,1,I don't know how Obama finds these creeps.Susa...,/tomsam974/status/749391644810555392,tomsam974,2016-07-01


In [22]:
# Text of the row labelled 1 among the first five rows (label lookup, not position).
df.head().loc[1, 'text']

'Benghazi chair Gowdy in GOP address: “We now know the full story” http://www.mcclatchydc.com/news/politics-government/congress/article87352957.html\xa0…'

In [23]:
# Number of rows in the merged frame.
df.shape[0]

1817