In [2]:
import csv
import os
from datetime import timedelta, date

import pandas as pd
from twitterscraper import query_tweets
%matplotlib inline

In [None]:
# Scrape settings: the search term and the per-day tweet limit ...
query, limit_per_day = 'Hillary', 10
# ... and the first and last calendar dates of the window to scrape
start_date, end_date = date(2016, 6, 1), date(2018, 1, 1)

In [None]:
# pipeline: scrape day by day -> flatten into dicts -> build the DataFrame
queries = twitter_query_over_time(query=query, limit_per_day=limit_per_day,
                                  start_date=start_date, end_date=end_date)
tweets = extract_tweets(queries=queries)
df = format_tweets_as_df(tweets=tweets, start_date=start_date, end_date=end_date)

# sanity check before export: running total of tweets, grouped by the 'date' column
tweet_counts = df.groupby('date')['text'].count()
tweet_counts.cumsum().plot()

In [None]:
# export
output_path = 'twitter_data/scraped_tweets'
# to_csv does not create missing parent folders, so make sure the
# target directory exists before writing
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path)

In [3]:
def twitter_query_over_time(query, limit_per_day, start_date, end_date):
    """Run one `query_tweets` search per day between two dates.

    Parameters
    ----------
    query : str
        Search term, passed unchanged to every `query_tweets` call.
    limit_per_day : int
        Passed as `limit` to each per-day `query_tweets` call.
    start_date, end_date : datetime.date
        The window is split into consecutive (day, day + 1) pairs, starting
        at `start_date`; the last pair ends at `end_date`. If `end_date` is
        not after `start_date`, no search is made.

    Returns
    -------
    list
        All items returned by the per-day searches, concatenated in
        chronological query order.
    """
    n_days = (end_date - start_date).days

    tweets = []
    for offset in range(n_days):
        day_start = start_date + timedelta(days=offset)
        day_end = day_start + timedelta(days=1)
        # Store the results under their own name: rebinding `query` here would
        # replace the search term with a list of tweets for every later day.
        day_tweets = query_tweets(query=query, limit=limit_per_day,
                                  begindate=day_start, enddate=day_end,
                                  poolsize=20, lang='en')
        tweets.extend(day_tweets)
    return tweets

In [None]:
def extract_tweets(queries):
    """Convert scraped tweet objects into plain dictionaries.

    Parameters
    ----------
    queries : iterable
        Tweet objects exposing the attributes `timestamp`, `text`,
        `fullname`, `id`, `likes`, `replies`, `retweets`, `url` and `user`.

    Returns
    -------
    list of dict
        One dict per tweet, in input order. The tweet's `timestamp` is
        stored under the key 'date'; every other attribute keeps its name.
    """
    # Work only on the tweets that were passed in. Re-running the scrape here
    # would repeat all the network calls and tie the function to notebook globals.
    return [
        {'date': tweet.timestamp, 'text': tweet.text,
         'fullname': tweet.fullname, 'id': tweet.id,
         'likes': tweet.likes, 'replies': tweet.replies,
         'retweets': tweet.retweets, 'url': tweet.url,
         'user': tweet.user}
        for tweet in queries
    ]

In [None]:
def format_tweets_as_df(tweets, start_date, end_date):
    """Build a cleaned DataFrame from a list of tweet dictionaries.

    Parameters
    ----------
    tweets : list of dict
        Records with at least a 'date' key holding a datetime-like value.
    start_date, end_date : datetime.date
        Bounds of the range to keep. Both are converted to midnight
        timestamps and compared inclusively against the 'date' column.

    Returns
    -------
    pandas.DataFrame
        The records with rows containing missing values removed, exact
        duplicates removed, a 'month' column added (a `datetime.date` for
        the first day of each tweet's month), and rows outside the requested
        range dropped. An empty input gives an empty frame that still has
        'date' and 'month' columns.
    """
    df = pd.DataFrame(tweets)

    if df.empty:
        # nothing to clean; still expose the columns that are normally present
        columns = list(df.columns) + [c for c in ('date', 'month') if c not in df.columns]
        return df.reindex(columns=columns)

    # drop NAs first, so the month computation never sees a missing date
    df = df.dropna()

    # make sure 'date' is a real datetime column, then add the month bucket
    df = df.assign(date=pd.to_datetime(df['date']))
    df = df.assign(month=df['date'].apply(lambda ts: date(ts.year, ts.month, 1)))

    # drop duplicates
    df = df.drop_duplicates()

    # drop values outside of queried range; the bounds are converted to
    # Timestamps because pandas will not order-compare a datetime column
    # against plain datetime.date objects
    range_start = pd.Timestamp(start_date)
    range_end = pd.Timestamp(end_date)
    in_range = (df['date'] >= range_start) & (df['date'] <= range_end)
    return df[in_range]