## Get Tweets

In [1]:
from twitterscraper import query_tweets
from datetime import timedelta, date
import pandas as pd
import csv
%matplotlib inline

In [2]:
def twitter_query_over_time(query, limit_per_day, start_date, end_date):
    """Run one `query_tweets` search per day between two dates.

    The span from `start_date` to `end_date` is cut into consecutive one-day
    windows (day i -> day i+1) and `query_tweets` is called once per window,
    so `limit_per_day` is passed as the `limit` of every individual call.

    Args:
        query: search string forwarded unchanged to every `query_tweets` call.
        limit_per_day: value forwarded as `limit` for each one-day window.
        start_date: beginning of the first window (date-like; must support
            subtraction and adding a `timedelta`).
        end_date: end of the last window.

    Returns:
        A flat list holding the results of all per-day calls, in date order.
        The list is empty when `end_date` is not after `start_date`.
    """
    n_days = (end_date - start_date).days
    # every day boundary from start_date to end_date, inclusive
    dates = [start_date + timedelta(days=offset) for offset in range(n_days + 1)]

    results = []
    # pair each boundary with the following one: (d0, d1), (d1, d2), ...
    for begin, end in zip(dates, dates[1:]):
        # Keep the per-day result under its own name: rebinding `query` here
        # would send a list of tweets as the search string on the next day.
        daily_tweets = query_tweets(query=query, limit=limit_per_day,
                                    begindate=begin, enddate=end,
                                    poolsize=20, lang='en')
        results.extend(daily_tweets)

    return results

### Query

In [None]:
# Search parameters for the scrape.
query = 'releasethememo'
limit_per_day = 100
start_date = date(2018, 1, 1)
# end_date has to fall after start_date: with the previous 2017 value the
# date range was empty, so no day was ever queried.
end_date = date(2018, 3, 1)

# one query_tweets call per day in [start_date, end_date]
queries = twitter_query_over_time(query, limit_per_day, start_date, end_date)

In [None]:
# Flatten every scraped tweet object into a plain dict (one record per tweet);
# the tweet's `timestamp` attribute is stored under the 'date' key.
tweets = [
    {
        'date': scraped.timestamp,
        'text': scraped.text,
        'fullname': scraped.fullname,
        'id': scraped.id,
        'likes': scraped.likes,
        'replies': scraped.replies,
        'retweets': scraped.retweets,
        'url': scraped.url,
        'user': scraped.user,
    }
    for scraped in queries
]

## Pre-process Tweets

In [None]:
# one row per tweet record, one column per record key
df = pd.DataFrame(data=tweets)

In [None]:
# number of rows before cleaning
df.shape[0]

In [None]:
# add a 'month' column: the first calendar day of the month of each tweet's date
df['month'] = df['date'].map(lambda stamp: date(stamp.year, stamp.month, 1))

In [None]:
# remove every row that has a missing value in any column
df = df.dropna(axis='index', how='any')

In [None]:
# remove fully identical rows, keeping the first occurrence of each
df = df.drop_duplicates(keep='first')

In [None]:
# drop values outside of queried range
# start_date / end_date are datetime.date objects; convert them to pandas
# Timestamps (midnight of that day) so the comparison against the 'date'
# column does not depend on date-vs-datetime coercion, which recent pandas
# versions reject.
range_start = pd.Timestamp(start_date)
range_end = pd.Timestamp(end_date)
# between() is inclusive on both ends, matching `>= start` and `<= end`
df = df[df['date'].between(range_start, range_end)]

In [None]:
# number of rows left after cleaning
df.shape[0]

### Check that tweets look normal, ready for export

In [None]:
# earliest five tweets by date
df.sort_values(by='date', ascending=True).head(n=5)

In [None]:
# running total of tweets over time: count tweets per distinct 'date' value,
# then accumulate and plot
tweets_per_date = df.groupby('date')['text'].count()
tweets_per_date.cumsum().plot()

In [None]:
# write the cleaned tweets (index included) to disk as CSV
output_path = 'twitter_data/scraped_tweets'
df.to_csv(output_path)