In [1]:
from twitterscraper import query_tweets
from datetime import timedelta, date
import pandas as pd
import csv

In [2]:
def scrape(query, limit_per_day, start_date, end_date):
    """Scrape tweets matching ``query`` and return them as a cleaned DataFrame.

    Parameters
    ----------
    query : str
        Search string forwarded to ``query_tweets``.
    limit_per_day : int
        Forwarded unchanged to ``query_tweets`` as its ``limit`` argument.
        NOTE(review): despite the name, nothing here makes the cap per-day;
        confirm the meaning of ``limit`` against the twitterscraper docs.
    start_date, end_date : datetime.date
        Forwarded as ``begindate`` / ``enddate`` and also used to filter the
        scraped rows afterwards (bounds are inclusive, taken at midnight).

    Returns
    -------
    pandas.DataFrame
        One row per tweet with the columns ``date``, ``text``, ``fullname``,
        ``id``, ``likes``, ``replies``, ``retweets``, ``url``, ``user`` and
        ``month`` (first day of the tweet's month). Rows containing missing
        values and exact duplicate rows are removed. If the scraper returns
        nothing, an empty frame with those columns is returned.
    """
    columns = ['date', 'text', 'fullname', 'id', 'likes', 'replies',
               'retweets', 'url', 'user']

    scraped = query_tweets(query=query,
                           limit=limit_per_day,
                           begindate=start_date,
                           enddate=end_date,
                           poolsize=20, lang='en')
    tweets = [{'date': tweet.timestamp, 'text': tweet.text,
               'fullname': tweet.fullname, 'id': tweet.id,
               'likes': tweet.likes, 'replies': tweet.replies,
               'retweets': tweet.retweets, 'url': tweet.url,
               'user': tweet.user}
              for tweet in scraped]

    # An empty result would give a frame without a 'date' column and make the
    # lookups below raise KeyError, so hand back an empty, correctly-shaped
    # frame instead.
    if not tweets:
        return pd.DataFrame(columns=columns + ['month'])

    df = pd.DataFrame(tweets)
    df['month'] = df['date'].apply(lambda x: date(x.year, x.month, 1))
    df = df.dropna()
    df = df.drop_duplicates()

    # Compare against Timestamps rather than datetime.date objects: ordering
    # comparisons between a datetime column and a plain date are rejected by
    # recent pandas releases. pd.Timestamp(date) is midnight of that day.
    lower = pd.Timestamp(start_date)
    upper = pd.Timestamp(end_date)
    df = df[df['date'] >= lower]
    df = df[df['date'] <= upper]
    return df

In [3]:
def merge_and_clean(df_list):
    """Stack the frames in ``df_list`` and tidy the combined result.

    The frames are concatenated in order (original index labels are kept),
    then every row containing a missing value is dropped and exact duplicate
    rows are collapsed to their first occurrence.
    """
    return pd.concat(df_list).dropna().drop_duplicates()

In [4]:
def _padded_month_windows(year, months):
    """Build one (start, end) pair per month number in ``months``.

    ``start`` is the last day of the preceding month and ``end`` is the first
    day of the following month (rolling over into ``year + 1`` for December).
    """
    one_day = timedelta(days=1)
    return [(date(year, m, 1) - one_day, date(year + m // 12, m % 12 + 1, 1))
            for m in months]


# Monthly scrape windows that begin one day before the month starts.
odd_2016 = _padded_month_windows(2016, range(1, 13))

odd_2017 = _padded_month_windows(2017, range(1, 13))

# Only January and February are covered for 2018.
odd_2018 = _padded_month_windows(2018, range(1, 3))

In [5]:
def _calendar_month_windows(year, months):
    """Build one (start, end) pair per month number in ``months``.

    ``start`` is the first day of the month and ``end`` is the first day of
    the following month (rolling over into ``year + 1`` for December).
    """
    return [(date(year, m, 1), date(year + m // 12, m % 12 + 1, 1))
            for m in months]


# Monthly scrape windows aligned to calendar-month boundaries.
even_2016 = _calendar_month_windows(2016, range(1, 13))

even_2017 = _calendar_month_windows(2017, range(1, 13))

# Only January and February are covered for 2018.
even_2018 = _calendar_month_windows(2018, range(1, 3))

In [6]:
# Split each yearly window list into four consecutive chunks: the first three
# hold three windows each, the last holds whatever remains from position 9 on.
_QUARTER_SLICES = (slice(None, 3), slice(3, 6), slice(6, 9), slice(9, None))

odd1, odd2, odd3, odd4 = (odd_2016[part] for part in _QUARTER_SLICES)
even1, even2, even3, even4 = (even_2016[part] for part in _QUARTER_SLICES)

odd1_2017, odd2_2017, odd3_2017, odd4_2017 = (
    odd_2017[part] for part in _QUARTER_SLICES)
even1_2017, even2_2017, even3_2017, even4_2017 = (
    even_2017[part] for part in _QUARTER_SLICES)

In [7]:
def go(date_range, query_term, limit_per_day=50):
    """Scrape ``query_term`` over every window in ``date_range`` and merge the results.

    Parameters
    ----------
    date_range : iterable of (start_date, end_date)
        Each item is indexed as ``item[0]`` (start) and ``item[1]`` (end) and
        passed to ``scrape``.
    query_term : str
        Search string passed to ``scrape`` for every window.
    limit_per_day : int, optional
        Limit passed to ``scrape`` for every window. Defaults to 50, the
        value that used to be hard-coded inside this function.

    Returns
    -------
    pandas.DataFrame
        The per-window frames combined by ``merge_and_clean``.
    """
    frames = [scrape(query_term, limit_per_day, window[0], window[1])
              for window in date_range]
    return merge_and_clean(frames)

In [8]:
# Tweet limit setting. Note that go() does not read this module-level name.
limit_per_day = 50
# Search term handed to go() in the scraping cells below.
query_term = 'prochoice'

In [9]:
even_2018 = go(even_2018, query_term)

In [10]:
odd_2018 = go(odd_2018, query_term)

In [11]:
df = merge_and_clean([even_2018, odd_2018])

In [12]:
#for date in df.sort_values('date').date.unique():
   #print(date)

In [13]:
# Earliest tweet timestamp. Uses the pandas reduction instead of iterating the
# Series with the builtin min(); it skips missing values and yields NaT for an
# empty frame rather than raising.
df['date'].min()

Timestamp('2018-01-01 18:28:22')

In [14]:
# Latest tweet timestamp. Uses the pandas reduction instead of iterating the
# Series with the builtin max(); it skips missing values and yields NaT for an
# empty frame rather than raising.
df['date'].max()

Timestamp('2018-02-28 23:53:59')

In [15]:
# Append the scraped rows to the shared output file. No header row is
# written, and because mode='a' appends, re-running this cell adds the same
# rows to the file again.
df.to_csv(
    path_or_buf='twitter_data/andrea_scraped/abortion2018',
    header=False,
    mode='a',
)

In [16]:
#len(df)

In [17]:
# Show the tweet timestamps in chronological order (original row labels kept).
df.sort_values(by='date')['date']

117   2018-01-01 18:28:22
116   2018-01-01 18:58:17
115   2018-01-01 19:04:23
114   2018-01-01 19:08:38
113   2018-01-01 19:48:19
112   2018-01-01 20:27:52
111   2018-01-01 20:30:19
110   2018-01-01 20:32:26
109   2018-01-01 21:38:06
108   2018-01-01 21:41:23
107   2018-01-01 22:02:02
106   2018-01-01 22:08:19
105   2018-01-01 22:19:11
104   2018-01-01 23:16:20
103   2018-01-01 23:35:18
102   2018-01-01 23:36:45
101   2018-01-01 23:49:18
100   2018-01-01 23:55:00
99    2018-01-01 23:55:52
168   2018-01-02 21:12:39
167   2018-01-02 21:15:55
166   2018-01-02 21:23:09
165   2018-01-02 21:53:59
164   2018-01-02 21:55:04
163   2018-01-02 21:56:58
162   2018-01-02 21:58:30
161   2018-01-02 22:00:44
160   2018-01-02 22:06:52
159   2018-01-02 22:09:31
158   2018-01-02 22:09:58
              ...        
215   2018-02-27 22:46:01
214   2018-02-27 22:46:55
213   2018-02-27 22:53:02
212   2018-02-27 23:09:14
211   2018-02-27 23:37:35
210   2018-02-27 23:38:25
209   2018-02-27 23:49:58
208   2018-0

In [18]:
# Text of the row labelled 4 among the first five rows. This is a label
# lookup on the integer index, not a positional one.
df['text'].head().loc[4]

'Yeah well I can’t think of any cases where a “person” is both putting your health in jeopardy—against your will—and also you have an obligation to preserve their life—also against your will.\nDare I ask… Can you?'

In [19]:
# Number of rows left after merging and de-duplication.
df.shape[0]

1086