# Tweet # Counts

Time series, counts by week.


#### Libraries + DB

In [1]:
%run utilities.py
%run ../../src/secret_key.py

# for hashtag count summation
import numpy as np

import psycopg2 as pg
import pandas.io.sql as psql

from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
## get connected to the database
conn_str = "dbname='immigration' user='rlrson' password={}".format(pw)
conn = pg.connect(conn_str)

#### Time Periods

Caglar has broken them down into before/after the 'epoch' date. Lets pull them in.

In [3]:
time_periods = pd.read_sql("SELECT * FROM timeinterval;", con=conn)
time_periods

Unnamed: 0,id,startinterval,endinterval
0,1,2016-12-27 10:00:00+00:00,2017-01-27 10:00:00+00:00
1,2,2017-01-28 10:00:00+00:00,2017-02-27 10:00:00+00:00
2,3,2017-01-21 10:00:00+00:00,2017-01-27 10:00:00+00:00
3,4,2017-01-28 10:00:00+00:00,2017-02-03 10:00:00+00:00
4,5,2017-01-26 10:00:00+00:00,2017-01-27 10:00:00+00:00
5,6,2017-01-28 10:00:00+00:00,2017-01-29 10:00:00+00:00


In [4]:
time_periods.dtypes

id                             int64
startinterval    datetime64[ns, UTC]
endinterval      datetime64[ns, UTC]
dtype: object

In [5]:
# set variables so we can specify a beginning/end for our SQL queries
start = time_periods.iloc[0]['startinterval'].tz_convert(None)
end = time_periods.iloc[1]['endinterval'].tz_convert(None)

#### Week Categorization

We need to create a mapping that will return the week number for us. We'll start at the epoch and work our way backwards week by week. After a month we'll do 'after' the epoch.

In [6]:
# set the time variables
# epoch_start - (delta_week * 4) >> Timestamp('2016-12-30 10:00:00')
# start >> Timestamp('2016-12-27 10:00:00+0000', tz='UTC')

# Epoch
epoch_start = pd.to_datetime('2017-01-27 10:00:00+00:00', utc=True).tz_convert(None)
epoch_end = pd.to_datetime('2017-01-28 10:00:00+00:00', utc=True).tz_convert(None)

# delta
delta_week = pd.Timedelta('7 days')

# before
four_before = epoch_start - (delta_week * 4)
three_before = epoch_start - (delta_week * 3)
two_before = epoch_start - (delta_week * 2)
one_before = epoch_start - (delta_week * 1)

# after
four_after = epoch_end + (delta_week * 4)
three_after = epoch_end + (delta_week * 3)
two_after = epoch_end + (delta_week * 2)
one_after = epoch_end + (delta_week * 1)


def get_week_number(week):
    try:
        week = pd.to_datetime(week, infer_datetime_format=True)
    
        # catch, before/after?
        if (week < start) or (week > end):
            return None

        if (week < four_before):
            return -5
        elif (week < three_before):
            return -4
        elif (week < two_before):
            return -3
        elif (week < one_before):
            return -2
        elif (week < epoch_start):
            return -1
        elif (week < epoch_end):
            return 0
        elif (week < one_after):
            return 1
        elif (week < two_after):
            return 2
        elif (week < three_after):
            return 3
        elif (week < four_after):
            return 4
        else:
            return 5
    except Exception as e:
        print (e, week)
        return None

#### Tweets

We need a count of tweets from every county, for every week (8: 4-before/after). The `tweet` table has tweetid, tweet, date, userid, id, lon, lat, cntyid.

In [7]:
query_test = "SELECT tweetID, date::date, tweet  FROM tweet LIMIT 10;"
test_df = pd.read_sql(query_test, con=conn)

test_df.head(10)

Unnamed: 0,tweetid,date,tweet
0,812090242366701568,2016-12-22,Obama threw a wrench in Trump's plans for a Mu...
1,812090289800232961,2016-12-22,RT : Ministro alemán:”debemos deportar a cient...
2,812090333085466624,2016-12-22,Obama just made it harder for Trump to create ...
3,812090374722355200,2016-12-22,RT : Legal Group: Obama Dismantling US Regist...
4,812090414928953348,2016-12-22,Ormas Islam Berencana Bentuk Front Anti Narkoba
5,812090456569954304,2016-12-22,ربِّ أسألك خير ما في هذا اليوم وخير ما بعده ♻️
6,812090495547670530,2016-12-22,اللهم لا سهل إلا ما جعلته سهلاً وأنت تجعل الحز...
7,812090532755345409,2016-12-22,اللهم لا سهل إلا ما جعلته سهلاً وأنت تجعل الحز...
8,812090571888193536,2016-12-22,أذكار الأذان:يدعو لنفسه بين الأذان والإقامة فإ...
9,812090611914440705,2016-12-22,Bayu Hermawan: Ormas Islam Berencana Bentuk Fr...


In [22]:
tweet_query = "SELECT date::date, tweet FROM tweet LIMIT 75000;"
tweet_df = pd.read_sql(tweet_query, con=conn)

tweet_df.head(2)

Unnamed: 0,date,tweet
0,2016-12-24,يا حي يا قيوم برحمتك أستغيث أصلح لي شأني كُله ♻️
1,2016-12-24,اللهم قني عذابك يوم تبعث عبادك ♻️


### Keywords

In [10]:
# https://stackoverflow.com/questions/6331497/an-elegant-way-to-get-hashtags-out-of-a-string-in-python
def get_keyword(text, keyword):
    try:
        return 1 if keyword in text else 0
    except Exception as e:
        print (e, text)
        return None
    
def get_adj(text):
    return get_keyword(text, 'immigrant')
def get_verb(text):
    return get_keyword(text, 'immigration')

In [14]:
tweet_df['immigrant'] = parallelize_series(tweet_df.tweet.values, get_adj)
tweet_df['immigration'] = parallelize_series(tweet_df.tweet.values, get_verb)
tweet_df['cnt'] = 1

tweet_df.head()

Unnamed: 0,date,tweet,adj,verb,cnt
0,2016-12-22,Obama threw a wrench in Trump's plans for a Mu...,0,0,1
1,2016-12-22,RT : Ministro alemán:”debemos deportar a cient...,0,0,1
2,2016-12-22,Obama just made it harder for Trump to create ...,0,0,1
3,2016-12-22,RT : Legal Group: Obama Dismantling US Regist...,0,0,1
4,2016-12-22,Ormas Islam Berencana Bentuk Front Anti Narkoba,0,0,1


In [16]:
test_day = tweet_df[['date', 'adj', 'verb', 'cnt']].groupby('date').agg(sum)
test_day.head()

Unnamed: 0_level_0,adj,verb,cnt
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-12-22,159,173,5106
2016-12-23,733,917,50434
2016-12-24,170,286,19349
2016-12-25,0,0,19
2016-12-26,0,0,32


#### Parallelize

This is going to take forever unless we do.

In [19]:
from multiprocessing import Pool  # faster
def parallelize_series(series, func):
    pool = Pool(6)

    df = pool.map(func, series)

    pool.close()
    pool.join()
    return df

#### DataFrame Construction

Putting together everything we've done so far:
1. Read in the tweets
2. Add three variables, contains(immigrant), contains(immigration), 1 (for total #)
3. groupby date, sum. 
4. reset index to make it a normal DF
5. return DF

In [29]:
def count_tweets(tweets):
    tweets['immigrant'] = parallelize_series(tweets.tweet.values, get_adj)
    tweets['immigration'] = parallelize_series(tweets.tweet.values, get_verb)
    tweets['total'] = 1

    count_df = tweets[['date', 'immigrant', 'immigration', 'total']].groupby('date').agg(sum)
    count_df.reset_index(inplace=True)
    
    return count_df

count_tweets(tweet_df)

Unnamed: 0,date,immigrant,immigration,total
0,2016-12-24,152,144,20106
1,2016-12-25,235,235,30896
2,2016-12-26,20,46,3450
3,2016-12-27,176,515,20540
4,2016-12-28,0,0,8


#### Actual Counts!!!

In [30]:
# for real this time
full_query = "SELECT date::date, tweet FROM tweet;"

day_counts = pd.DataFrame()
for chunk in pd.read_sql(full_query, con=conn, chunksize=50000):
    formatted = count_tweets(chunk)
    day_counts = day_counts.append(formatted)

In [31]:
day_counts.head()

Unnamed: 0,date,immigrant,immigration,total
0,2016-12-30,122,255,13871
1,2016-12-31,198,335,22794
2,2017-01-01,61,186,13311
3,2017-01-02,0,1,24
0,2017-01-01,127,418,19571


In [33]:
final = day_counts.groupby(['date']).agg(sum)
final = final.reset_index()

final.head()

Unnamed: 0,date,immigrant,immigration,total
0,2016-12-22,159,173,5106
1,2016-12-23,733,917,50434
2,2016-12-24,318,426,39156
3,2016-12-25,235,235,30915
4,2016-12-26,20,46,3482


In [40]:
final['week'] = final.date.map(get_week_number)
final.head()

Unnamed: 0,date,immigrant,immigration,total,week
0,2016-12-22,159,173,5106,
1,2016-12-23,733,917,50434,
2,2016-12-24,318,426,39156,
3,2016-12-25,235,235,30915,
4,2016-12-26,20,46,3482,


In [41]:
# drop rows without weeks 
final = final[final['week'].isnull() == False]

In [42]:
# finally write these out
f = name_file_path('term-date-counts.csv', processed_web_dir)
final.to_csv(f, index=False)

---

In [43]:
conn.close()

In [44]:
# Wide cells
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

# number of text characters per column
pd.set_option('max_colwidth', 140)