# Tweet # Counts

Time series, counts by week.


#### Libraries + DB

In [1]:
%run utilities.py
%run ../../src/secret_key.py

# for hashtag count summation
import numpy as np

import psycopg2 as pg
import pandas.io.sql as psql

from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
## get connected to the database
conn_str = "dbname='immigration' user='rlrson' password={}".format(pw)
conn = pg.connect(conn_str)

#### Time Periods

Caglar has broken them down into before/after the 'epoch' date. Lets pull them in.

In [3]:
time_periods = pd.read_sql("SELECT * FROM timeinterval;", con=conn)
time_periods

Unnamed: 0,id,startinterval,endinterval
0,1,2016-12-27 10:00:00+00:00,2017-01-27 10:00:00+00:00
1,2,2017-01-28 10:00:00+00:00,2017-02-27 10:00:00+00:00
2,3,2017-01-21 10:00:00+00:00,2017-01-27 10:00:00+00:00
3,4,2017-01-28 10:00:00+00:00,2017-02-03 10:00:00+00:00
4,5,2017-01-26 10:00:00+00:00,2017-01-27 10:00:00+00:00
5,6,2017-01-28 10:00:00+00:00,2017-01-29 10:00:00+00:00


In [4]:
time_periods.dtypes

id                             int64
startinterval    datetime64[ns, UTC]
endinterval      datetime64[ns, UTC]
dtype: object

In [5]:
# set variables so we can specify a beginning/end for our SQL queries
start = time_periods.iloc[0]['startinterval'].tz_convert(None)
end = time_periods.iloc[1]['endinterval'].tz_convert(None)

#### Week Categorization

We need to create a mapping that will return the week number for us. We'll start at the epoch and work our way backwards week by week. After a month we'll do 'after' the epoch.

In [6]:
# set the time variables
# epoch_start - (delta_week * 4) >> Timestamp('2016-12-30 10:00:00')
# start >> Timestamp('2016-12-27 10:00:00+0000', tz='UTC')

# Epoch
epoch_start = pd.to_datetime('2017-01-27 10:00:00+00:00', utc=True).tz_convert(None)
epoch_end = pd.to_datetime('2017-01-28 10:00:00+00:00', utc=True).tz_convert(None)

# delta
delta_week = pd.Timedelta('7 days')

# before
four_before = epoch_start - (delta_week * 4)
three_before = epoch_start - (delta_week * 3)
two_before = epoch_start - (delta_week * 2)
one_before = epoch_start - (delta_week * 1)

# after
four_after = epoch_end + (delta_week * 4)
three_after = epoch_end + (delta_week * 3)
two_after = epoch_end + (delta_week * 2)
one_after = epoch_end + (delta_week * 1)


def get_week_number(week):
    try:
        week = pd.to_datetime(week, infer_datetime_format=True)
    
        # catch, before/after?
        if (week < start) or (week > end):
            return None

        if (week < four_before):
            return -5
        elif (week < three_before):
            return -4
        elif (week < two_before):
            return -3
        elif (week < one_before):
            return -2
        elif (week < epoch_start):
            return -1
        elif (week < epoch_end):
            return 0
        elif (week < one_after):
            return 1
        elif (week < two_after):
            return 2
        elif (week < three_after):
            return 3
        elif (week < four_after):
            return 4
        else:
            return 5
    except Exception as e:
        print (e, week)
        return None

#### Tweets

We need a count of tweets from every county, for every week (8: 4-before/after). The `tweet` table has tweetid, tweet, date, userid, id, lon, lat, cntyid.

In [7]:
query_test = "SELECT tweetID, date::date, tweet  FROM tweet LIMIT 10;"
test_df = pd.read_sql(query_test, con=conn)

test_df.head(10)

Unnamed: 0,tweetid,date,tweet
0,822960153469321217,2017-01-21,اللهم إني أعوذ بك من الهم والحزن والعجز والكسل...
1,822960154190692352,2017-01-21,سبحان الله وبحمده سبحان الله العظيم ♻️
2,822960155629416448,2017-01-21,RT : Massachusetts: Brandeis Univ. Hires Terro...
3,822960156795437057,2017-01-21,أستغفر الله العظيم وأتوب إليه ♻️
4,822960158976393216,2017-01-21,اللهم اهدني فيمن هديت وعافني فيمن عافيت وتولني...
5,822960161677606912,2017-01-21,RT : When you ask a muslim girl for nudes
6,822960162130563072,2017-01-21,أستغفر الله العظيم وأتوب إليه ♻️
7,822960162646454274,2017-01-21,RT : So many beautiful Muslim sisters at the ...
8,822960163095244801,2017-01-21,أذكار الأذان:اللهم رب هذه الدعوة التامة والصلا...
9,822960164345180160,2017-01-21,أذكار الأذان:اللهم رب هذه الدعوة التامة والصلا...


In [8]:
tweet_query = "SELECT date::date, tweet FROM tweet LIMIT 75000;"
tweet_df = pd.read_sql(tweet_query, con=conn)

tweet_df.head(2)

Unnamed: 0,date,tweet
0,2017-01-21,اللهم إني أعوذ بك من الهم والحزن والعجز والكسل...
1,2017-01-21,سبحان الله وبحمده سبحان الله العظيم ♻️


In [9]:
tweet_df.dtypes

date     object
tweet    object
dtype: object

### Hashtags

In [10]:
# https://stackoverflow.com/questions/6331497/an-elegant-way-to-get-hashtags-out-of-a-string-in-python
def get_hashtags(text):
    try:
        tags = set([item.strip("#.,-\"\'&*^!").lower() 
                    for item in text.split() 
                    if ((item.startswith("#") == True) 
                        and (item.startswith("...") == False) 
                        and (item.endswith("...") == False) 
                        and (item.startswith("…") == False) 
                        and (item.endswith("…") == False) 
                        and (item != '') 
                        and len(item) < 256)])

        if bool(tags):
            return list(tags)
        else:
            return None
    except Exception as e:
        print (e, text)
        return None

In [11]:
tweet_df['tags'] = tweet_df.tweet.apply(get_hashtags)

tweet_df.head()

Unnamed: 0,date,tweet,tags
0,2017-01-21,اللهم إني أعوذ بك من الهم والحزن والعجز والكسل...,
1,2017-01-21,سبحان الله وبحمده سبحان الله العظيم ♻️,
2,2017-01-21,RT : Massachusetts: Brandeis Univ. Hires Terro...,
3,2017-01-21,أستغفر الله العظيم وأتوب إليه ♻️,
4,2017-01-21,اللهم اهدني فيمن هديت وعافني فيمن عافيت وتولني...,


In [12]:
# filter the dataframe to only those tweets with hashtags
test = tweet_df[tweet_df['tags'].isnull() == False][['date', 'tags']]

In [13]:
test_day = test.groupby('date')
test_day.head()

Unnamed: 0,date,tags
87,2017-01-21,[212]
544,2017-01-21,"[bigot, lindasarsour, sharia, islam]"
692,2017-01-21,"[new, islam]"
712,2017-01-21,"[new, god, friend, islam, photo, christianity,..."
715,2017-01-21,"[egypt, jordan, bahrain, iraq, lebanon, syria]"
42750,2017-01-22,[1]
42811,2017-01-22,[212]
42813,2017-01-22,[startups]
42868,2017-01-22,[wudu]
42892,2017-01-22,"[bacon, quran, freethinker, desert, god, musli..."


#### Parallelize

This is going to take forever unless we do.

In [49]:
from multiprocessing import Pool  # faster
def parallelize_series(series, func):
    pool = Pool(6)

    df = pool.map(func, series)

    pool.close()
    pool.join()
    return df

def concat_month_tags(dataframe):
    """
    Takes a Pandas Dataframe[date, tags] and returns a concatentation of them all
    """
    return np.concatenate(dataframe.tags.values)

def count_date_tags(date, group):
    #
    concatted = concat_month_tags(group)
    count_concat = pd.value_counts(concatted)
    
    count_df = count_concat.reset_index()
    count_df.columns = ['tag', 'count']
    
    # filter
    #count_df = count_df[count_df['count'] > 5]
    count_df = count_df.sort_values(by=['count'], ascending=False)
    count_df = count_df.iloc[:30]
    
    # add date back in
    count_df['date'] = date
    
    return count_df

In [50]:
maybe = pd.DataFrame()
for name, group in test_day:
    date_counts = count_date_tags(name, group)
    maybe = maybe.append(date_counts)
    
maybe.head()

Unnamed: 0,tag,count,date
0,islam,43,2017-01-21
1,muslim,28,2017-01-21
2,,16,2017-01-21
3,212,16,2017-01-21
4,rt,15,2017-01-21


#### DataFrame Construction

Putting together everything we've done so far:
1. Read in the tweets
2. Calculate the hashtags variable column
3. Filter the dataframe to only those with hashtags
4. GroupBy the `date` column, and concatenate all of the hashtags within that day
5. Get value counts of the hashtags as a dataframe
6. add to the regular.

In [51]:
def count_tweets(tweets):
    # 2
    #tweets['tags'] = tweets['tweet'].apply(get_hashtags)
    tweets['tags'] = parallelize_series(tweets['tweet'].values, get_hashtags)
    
    # 3 
    tweets = tweets[tweets['tags'].isnull() == False]
    
    # 4
    by_day = tweets[['date', 'tags']].groupby('date')
    
    # 5
    counts = pd.DataFrame()
    for date, tags in by_day:
        date_counts = count_date_tags(date, tags)
        
        # 6
        counts = counts.append(date_counts)
    
    return counts

#### Actual Counts!!!

In [52]:
# for real this time
full_query = "SELECT date::date, tweet FROM tweet;"

day_counts = pd.DataFrame()
for chunk in pd.read_sql(full_query, con=conn, chunksize=50000):
    formatted = count_tweets(chunk)
    day_counts = day_counts.append(formatted)

In [53]:
day_counts.head()

Unnamed: 0,tag,count,date
0,islam,43,2017-01-22
1,isis,20,2017-01-22
2,muslim,17,2017-01-22
3,isil,15,2017-01-22
4,1,13,2017-01-22


In [54]:
final = day_counts.groupby(['date', 'tag']).agg(sum)
final = final.reset_index()
final.columns = ['date', 'tag', 'count']

final.head()

Unnamed: 0,date,tag,count
0,2016-12-22,,9
1,2016-12-22,1.0,1
2,2016-12-22,112.0,2
3,2016-12-22,118.0,1
4,2016-12-22,162.0,1


In [56]:
final['week'] = final.date.map(get_week_number)

In [57]:
final.head()

Unnamed: 0,date,tag,count,week
0,2016-12-22,,9,
1,2016-12-22,1.0,1,
2,2016-12-22,112.0,2,
3,2016-12-22,118.0,1,
4,2016-12-22,162.0,1,


In [67]:
# drop rows without weeks 
final = final[final['week'].isnull() == False]
# drop empty tags
final = final[final['tag'] != '']
# custom filters
final = final[final['tag'] != '1']
final = final[final['tag'] != 'rt']

In [69]:
top_tags = final[['tag', 'count']].groupby('tag').agg(sum).sort_values(by=['count'], ascending=False).head(25)
top_tags = top_tags.reset_index()
top_tags.head()

Unnamed: 0,tag,count
0,islam,18200
1,california,5668
2,sydney,5587
3,arabic,5584
4,uganda,5582


In [72]:
top_tags_l = set(top_tags.tag.tolist())

In [77]:
# Calculate a variable for inclusion
temp = final
temp['top'] = temp['tag'].apply(lambda x: x in top_tags_l)

# create a dataframe from only the top tweets
final_filt = temp[temp['top'] == True][['date', 'tag', 'count', 'week']]
print (len(final_filt))
final_filt.head()

1101


Unnamed: 0,date,tag,count,week
218,2016-12-28,arabic,34,-5.0
220,2016-12-28,california,34,-5.0
226,2016-12-28,followback,7,-5.0
228,2016-12-28,isis,21,-5.0
229,2016-12-28,islam,104,-5.0


In [78]:
# finally write these out
f = name_file_path('hashtag-counts-date.csv', processed_web_dir)
final.to_csv(f, index=False)

f_filt = name_file_path('hashtag-counts-date-filtered.csv', processed_web_dir)
final_filt.to_csv(f_filt, index=False)

f_tags = name_file_path('hashtag-counts.csv', processed_web_dir)
top_tags.to_csv(f_tags, index=False)

---

In [79]:
conn.close()

In [80]:
# Wide cells
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

# number of text characters per column
pd.set_option('max_colwidth', 140)