In [1]:
import pandas as pd
import numpy as np
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
import os
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
#from wordcloud import WordCloud
import pandas_profiling
import missingno as msno
import datetime
import re

## This file shows how we validate hypothesis 3:
### Time for positive/negative/anger/anxiety/sad tweets drop to half peak volume is longer in 2013 than 2012.

feel free to contact me(yl3963@columbia.edu) if you have any questions


## prepare dataset

In [7]:
d2013=pd.read_csv('d2013.csv')

In [8]:
d2013.head()

Unnamed: 0.1,Unnamed: 0,date_posted_gmt,message,date,year,month,day,week,days_from_event
0,96948,2013-01-01 03:47:00,rip to trayvon,2013-01-01 03:47:00,2013,1,1,1,-193
1,96949,2013-01-01 03:59:52,reflecting on 2012: sybrina fulton speaks on h...,2013-01-01 03:59:52,2013,1,1,1,-193
2,96950,2013-01-01 04:40:15,"bringing in 2013, we won't forget trayvon mart...",2013-01-01 04:40:15,2013,1,1,1,-193
3,96951,2013-01-01 04:42:07,i liked a @youtube video from @hoodnews http:/...,2013-01-01 04:42:07,2013,1,1,1,-193
4,96952,2013-01-01 04:45:19,got my hoodie on to cuh !ã¢â¬å@cuztatted got...,2013-01-01 04:45:19,2013,1,1,1,-193


####  function add_ymdw: add year month day  week columns for original dataframe 


In [9]:
def add_ymdw(df):
    df['date']=pd.to_datetime(df['date_posted_gmt'])

    df['year']=df['date'].apply(lambda x: x.year)

    df['month']=df['date'].apply(lambda x: x.month)

    df['day']=df['date'].apply(lambda x: x.day)
    
    def getweek(a,b,c):
        return datetime.date(a,b,c).isocalendar()[1]
    df['week'] = df.apply(lambda x: getweek(x['year'], x['month'],x['day']), axis=1)

    return df

In [15]:
d2013=add_ymdw(d2013)
# d2013

### split dataset according to their year
you can get 3 dataframes finally

In [16]:
d2013['message'] = d2013['message'].str.lower()

In [11]:
file = open("LIWC Dictionary/LIWC_dic.txt","r")

LIWC = file.readlines()
LIWC_dict = {}
for line in LIWC:
    line = line.split(',')
    category = line[0]
    words = line[1:]
    words[0] = words[0][1:]
    words[-1] = words[-1][:-2]
    LIWC_dict[category] = words

In [12]:
posemo = LIWC_dict['Posemo']
for i in range(len(posemo)):
    if posemo[i][-1] == '*':
        posemo[i] = r'\b' + posemo[i][:-1]
    else:
        posemo[i] = r'\b' + posemo[i] + r'\b'
posemo[0] = '\(:'
posemo[5] = ':\)'
reg_posemo = "|".join(posemo)
reg_posemo = re.compile(reg_posemo)

negemo = LIWC_dict['Negemo']
for i in range(len(negemo)):
    if negemo[i][-1] == '*':
        negemo[i] = r'\b' + negemo[i][:-1]
    else:
        negemo[i] = r'\b' + negemo[i] + r'\b'
negemo[0] = '\):'
negemo[5] = ':\('
reg_negemo = "|".join(negemo)
reg_negemo = re.compile(reg_negemo)

anx = LIWC_dict['Anx']
for i in range(len(anx)):
    if anx[i][-1] == '*':
        anx[i] = r'\b' + anx[i][:-1]
    else:
        anx[i] = r'\b' + anx[i] + r'\b'
reg_anx = "|".join(anx)
reg_anx = re.compile(reg_anx)

anger = LIWC_dict['Anger']
for i in range(len(anger)):
    if anger[i][-1] == '*':
        anger[i] = r'\b' + anger[i][:-1]
    else:
        anger[i] = r'\b' + anger[i] + r'\b'
reg_anger = "|".join(anger)
reg_anger = re.compile(reg_anger)

sad = LIWC_dict['Sad']
for i in range(len(sad)):
    if sad[i][-1] == '*':
        sad[i] = r'\b' + sad[i][:-1]
    else:
        sad[i] = r'\b' + sad[i] + r'\b'
reg_sad = "|".join(sad)
reg_sad = re.compile(reg_sad)

stress = LIWC_dict['Stress']
for i in range(len(stress)):
    if stress[i][-1] == '*':
        stress[i] = r'\b' + stress[i][:-1]
    else:
        stress[i] = r'\b' + stress[i] + r'\b'
reg_stress = "|".join(stress)
reg_stress = re.compile(reg_stress)

In [46]:
def check_category(s,category,dic):
    '''
    s type: string
    category type: list of string 
    '''
    if category == 'Posemo':
        if reg_posemo.search(s):
            return category
    elif category == 'Negemo':
        if reg_negemo.search(s):
            return category
    elif category == 'Anx':
        if reg_anx.search(s):
            return category
    elif category == 'Anger':
        if reg_anger.search(s):
            return category
    elif category == 'Sad':
        if reg_sad.search(s):
            return category
    elif category == 'Stress':
        if reg_stress.search(s):
            return category
    else:
        print("only valid for six category")
    return 'NA'

In [47]:
check_category('stressed','sad',LIWC_dict)

only valid for six category


'NA'

### add_tag_col
- df:  dataframe 
- cate: emotion category
- dic: the dictionary stores all words belong to the category
e.g. here tag='anx' means the message of the row contains anxiety words

In [156]:
def add_tag_col(df,cate,dic):
    for category in cate:
        df[category]=df.apply(lambda x: check_category(x['message'],category,dic),axis=1)
    return df

In [None]:
def find_half_peak_time_diff_2013(df,cate):
    
    def find_peak_time_2013(df,cate):
        return int(df[cate].values.argmax())
    
    peaktime=find_peak_time_2013(df,cate)
    
    def find_peak_value_2013(df,cate):
        return float(df.iloc[find_peak_time_2013(df,cate)][cate])
    
    peakval=find_peak_value_2013(df,cate)
    half=peakval/2
    
    for week in range(peaktime+1,172):
        if df.iloc[week][cate]<=half:
            for week2 in range(week+1,172):
                if df.iloc[week2][cate]<=half:
                     return abs(week-peaktime)
    return 0

In [157]:
def form_table(df,cate):
    df_cate_day=df_sample[(df_sample[cate]==cate) &(df_sample['days_from_event']>0)].groupby('days_from_event')['message'].count()
    all_res=df_sample[df_sample['days_from_event']>0].groupby('days_from_event')['message'].count()

    catelist=[]
    for day,val in df_cate_day.items():
        catelist.append((day,val))
    #     catelist
    alltweetlist=[]
    for day,val in all_res.items():
        alltweetlist.append((day,val))
    # alltweetlist
    final0=[]
    for i in range(0,172):
        for day1,catenum in catelist:
            if day1!=i:
                final0.append((i,0))
                continue
            for day2,tweetnum in alltweetlist:
                if day1==day2:
                    final0.append((day1,100*catenum/tweetnum))
                    continue

    final0=list(set(final0))
    final0.sort()

    final1={}
    for a,b in final0:
        if a not in final1:
            final1[a]=b
        else:
            final1[a]+=b

    cate_tb=pd.DataFrame(list(final1.items()), columns=['day', cate])
    return cate_tb

In [196]:
# cate='Anx'
# emtions=[cate]
# df=d2013.sample(n=20000)
# df=add_tag_col(df,emotions,LIWC_dict) 
# res=form_table(df,cate)
# plt.figure(figsize=(20,8))
# # gca stands for 'get current axis'
# ax = plt.gca()
# plt.plot(res['Anx'])

In [186]:
def bootstrap2013(df,cate,epoch):
    
    emotion=[]
    cate='Sad'
    for epoch in range(epoch):
        df_sample=df.sample(n=30000)
        emotions = [cate]
        df_sample=add_tag_col(df_sample,emotions,LIWC_dict) 
        df_sample=form_table(df_sample,cate)
        peak_diff=find_half_peak_time_diff_2013(df_sample,cate)
        emotion.append(peak_diff)
        if epoch%5==0:
            print(epoch,peak_diff)
    return emotion,sum(emotion)/len(emotion)

## sad in 2013

In [187]:
bootstrap2013(d2013,'Sad',30)

0 1
5 1
10 1
15 1
20 1
25 1


([1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 1.0)

## stress in 2013

In [188]:
bootstrap2013(d2013,'Stress',30)

0 1
5 1
10 1
15 1
20 1
25 1


([1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 1.0)

### anger in 2013

In [189]:
bootstrap2013(d2013,'Anger',30)

0 1
5 1
10 1
15 1
20 1
25 1


([1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 1.0)

### Anxiety in 2013

In [190]:
bootstrap2013(d2013,'Anxiety',30)

0 1
5 1
10 1
15 1
20 1
25 1


([1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 1.0)

### posemo in 2013

In [191]:
bootstrap2013(d2013,'Posemo',30)

0 1
5 1
10 1
15 1
20 1
25 1


([1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 1.0)

### negemo in 2013

In [192]:
bootstrap2013(d2013,'Negemo',30)

0 1
5 1
10 1
15 1
20 1
25 1


([1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 1.0)