In [29]:
import numpy as np
import pandas as pd
import pymc3 as pm
import matplotlib.pyplot as plt
import seaborn as sns
import csv
%matplotlib inline

In [30]:
from os import listdir
from os.path import isfile, join
# Directory containing the retweet data files (relative to the notebook).
path = '../RetweetDataAOAS/retweet_data/'
# os.listdir returns entries in arbitrary order; sort the names so that the
# integer positions used as keys of tweet_dfs refer to the same file on every
# run and on every machine. Sub-directories are filtered out by isfile.
root_tweet_names = sorted(f for f in listdir(path) if isfile(join(path, f)))

In [109]:
# Produces a dictionary of dataframes for each tweetfile, with initial preprocessing.
# Keys of tweet_dfs are the integer positions of the files in root_tweet_names.
fields = ['RetweetCount', 'UserId', 'ScreenName', 'FollowerCount', 
          'DistanceFromRoot','Time', 'ParentScreenName', 'Text']
tweet_dfs = {}
for i, file_name in enumerate(root_tweet_names):
    # Files are read as header-less, tab-separated tables; QUOTE_NONE keeps
    # quote characters inside the tweet text from being treated as delimiters.
    tweet_df = pd.read_csv(path + file_name, sep="\t", header=None,
                           quoting=csv.QUOTE_NONE, names=fields)

    tweet_df['Time'] = pd.to_datetime(tweet_df['Time'])

    # Map each screen name to the index label of its row. Built with zip over
    # the column instead of iterrows (which constructs a Series per row); when
    # a screen name occurs more than once the last occurrence wins, exactly as
    # with the row-by-row loop.
    screen_name_index = dict(zip(tweet_df['ScreenName'], tweet_df.index))
    # ParentDfIndex is the row label of the parent's row in this same frame;
    # it is NaN when the parent screen name does not appear in the file.
    tweet_df['ParentDfIndex'] = tweet_df['ParentScreenName'].map(screen_name_index)

    tweet_dfs[i] = tweet_df

In [88]:
# Display the preprocessed frame stored under key 1 (the file at position 1 of
# root_tweet_names) to eyeball the parsed columns and the derived ParentDfIndex.
tweet_dfs[1]

Unnamed: 0,RetweetCount,UserId,ScreenName,FollowerCount,DistanceFromRoot,Time,ParentScreenName,Text,ParentDfIndex
0,0,164412502,pbsgwen,23673,0,2012-04-13 17:01:00,pbsgwen,Cory Booker has never worked a day in his life...,0
1,1,13367172,shani_o,2030,1,2012-04-13 17:01:13,pbsgwen,Dying RT @pbsgwen: Cory Booker has never worke...,0
2,2,185747652,edenWXIA,928,1,2012-04-13 17:01:36,pbsgwen,RT @pbsgwen: Cory Booker has never worked a da...,0
3,3,21728303,keithboykin,8048,1,2012-04-13 17:02:28,pbsgwen,So the Ann Romney & Cory Bookers tweets have c...,0
4,4,26589498,drugmonkeyblog,1987,1,2012-04-13 17:03:40,pbsgwen,RT @pbsgwen: Cory Booker has never worked a da...,0
5,5,499894426,neivet2,23,1,2012-04-13 17:04:14,pbsgwen,RT @pbsgwen: Cory Booker has never worked a da...,0
6,6,442649949,WillDarrell,22,1,2012-04-13 17:04:22,pbsgwen,RT @pbsgwen: Cory Booker has never worked a da...,0
7,7,477450920,AdamSmithereens,11,1,2012-04-13 17:04:37,pbsgwen,RT @pbsgwen: Cory Booker has never worked a da...,0
8,8,18397364,KyleYounger,592,1,2012-04-13 17:05:40,pbsgwen,RT @pbsgwen: Cory Booker has never worked a da...,0
9,9,309190582,JustCouch,306,1,2012-04-13 17:06:42,pbsgwen,RT @pbsgwen: Cory Booker has never worked a da...,0


In [92]:
# Returns a dictionary of reaction times S_j^x keyed by user id
def generate_reaction_times(tweet_df):
    """Compute, per user, the delay between a row's tweet and its parent's tweet.

    Parameters
    ----------
    tweet_df : pandas.DataFrame
        Frame with columns 'UserId', 'Time' and 'ParentDfIndex', where
        'ParentDfIndex' holds the index label of the parent row in the same
        frame (NaN when the parent is not present).

    Returns
    -------
    dict
        Maps each 'UserId' to ``Time - Time_of_parent_row``. If a user id
        appears in several rows, the value from the last such row is kept.
        Rows without a parent (NaN 'ParentDfIndex') are skipped, so their user
        id is absent unless another row of the same user has a parent.
    """
    times = tweet_df['Time']
    reaction_times = {}
    # Iterate the three needed columns directly; iterrows would build a full
    # Series for every row just to read three values from it.
    for user_id, time, parent_index in zip(tweet_df['UserId'], times,
                                           tweet_df['ParentDfIndex']):
        # A missing parent has no defined reaction time; looking it up with
        # .at would raise KeyError on the NaN label.
        if pd.isna(parent_index):
            continue
        reaction_times[user_id] = time - times.at[parent_index]
    return reaction_times

In [108]:
# Returns a dictionary of M_j^x keyed by user id
def generate_number_of_follower_who_retweet(tweet_df):
    """Count, per user, how many rows name that user's row as their parent.

    Parameters
    ----------
    tweet_df : pandas.DataFrame
        Frame with columns 'UserId' and 'ParentDfIndex', where 'ParentDfIndex'
        holds the index label of the parent row in the same frame (NaN when
        the parent is not present).

    Returns
    -------
    dict
        Maps every 'UserId' in the frame to the number of rows whose
        'ParentDfIndex' points at a row of that user. Users nobody retweeted
        from map to 0. Rows with a NaN 'ParentDfIndex' contribute no count.
    """
    user_ids = tweet_df['UserId']
    # Create every counter up front. Creating them lazily while walking the
    # rows raises KeyError as soon as a child row precedes its parent's row.
    number_of_follower_who_retweet = dict.fromkeys(user_ids, 0)
    # NOTE(review): a row whose ParentDfIndex points at itself (the root row
    # in the sample output has ParentScreenName == ScreenName) is counted
    # toward its own total -- confirm this is intended for M_j^x.
    for parent_index in tweet_df['ParentDfIndex']:
        # Rows whose parent is not in the frame cannot be attributed to anyone.
        if pd.isna(parent_index):
            continue
        number_of_follower_who_retweet[user_ids.at[parent_index]] += 1
    return number_of_follower_who_retweet

In [None]:
# Scratch pymc3 code
# pm.Model has to be instantiated: the `with` statement needs a model instance
# as its context manager, and the random variables declared inside the block
# are registered on that instance.
with pm.Model() as model:
    # global model parameters
    alpha = pm.Normal('alpha', mu=0, sd=100)
    sigma_squared_delta = pm.InverseGamma('sigma_squared_delta', alpha=0.5, beta=0.5)
    log_a_tau = pm.Normal('log_a_tau', mu=0, sd=10)
    # TODO: prior for b_tau is not specified yet.
#     b_tau = pm.Gamma('b_tau', )
    beta_0 = pm.Normal('beta_0', mu=0, sd=100)
    beta_F = pm.Normal('beta_F', mu=0, sd=100)
    beta_d = pm.Normal('beta_d', mu=0, sd=100)
    sigma_squared_b = pm.InverseGamma('sigma_squared_b', alpha=0.5, beta=0.5)