In [1]:
import numpy as np
import pandas as pd
import pymc3 as pm
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import math

%matplotlib inline



In [2]:
from os import listdir
from os.path import isfile, join
path = '../RetweetDataAOAS/retweet_data/'
root_tweet_names = [f for f in listdir(path) if isfile(join(path, f))]
num_tweets = len(root_tweet_names)

In [3]:
# Produces a dictionary of dataframes for each tweetfile, with initial 
# preprocessing
fields = ['RetweetCount', 'UserId', 'ScreenName', 'FollowerCount', 
          'DistanceFromRoot','Time', 'ParentScreenName', 'Text']
tweet_dfs = []
for i in range(num_tweets):
    tweet_df = pd.read_csv(path+root_tweet_names[i], sep="\t", header=None, 
                         quoting=csv.QUOTE_NONE, names=fields, encoding = "ISO-8859-1")
    
    tweet_df['Time'] = pd.to_datetime(tweet_df['Time'])
    tweet_df["UserId"] = pd.to_numeric(tweet_df["UserId"], errors = 'ignore')

    screen_name_index = {}
    for index, row in tweet_df.iterrows():
        screen_name_index[row['ScreenName']] = index
    tweet_df['ParentDfIndex'] = tweet_df['ParentScreenName'].map(screen_name_index)
    
    tweet_dfs.append(tweet_df)

In [4]:
# Returns a dictionary of reaction times S_j^x keyed by user id
# Also modify the df to include a ReactionTime column
# S_j^x = T_j^x - T_P(j)^x is the time of the user's tweet minus its parent

def generate_reaction_times(tweet_df):
    reaction_times = {}
    for index, row in tweet_df.iterrows():
#         if index > 0: # ignore root tweet
        reaction_time = row['Time'] - tweet_df.at[row['ParentDfIndex'],
                                                  'Time']
        reaction_times[row['UserId']] = reaction_time
        tweet_df.loc[index,"ReactionTime"] = reaction_time
    return reaction_times

In [5]:
# modifies the dataframe to include M_j^x
# M_j^x for tweet x and user j is the number of j's followers who retweet x

def generate_number_of_follower_who_retweet(tweet_df):
    number_of_follower_who_retweet = {}
    
    for index, row in tweet_df.iterrows():
        parent_user_id = tweet_df.at[row['ParentDfIndex'], 'UserId']
        if parent_user_id not in number_of_follower_who_retweet:
            number_of_follower_who_retweet[parent_user_id] =1
        else:
            number_of_follower_who_retweet[parent_user_id] += 1
    
    # add to dataFrame
    for index, row in tweet_df.iterrows():
        if row['UserId'] in number_of_follower_who_retweet:
            count = number_of_follower_who_retweet[row['UserId']]
            tweet_df.loc[index,"FollowersRetweeted"] = count
        else: 
            tweet_df.loc[index,"FollowersRetweeted"] = 0

    return number_of_follower_who_retweet

In [6]:
log_s_j_x = []
for i in range(num_tweets):
    s_j_x = sorted(generate_reaction_times(tweet_dfs[i]).values())
    generate_number_of_follower_who_retweet(tweet_dfs[i])
    log_s_j_x.append([np.log(i.seconds) for i in s_j_x])

In [37]:
# count number of unique users

t = tweet_dfs[0]['UserId']
for i in range(1,len(tweet_dfs)):
    t = pd.concat([t, tweet_dfs[i]['UserId']])
t.nunique()

11604

In [39]:
tweet_dfs[1].head(20)

Unnamed: 0,RetweetCount,UserId,ScreenName,FollowerCount,DistanceFromRoot,Time,ParentScreenName,Text,ParentDfIndex,ReactionTime,FollowersRetweeted
0,0,15097615,hilaryr,40041,0,2012-04-12 03:05:42,hilaryr,oh and @AnnDRomney welcome to Twitter. You wi...,0,00:00:00,79.0
1,1,236026761,DylanByers,4332,1,2012-04-12 03:05:50,hilaryr,RT @hilaryr: oh and @AnnDRomney welcome to Twi...,0,00:00:08,0.0
2,2,21316253,ZekeJMiller,12002,1,2012-04-12 03:06:04,hilaryr,RT @hilaryr: oh and @AnnDRomney welcome to Twi...,0,00:00:22,0.0
3,3,21234528,EmilyABC,4070,1,2012-04-12 03:06:10,hilaryr,! RT @hilaryr: oh and @AnnDRomney welcome to T...,0,00:00:28,2.0
4,4,18430529,COWBOYSDODGERS,110,1,2012-04-12 03:06:42,hilaryr,RT @hilaryr: oh and @AnnDRomney welcome to Twi...,0,00:01:00,0.0
5,5,19024627,joshgreenman,3221,1,2012-04-12 03:06:48,hilaryr,RT @hilaryr: oh and @AnnDRomney welcome to Twi...,0,00:01:06,0.0
6,6,19954098,carpdd,289,1,2012-04-12 03:07:19,hilaryr,RT @hilaryr: oh and @AnnDRomney welcome to Twi...,0,00:01:37,0.0
7,7,442649949,WillDarrell,22,1,2012-04-12 03:07:36,hilaryr,RT @hilaryr: oh and @AnnDRomney welcome to Twi...,0,00:01:54,0.0
8,8,19523851,janoid,396,1,2012-04-12 03:08:17,hilaryr,With friends like you...RT @hilaryr oh and @An...,0,00:02:35,0.0
9,9,28628850,greggutfeld,76996,1,2012-04-12 03:09:02,hilaryr,oh that's precious! RT @hilaryr oh and @AnnDRo...,0,00:03:20,34.0


In [29]:
# pymc3 model 
with pm.Model() as twitter_model:
    # global model parameters
    # Time-related hyperparameters
    alpha = pm.Normal('alpha', mu=0, sd=100)
    sigma_squared_delta = pm.InverseGamma('sigma_squared_delta', alpha=2, beta=2)
    log_a_tau = pm.Normal('log_a_tau', mu=0, sd=10)
    b_tau = pm.Gamma('b_tau', alpha=1, beta=.002)
    
    # Graph-related hyperparameters
    # binom model for graph structure
    sigma_squared_b = pm.InverseGamma('sigma_squared_b', alpha=0.5, beta=0.5, testval=10000)
    beta_0 = pm.Normal('beta_0', mu=0, sd=100)
    beta_F = pm.Normal('beta_F', mu=0, sd=100)
    beta_d = pm.Normal('beta_d', mu=0, sd=100)
    
    # log-normal model for reaction times, nonrecursive...
    a_tau = pm.Deterministic('a_tau', pm.math.exp(log_a_tau))
    for i in range(num_root_tweets):
        t_x = pm.InverseGamma('tau_x_squared_' + str(i), alpha=a_tau, beta=b_tau)
        a_x = pm.Normal('alpha_x_' + str(i), mu=alpha, tau=1/sigma_squared_delta)        
        l_x = pm.Normal('log_s_j_x_' + str(i), mu=a_x, tau=t_x**0.5, observed=log_s_j_x[i])
        
        # binom model for user
        for j, row in tweet_dfs[i].iterrows():
        f_j_x = row['FollowerCount']
        d_j_x = row['DistanceFromRoot']
        b_j_x = beta_0 + beta_F*math.log(f_j_x + 1) + beta_d*math.log()
        mu_j = beta_0 + beta_f*

ValueError: setting an array element with a sequence.