In [13]:
import numpy as np
import pandas as pd
import pymc3 as pm
import matplotlib.pyplot as plt
import seaborn as sns
import csv

%matplotlib inline

In [23]:
from os import listdir
from os.path import isfile, join
# Directory whose regular files are loaded as the per-tweet data files.
path = '../RetweetDataAOAS/retweet_data/'
# os.listdir returns entries in arbitrary order; sorting keeps the position of
# each tweet file (and therefore every per-tweet index used later) stable
# across runs and machines.
root_tweet_names = sorted(f for f in listdir(path) if isfile(join(path, f)))
num_tweets = len(root_tweet_names)

In [24]:
# Builds tweet_dfs: a list with one preprocessed DataFrame per tweet file, in
# the same order as root_tweet_names.
fields = ['RetweetCount', 'UserId', 'ScreenName', 'FollowerCount',
          'DistanceFromRoot', 'Time', 'ParentScreenName', 'Text']
tweet_dfs = []
for file_name in root_tweet_names:
    # Files are read as headerless, tab-separated text with quoting disabled.
    tweet_df = pd.read_csv(join(path, file_name), sep="\t", header=None,
                           quoting=csv.QUOTE_NONE, names=fields,
                           encoding="ISO-8859-1")

    tweet_df['Time'] = pd.to_datetime(tweet_df['Time'])

    # Map every screen name to the row label where it occurs (the last
    # occurrence wins for repeated names), then resolve each row's parent
    # screen name to that label. Parents that do not appear in the file end
    # up as NaN in 'ParentDfIndex'.
    screen_name_index = dict(zip(tweet_df['ScreenName'], tweet_df.index))
    tweet_df['ParentDfIndex'] = tweet_df['ParentScreenName'].map(screen_name_index)

    tweet_dfs.append(tweet_df)

In [25]:
# Number of rows in each loaded tweet DataFrame, in load order.
list(map(len, tweet_dfs))

[37,
 74,
 74,
 161,
 304,
 101,
 48,
 108,
 135,
 44,
 97,
 92,
 31,
 22,
 107,
 127,
 731,
 176,
 70,
 462,
 167,
 611,
 127,
 769,
 164,
 85,
 212,
 370,
 533,
 345,
 68,
 101,
 26,
 256,
 1134,
 506,
 271,
 28,
 118,
 60,
 24,
 288,
 92,
 1261,
 203,
 31,
 189,
 90,
 136,
 306,
 608,
 68]

In [18]:
# Returns a dictionary of reaction times S_j^x keyed by user id
def generate_reaction_times(tweet_df):
    """Compute each row's delay relative to its parent row.

    Parameters
    ----------
    tweet_df : pandas.DataFrame
        Frame with 'UserId', 'Time' and 'ParentDfIndex' columns, where
        'ParentDfIndex' holds the row label of the parent row (NaN when the
        parent is not present in the frame).

    Returns
    -------
    dict
        Maps 'UserId' to ``row Time - parent Time`` for every row whose label
        is greater than 0 and whose parent is known. If a user id occurs on
        several rows, the value from the last such row is kept.
    """
    reaction_times = {}
    times = tweet_df['Time']
    rows = zip(tweet_df.index, tweet_df['UserId'], tweet_df['ParentDfIndex'])
    for index, user_id, parent_index in rows:
        # Rows with label 0 (or below) are excluded, as in the original
        # `index > 0` check.
        if not index > 0:
            continue
        # A parent that was not found in the frame leaves NaN here; there is
        # no parent time to subtract, so the row is skipped instead of
        # raising on the lookup.
        if pd.isnull(parent_index):
            continue
        # The column is float whenever it contains NaN; cast back to an int
        # label before the lookup.
        reaction_times[user_id] = times.at[index] - times.at[int(parent_index)]
    return reaction_times

In [19]:
# Returns a dictionary of M_j^x keyed by user id
def generate_number_of_follower_who_retweet(tweet_df):
    """Count, for every user, how many rows name that user's row as parent.

    Parameters
    ----------
    tweet_df : pandas.DataFrame
        Frame with 'UserId' and 'ParentDfIndex' columns, where
        'ParentDfIndex' holds the row label of the parent row (NaN when the
        parent is not present in the frame).

    Returns
    -------
    dict
        Maps every 'UserId' in the frame to the number of rows whose parent
        row belongs to that user (0 for users that are never a parent).
    """
    user_ids = tweet_df['UserId']
    # Initialise every user up front so a child row that appears before its
    # parent row cannot hit a missing key.
    number_of_follower_who_retweet = {user_id: 0 for user_id in user_ids}
    for parent_index in tweet_df['ParentDfIndex']:
        # Rows without a known parent contribute to no one's count.
        if pd.isnull(parent_index):
            continue
        # The column is float whenever it contains NaN; cast back to an int
        # label before the lookup.
        parent_user_id = user_ids.at[int(parent_index)]
        number_of_follower_who_retweet[parent_user_id] += 1
    # NOTE(review): unlike generate_reaction_times, the row with label 0 is
    # not skipped explicitly; it is only excluded when its parent is missing.
    return number_of_follower_who_retweet

In [26]:
# log_s_j_x[k] is the list of log reaction times (in seconds) for
# tweet_dfs[k], sorted ascending.
log_s_j_x = []
for tweet_df in tweet_dfs:
    s_j_x = sorted(generate_reaction_times(tweet_df).values())
    # total_seconds() covers the whole duration; the .seconds attribute is
    # only the within-day component and silently drops whole days.
    # NOTE(review): a reaction time of exactly zero seconds yields -inf here
    # and a negative one yields NaN — decide how those should be handled
    # before fitting.
    log_s_j_x.append([np.log(delta.total_seconds()) for delta in s_j_x])

In [None]:
# Scratch pymc3 code
# The per-tweet lists in log_s_j_x have different lengths, so flatten them into
# one observation vector plus a parallel vector giving each observation's
# tweet position; the per-tweet parameters are then indexed by that position.
tweet_index = np.repeat(np.arange(len(log_s_j_x)),
                        [len(times) for times in log_s_j_x])
log_s_j_x_flat = np.array([value for times in log_s_j_x for value in times],
                          dtype=float)

with pm.Model() as twitter_model:
    # global model parameters
    # Time-related hyperparameters
    alpha = pm.Normal('alpha', mu=0, sd=100)
    sigma_squared_delta = pm.InverseGamma('sigma_squared_delta', alpha=0.5, beta=0.5)
    log_a_tau = pm.Normal('log_a_tau', mu=0, sd=10)
    b_tau = pm.Gamma('b_tau', alpha=1, beta=.002)

    # Graph-related hyperparameters
    sigma_squared_b = pm.InverseGamma('sigma_squared_b', alpha=0.5, beta=0.5)
    beta_0 = pm.Normal('beta_0', mu=0, tau=1/sigma_squared_b)
    beta_F = pm.Normal('beta_F', mu=0, tau=1/sigma_squared_b)
    beta_d = pm.Normal('beta_d', mu=0, tau=1/sigma_squared_b)

    # log-normal model for reaction times, nonrecursive...
    # log_a_tau is a model variable, so exponentiate it with the symbolic
    # pm.math.exp rather than numpy.
    a_tau = pm.math.exp(log_a_tau)
    tau_x_squared = pm.InverseGamma('tau_x_squared', alpha=a_tau, beta=b_tau, shape=num_tweets)
    # pm.Normal takes its mean as `mu`.
    alpha_x = pm.Normal('alpha_x', mu=alpha, tau=1/sigma_squared_delta, shape=num_tweets)
    # Data is attached with `observed=<data>`.
    # NOTE(review): tau_x_squared is declared with an inverse-gamma prior like
    # the sigma_squared_* variance parameters above, so it is treated as a
    # variance and inverted to a precision here — confirm against the intended
    # model specification.
    log_rx_times = pm.Normal('log_s_j_x', mu=alpha_x[tweet_index],
                             tau=1/tau_x_squared[tweet_index],
                             observed=log_s_j_x_flat)
    
    
    
    
    
    