In [17]:
import numpy as np
import scipy as sc
import pandas as pd
import pymc3 as pm
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import itertools
from os import listdir
from os.path import isfile, join
import xlrd
%matplotlib inline

In [40]:
path = '../RetweetDataAOAS/retweet_data/'
root_tweet_names = [f for f in listdir(path) if isfile(join(path, f))]
# Produces a dictionary of dataframes for each tweetfile, with initial 
# preprocessing
fields = ['RetweetCount', 'UserId', 'ScreenName', 'FollowerCount', 
          'DistanceFromRoot','Time', 'ParentScreenName', 'Text']
tweet_dfs = {}
for i in range(len(root_tweet_names)):
    tweet_df = pd.read_csv(path+root_tweet_names[i], sep="\t", header=None, 
                         quoting=csv.QUOTE_NONE, names=fields, encoding = "ISO-8859-1")
    
    tweet_df['Time'] = pd.to_datetime(tweet_df['Time'])

    screen_name_index = {}
    for index, row in tweet_df.iterrows():
        screen_name_index[row['ScreenName']] = index
    tweet_df['ParentDfIndex'] = tweet_df['ParentScreenName'].map(screen_name_index)
    tweet_df[['FollowerCount','UserId']] = tweet_df[['FollowerCount','UserId']].apply(pd.to_numeric, errors='coerce')
    tweet_df[['FollowerCount','UserId']] = tweet_df[['FollowerCount','UserId']].fillna(0)
    tweet_dfs[i] = tweet_df

In [38]:
tweet_name_to_index = {}
for i in range(len(root_tweet_names)):
    tweet_name_to_index[root_tweet_names[i]] = i
for key in tweet_name_to_index.keys():
    print key

Source_followers_tweet_0137_Therealkiss.txt
Source_followers_tweet_0108_newtgingrich.txt
Source_followers_tweet_0533_BarackObama.txt
Source_followers_tweet_0349_KimKardashian.txt
Source_followers_tweet_0785_KimKardashian.txt
Source_followers_tweet_0069_bobatl.txt
Source_followers_tweet_0101_myfabolouslife.txt
Source_followers_tweet_0024_pbsgwen.txt
Source_followers_tweet_0176_AnnCoulter.txt
Source_followers_tweet_1134_JLin7.txt
Source_followers_tweet_0608_rickyrozay.txt
Source_followers_tweet_0294_myfabolouslife.txt
Source_followers_tweet_0462_AnnDRomney.txt
Source_followers_tweet_0506_billmaher.txt
Source_followers_tweet_0611_BarackObama.txt
Source_followers_tweet_0189_sethmeyers21.txt
Source_followers_tweet_0167_rickyrozay.txt
Source_followers_tweet_0031_CNET.txt
Source_followers_tweet_0044_JonnyBones.txt
Source_followers_tweet_0127_hilaryr.txt
Source_followers_tweet_0068_CharlesMBlow.txt
Source_followers_tweet_0026_realMickFoley.txt
Source_followers_tweet_0127_newtgingrich.txt
Sourc

In [42]:
def format_partition_file_name(name):
    root = name.split('.')
    items = root[0].split('_')
    items[-2], items[-1] = items[-1], items[-2]
    return ".".join(["_".join(items), root[-1]])

path = '../RetweetDataAOAS/Partition/'
partition_names = [f for f in listdir(path) if isfile(join(path, f))]
partitions = {}
# for i in range(len(partition_names)):
for i in range(2, 3):
    partitions[i] = {}
    partition_df = pd.read_excel(path+partition_names[i], encoding = "ISO-8859-1")
    for index, row in partition_df.iterrows():
        training_file_name = format_partition_file_name(row['Training'])
        prediction_file_name = format_partition_file_name(row['Prediction'])
        partitions[i][tweet_name_to_index[training_file_name]] = True
        partitions[i][tweet_name_to_index[prediction_file_name]] = False
    print partitions[i]
    

KeyError: u'Source_followers_tweet_0068_bobatl.txt'

In [22]:
# Returns a dictionary of reaction times S_j^x keyed by user id
def generate_reaction_times(tweet_df):
    reaction_times = []
    for index, row in tweet_df.iterrows():
        if index > 0:
            reaction_time = row['Time'] - tweet_df.at[row['ParentDfIndex'],
                                                      'Time']
            reaction_times.append(reaction_time)
    return reaction_times

In [8]:
# Returns a dictionary of M_j^x keyed by user id
def generate_number_of_follower_who_retweet(tweet_df):
    number_of_follower_who_retweet = {}
    for index, row in tweet_df.iterrows():
        if row['UserId'] not in number_of_follower_who_retweet:
            number_of_follower_who_retweet[row['UserId']] = 0
        parent_user_id = tweet_df.at[row['ParentDfIndex'], 'UserId']
        number_of_follower_who_retweet[parent_user_id] += 1
    return number_of_follower_who_retweet

In [None]:
tweet_df_1 = tweet_dfs[1]
s_j_x = sorted(generate_reaction_times(tweet_df_1).values())
log_s_j_x = [np.log(i.seconds) for i in s_j_x]
tweet_df_1_users = list(tweet_df_1['UserId'])
m_j_x_dic = generate_number_of_follower_who_retweet(tweet_df_1)

In [9]:
 def f(x, y):
    return x*y
def multiply(elements):
    return reduce(f, elements)



In [38]:
# Scratch pymc3 code
with pm.Model() as tweet_model:
    # global model parameters
    alpha = pm.Normal('alpha', mu=0, sd=100)
    sigma_squared_delta = pm.InverseGamma('sigma_squared_delta', alpha=2, beta=2)
    log_a_tau = pm.Normal('log_a_tau', mu=0, sd=10)
    b_tau = pm.Gamma('b_tau', alpha=1, beta=0.002)
#     beta_0 = pm.Normal('beta_0', mu=0, sd=100)
#     beta_f = pm.Normal('beta_f', mu=0, sd=100)
#     beta_d = pm.Normal('beta_d', mu=0, sd=100)
#     sigma_squared_b = pm.InverseGamma('sigma_squared_b', alpha=0.5, beta=0.5)
    a_tau = pm.math.exp(log_a_tau)
    
    # tweet specific parameters, keyed by tweet x
    tau_squared_dic = {}
    alpha_dic = {}
    log_s_probability_dic = {}
    m_probability_dic = {}
    logit_b_dic = {}
    likelihood_training_tweets = {}
    likelihood_prediction_tweets = {}
    
    num_root_tweets = len(tweet_dfs)
    num_root_tweets = 2
    for x in range(num_root_tweets):
        tweet_df = tweet_dfs[x]
        s = generate_reaction_times(tweet_df)
        log_s = [np.log(i.seconds) for i in s]        
        tweet_df_users = list(tweet_df['UserId'])
        m_dic = generate_number_of_follower_who_retweet(tweet_df)

        # log-normal model for reaction times
        tau_squared_dic[x] = pm.InverseGamma('tau_{}_squared'.format(x), alpha=a_tau, beta=b_tau)
        alpha_dic[x] = pm.Normal('alpha_{}'.format(x), mu=alpha, tau=1/sigma_squared_delta)
        
        log_s_probability_dic[x] = {}
        for j in range(1, len(tweet_df_users)):
            log_s_probability_dic[x][j] = pm.Normal('log_s_{}_{}'.format(x,j), mu=alpha_dic[x],
                                                tau=1/tau_squared_dic[x], observed=log_s[j-1])
        
        # binomial model for retweet graph structure
        logit_b_dic[x] = {}
        m_probability_dic[x] = {}        
        for j, row in tweet_df.iterrows():
            user = tweet_df_users[j]
            f_j_x = int(row['FollowerCount'])
            d_j_x = row['DistanceFromRoot']
            mu_j_x = pm.Deterministic('mu_{}_{}'.format(x,j), beta_0 + beta_f * np.log(f_j_x + 1) + beta_d * np.log(d_j_x + 1))
            logit_b_dic[x][j] = pm.Normal('logit_b_{}_{}'.format(x,j), mu=mu_j_x, tau=1/sigma_squared_b)            
            b_j_x = pm.Deterministic('b_{}_{}'.format(x,j), pm.math.x(logit_b_dic[x][j]))
            m_probability_dic[x][j] = pm.Binomial('m_{}_{}'.format(x, j), n=f_j_x, p=b_j_x,
                                             observed=m_dic[user])
        
        # TODO - use partitions to check if a tweet is used for training or prediction
        # if training
        likelihood_training_tweets[x] = m_probability_dic[x][0]
        for j in range(1, len(tweet_df_users)):
            likelihood_training_tweets[x] = likelihood_training_tweets[x] * log_s_probability_dic[x][
                                            j] * m_probability_dic[x][j]        
        if prediction
    def f(x, y):
        return x*y
    def multiply(elements):
        return reduce(f, elements)
    
    phi = [alpha, sigma_squared_delta, log_a_tau, b_tau, beta_0, beta_f, beta_d, sigma_squared_b]
    p_phi = pm.Deterministic('p_phi', multiply(phi))
    # multiplying along x
    p_alpha = pm.Deterministic('p_alpha', reduce(f, alpha_dic.values()))
    # multiplying along x
    p_tau = pm.Deterministic('p_tau', reduce(f, tau_squared_dic.values()))
    # multiplying along x,j
    m_to_collapse = [m_x.values() for m_x in m_probability_dic.values()]
    collapsed_m = list(itertools.chain(*m_to_collapse))
    p_m = pm.Deterministic('p_m', reduce(f, collapsed_m))
    # multiplying along x,j
    b_to_collapse = [b_x.values() for b_x in logit_b_dic.values()]
    collapsed_b = list(itertools.chain(*b_to_collapse))
    p_b = pm.Deterministic('p_b', reduce(f, collapsed_b))
    # multiplying along trained tweets (currently x)
    p_train_tweets = pm.Deterministic('p_train_tweets', reduce(f, likelihood_training_tweets.values()))
    # multiplying along prediction tweets (currently x)
    # not implemented yet
    p_prediction_tweets = pm.Deterministic('p_prediction_tweets', reduce(lambda x, y: x*y, likelihood_prediction_tweets.values()))
    p_list = [p_phi, p_alpha, p_tau, p_m, p_b, p_train_tweets, p_prediction_tweets]
    posterior = pm.Deterministic('posterior', reduce(lam))
    

In [None]:
with tweet_model:
    trace = pm.sample(1000, tune=2000, cores=4)