In [None]:
import pandas as pd
import numpy as np
import junoutils
import json

In [None]:
def calculate_transition_matrix_fast(df, states=None):
    """Count origin -> destination transitions into a dense matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per observed transition. Must contain the columns
        ``origin``, ``destination`` and ``created``.
    states : sequence, optional
        Ordered state labels; position ``i`` in this sequence is row/column
        ``i`` of the result. Defaults to the sorted union of the labels found
        in ``origin`` and ``destination``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(states), len(states))`` where entry
        ``[i, j]`` is the number of rows (with a non-null ``created``) going
        from ``states[i]`` to ``states[j]``.

    Raises
    ------
    KeyError
        If ``df`` contains an origin or destination that is not in ``states``.
    """
    if states is None:
        # Union of every label seen on either side of a transition. NaN labels
        # are dropped: groupby ignores them and they cannot be sorted with str.
        states = sorted(
            set(df['origin'].dropna().unique())
            | set(df['destination'].dropna().unique())
        )

    index_of = {state: i for i, state in enumerate(states)}

    counts = df.groupby(['origin', 'destination'], as_index=False)['created'].count()

    unknown = (set(counts['origin']) | set(counts['destination'])) - set(index_of)
    if unknown:
        raise KeyError(
            "transition states missing from `states`: %s" % sorted(unknown, key=str)
        )

    tm = np.zeros((len(states), len(states)))

    # Rows come from the origin label and columns from the *destination* label.
    # (Deriving both from `origin` would put every count on the diagonal.)
    row_idx = counts['origin'].map(index_of).astype(int).values
    col_idx = counts['destination'].map(index_of).astype(int).values

    # (origin, destination) pairs are unique after the groupby, so one
    # vectorised assignment fills the matrix without a per-row apply.
    tm[row_idx, col_idx] = counts['created'].values

    return tm


def calculate_comparative_user_matrix(email, df, states, transition_matrix):
    """Compare one user's transition counts with an all-user baseline.

    Parameters
    ----------
    email : str
        Value of the ``email`` column identifying the user.
    df : pandas.DataFrame
        Transition rows for all users (``email``, ``origin``, ``destination``,
        ``created``).
    states : sequence
        Ordered state labels defining the matrix axes.
    transition_matrix : array-like
        Baseline transition counts with shape ``(len(states), len(states))``.

    Returns
    -------
    numpy.ndarray
        ``user_counts / baseline_counts`` with each row rescaled to sum to 1.
        Cells whose baseline count is 0, and rows with no transitions at all,
        are 0 rather than NaN/inf.
    """
    user_df = df[df.email == email]

    # The only transition-matrix builder defined in this notebook.
    user_tm = calculate_transition_matrix_fast(user_df, states)

    baseline = np.asarray(transition_matrix, dtype=float)

    # Divide only where the baseline is non-zero; everything else stays 0.
    # This avoids the 0/0 -> NaN and x/0 -> inf warnings of a plain division.
    comparative_matrix = np.divide(
        user_tm, baseline, out=np.zeros_like(user_tm, dtype=float), where=baseline != 0
    )

    # Normalise each row; rows that sum to 0 are left as zeros.
    row_sums = comparative_matrix.sum(axis=1, keepdims=True)
    comparative_matrix = np.divide(
        comparative_matrix,
        row_sums,
        out=np.zeros_like(comparative_matrix),
        where=row_sums != 0,
    )

    return comparative_matrix

def transition_summary(df):
    """Summarise how often each user makes each origin -> destination move.

    ``df`` needs the columns ``email``, ``origin``, ``destination`` and
    ``created``. The result has one row per (email, origin, destination) with:

    * ``user_count`` - the user's count for that transition
    * ``total_user_count`` - the user's count over all transitions
    * ``alluser_transition_count`` - count of that transition over all users
    * ``all_user_count`` - number of rows in ``df``
    * ``user_proportion``, ``alluser_proportion`` - the two ratios above
    * ``relative_proportion`` - user_proportion / alluser_proportion

    Destinations are not filtered here (an earlier login/auth filter is
    disabled).
    """
    pair_keys = ['origin', 'destination']
    user_keys = ['email'] + pair_keys

    # Transition counts per user.
    per_user = (
        df.groupby(user_keys, as_index=False)
        .count()
        .assign(user_count=lambda g: g['created'])
        [user_keys + ['user_count']]
        .dropna()
    )

    # Each user's total number of transitions, indexed by email.
    per_user_totals = (
        per_user.groupby(['email'])['user_count']
        .sum()
        .to_frame('total_user_count')
        .dropna()
    )

    user_level = per_user.set_index('email').join(per_user_totals)

    # Transition counts over the whole population, indexed by the pair.
    population = (
        df.groupby(pair_keys, as_index=False)
        .count()
        .assign(alluser_transition_count=lambda g: g['created'])
        [pair_keys + ['alluser_transition_count']]
        .dropna()
        .set_index(pair_keys)
    )

    summary = (
        user_level.reset_index()
        .set_index(pair_keys)
        .join(population)
        .reset_index()
    )

    summary['all_user_count'] = df.shape[0]
    summary['user_proportion'] = summary['user_count'].div(summary['total_user_count'])
    summary['alluser_proportion'] = summary['alluser_transition_count'].div(summary['all_user_count'])
    summary['relative_proportion'] = summary['user_proportion'].div(summary['alluser_proportion'])

    return summary

def state_summary(df):
    """Summarise how often each user lands in each destination state.

    ``df`` needs the columns ``email``, ``destination`` and ``created``.
    The result has one row per (email, state) with the user's count for the
    state, the user's count over all states, the all-user count and proportion
    for the state, the user's proportion, and ``relative_proportion``
    (user proportion divided by all-user proportion).
    """
    # Population-level counts per destination, indexed by destination.
    population = (
        df.groupby(['destination'])['created']
        .count()
        .to_frame('alluser_state_count')
    )
    population['alluser_state_proportion'] = (
        population['alluser_state_count'] / population['alluser_state_count'].sum()
    )

    # Counts per user and destination.
    per_user = (
        df.groupby(['email', 'destination'])['created']
        .count()
        .to_frame('user_state_count')
        .reset_index()
    )

    # Each user's total over all destinations, indexed by email.
    per_user_totals = (
        per_user.groupby(['email'])['user_state_count']
        .sum()
        .to_frame('user_allstate_count')
    )

    summary = per_user.set_index('email').join(per_user_totals)
    summary = (
        summary.reset_index()
        .set_index('destination')
        .join(population)
        .reset_index()
    )

    summary['user_state_proportion'] = summary['user_state_count'].div(summary['user_allstate_count'])
    summary['relative_proportion'] = summary['user_state_proportion'].div(summary['alluser_state_proportion'])

    # Move `destination` to the end under the name `state`.
    summary['state'] = summary.pop('destination')

    return summary

def user_node_json(email, df):
    """Return the rows of ``df`` belonging to ``email`` as a JSON array.

    The ``email`` column is removed; every remaining column becomes a key of
    the per-row JSON objects.
    """
    belongs_to_user = df['email'] == email
    records = df.loc[belongs_to_user].drop(columns='email').to_dict(orient='records')
    return json.dumps(records)
    
def user_link_json(email, df):
    """Serialise one user's link rows to a JSON array of objects.

    Rows are selected by matching the ``email`` column, which is then dropped
    from the output.
    """
    user_rows = df[df['email'] == email]
    without_email = user_rows.drop(columns=['email'])
    return json.dumps(without_email.to_dict(orient='records'))

In [None]:
# Load the raw event export. Later cells read the `created`, `eventCategory`,
# `eventAction` and `eventLabel` columns from this frame.
d = pd.read_csv('data/events_raw_20180305091800.csv')

In [None]:
# Build a single `email` column from the e-mail-prefixed columns.
# NOTE(review): the exact behaviour of junoutils.combineColumnsByPrefix is not
# visible in this notebook - confirm.
d['email'] = junoutils.combineColumnsByPrefix(df=d, prefixlist=['email'])

# Use .loc for the relabel: the chained form `d[mask]['eventLabel'] = ...`
# assigns into a temporary copy and leaves `d` unchanged.
d.loc[d['eventLabel'] == 'bitcoin', 'eventLabel'] = 'BTC'

# Composite state labels: category+action, category+label, category+label+action.
d['ca'] = d.eventCategory + "_" + d.eventAction
d['cl'] = d.eventCategory + "_" + d.eventLabel
d['cla'] = d.eventCategory + "_" + d.eventLabel + "_" + d.eventAction

# Keep only events with a usable e-mail. The explicit copy means the lag
# columns below are added to an independent frame, not a slice of `d`.
s = d[['created','email','ca','cla','cl']]
s = s[s.email.notna() & (s.email != '')].copy()

# Order events in time so that shift(1) within each user yields the previous state.
s = s.sort_values(by='created', ascending=True)
s['ca_lag_1'] = s.groupby(by='email')['ca'].shift(1)
s['cla_lag_1'] = s.groupby(by='email')['cla'].shift(1)


def _build_transitions(events, state_col):
    """Return created/email/origin/destination rows for one state column.

    `origin` is the user's previous state (`<state_col>_lag_1`) and
    `destination` the current one; rows without a previous state are dropped.
    """
    lag_col = state_col + '_lag_1'
    transitions = events[['created', 'email', lag_col, state_col]].dropna()
    # sort_values returns a new frame, so the result has to be kept.
    transitions = transitions.sort_values(by=['email', 'created'])
    return transitions.rename(columns={lag_col: 'origin', state_col: 'destination'})


cla = _build_transitions(s, 'cla')
ca = _build_transitions(s, 'ca')

cla_transition_summary = transition_summary(cla)
ca_transition_summary = transition_summary(ca)

cla_state_summary = state_summary(cla)
ca_state_summary = state_summary(ca)

In [None]:
import junoutils
# `import sklearn` alone does not load the `ensemble` submodule, so
# `sklearn.ensemble.IsolationForest` can raise AttributeError on a fresh kernel.
import sklearn.ensemble

# Per-user, per-transition metrics that become the outlier-detection features.
metric_cols = ['user_count','user_proportion','relative_proportion']

# .copy() so the new column is added to an independent frame rather than a
# slice of ca_transition_summary (avoids SettingWithCopyWarning).
subset = ca_transition_summary[['email','origin','destination'] + metric_cols].copy()
subset['org_dest'] = subset['origin'] +'_to_'+subset['destination']
subset = subset.drop(['origin','destination'],axis=1)

# Long format: one row per (email, transition, metric) ...
subset = subset.melt(id_vars=['email','org_dest'], value_name='value', value_vars=metric_cols)
subset['variable'] = subset['org_dest'] + '_'+ subset['variable']
subset = subset.drop('org_dest', axis=1)

# ... then wide: one row per email, one column per "<transition>_<metric>",
# with 0 where the user never made that transition.
subset = subset.set_index(['email','variable'])['value'].unstack(fill_value=0).reset_index()

# Rebuild the frame to discard the columns-axis name left behind by unstack().
# NOTE(review): `.values` on mixed str/number columns yields object dtype -
# confirm junoutils.scaleDf copes with that.
subset = pd.DataFrame(columns=list(subset.columns), data=subset.values)

subset_values = subset.drop('email', axis=1)
scaled_values = junoutils.scaleDf(subset_values)

# A fixed random_state makes the flagged users reproducible across
# Restart & Run All; the other hyper-parameters are unchanged.
isft = sklearn.ensemble.IsolationForest(contamination=0.001, max_features=10, n_estimators=200, random_state=42)
isft.fit(scaled_values)

# predict() returns -1 for outliers and 1 for inliers.
preds = isft.predict(scaled_values) == -1
emails = subset[preds].email
emails

In [None]:
# Node table: one row per (user, state); node size is the user's relative
# proportion for that state.
nodes = (
    ca_state_summary[['email','state','relative_proportion']]
    .rename(columns={'state': 'id', 'relative_proportion': 'size'})
)
nodes.to_csv('data/nodes.csv', index=False)

# Link table: one row per (user, origin, destination); link thickness is the
# user's relative proportion for that transition.
links = (
    ca_transition_summary[['email','origin','destination','relative_proportion']]
    .rename(columns={
        'origin': 'source',
        'destination': 'target',
        'relative_proportion': 'thickness',
    })
)
links.to_csv('data/links.csv', index=False)

In [None]:
# Dump one user's node rows as JSON, then strip the quotes around the "id" and
# "size" keys so the text reads as a JavaScript-style object literal rather
# than strict JSON. Replace the placeholder with a real e-mail before running.
# NOTE(review): str.replace also rewrites any *value* that is exactly "id" or "size".
user_node_json('insert user email here', nodes).replace('"id"','id').replace('"size"','size')


In [None]:
# Dump one user's link rows as JSON, then strip the quotes around the
# "source", "target" and "thickness" keys (JavaScript-style object literal,
# not strict JSON). Replace the placeholder with a real e-mail before running.
# NOTE(review): str.replace also rewrites any *value* equal to one of those quoted keys.
user_link_json('insert user email here', links).replace('"source"','source').replace('"target"','target').replace('"thickness"','thickness')

In [None]:
# Axis labels shared by every matrix below: all states seen as origin or destination.
states = sorted(set(ca.origin.unique()) | set(ca.destination.unique()))

# All-user baseline. It is computed here so the cell runs on a fresh kernel;
# it was previously read from a variable that no cell defines.
transition_matrix = calculate_transition_matrix_fast(ca, states)

# Same counts restricted to one user (replace the placeholder e-mail).
# calculate_transition_matrix_fast is the only builder defined in this notebook.
user_transition_matrix = calculate_transition_matrix_fast(ca[ca.email == 'insert user email here'], states)

# Ratio of user to population counts; cells the population never visits stay 0
# instead of producing 0/0 = NaN (and a RuntimeWarning).
comparative_user_matrix = np.divide(
    user_transition_matrix,
    transition_matrix,
    out=np.zeros_like(user_transition_matrix, dtype=float),
    where=transition_matrix != 0,
)
comparative_user_matrix

In [None]:
# Distinct destination states recorded for the chosen user (replace the
# placeholder e-mail before running).
ca[ca.email == 'insert user email here'].destination.unique()

In [None]:
# Locate the largest entry of the comparative matrix. argmax() returns an index
# into the flattened array; unravel_index converts it to (row, col) using the
# real shape (the manual // and % split must use the column count, shape[1]).
row, col = np.unravel_index(comparative_user_matrix.argmax(), comparative_user_matrix.shape)

print(row, col)
print(states[row], " to ", states[col])

In [None]:
def calculate_state_proportions(df, states):
    """Return, for each entry of ``states``, its share of ``df.destination``.

    The result is a numpy array aligned with ``states``: the number of rows
    whose ``destination`` equals the state, divided by the total of those
    counts over all listed states.
    """
    # A boolean mask summed gives the number of matching rows for each state.
    tallies = np.array([(df.destination == label).sum() for label in states])
    return tallies / np.sum(tallies)

# Share of each state among destinations: whole population vs. one user
# (replace the placeholder e-mail before running).
all_props = calculate_state_proportions(ca, states)
user_props = calculate_state_proportions(ca[ca.email == 'insert user email here'], states)

# `states` also contains labels that only ever occur as an origin; for those
# all_props is 0 and a plain division gives 0/0 = NaN. That NaN would propagate
# through np.sum and turn every node size into NaN, so such states get 0.
relative = np.divide(
    user_props,
    all_props,
    out=np.zeros_like(user_props, dtype=float),
    where=all_props > 0,
)
relative = relative/np.sum(relative)

# One node per state; size is the normalised relative share as a percentage.
nodes = pd.DataFrame({'index': np.arange(len(states)), 'name': states, 'size': relative*100})

# Emit the records with unquoted keys (JavaScript-style object literal).
json.dumps(nodes.to_dict('records')).replace('"index"','index').replace('"name"','name').replace('"size"','size')

In [None]:
# One link per positive cell of the comparative matrix, skipping any link that
# points at 'login_auth'. Source and target are positions in `states`;
# thickness is the cell value as a percentage.
links = [
    {'source': i, 'target': j, 'thickness': comparative_user_matrix[i,j]*100}
    for i in range(len(states))
    for j, des in enumerate(states)
    if des != 'login_auth' and comparative_user_matrix[i,j] > 0
]

# Emit the records with unquoted keys (JavaScript-style object literal).
json.dumps(links).replace('"source"','source').replace('"target"','target').replace('"thickness"','thickness')

In [None]:
# Number of distinct e-mail addresses in `s`.
len(s.email.unique())         

In [None]:
import time

# Number of users to time before extrapolating to the whole population.
n = 100

all_users = s.email.unique()
sample_users = all_users[0:n]

# All-user baseline, computed here so the cell does not depend on a variable
# left over in the kernel. It is kept outside the timed region.
transition_matrix = calculate_transition_matrix_fast(ca, states)

# perf_counter is a monotonic clock intended for measuring elapsed time.
now = time.perf_counter()

user_dict = {}

for user in sample_users:

    user_dict[user] = calculate_comparative_user_matrix(df=ca, email=user, states=states, transition_matrix=transition_matrix)


length = time.perf_counter() - now

# Average over the users actually processed (there may be fewer than n) and
# scale up to the full user count.
print("Estimated time to calculate:  ", length/max(len(sample_users), 1)*len(all_users))

In [None]:
def calc_user_matrix(df, email=None):
    """Count origin -> destination transitions in long (tidy) form.

    Parameters
    ----------
    df : pandas.DataFrame
        Transition rows with ``origin``, ``destination`` and ``created``
        columns (plus ``email`` when filtering by user).
    email : str, optional
        When given, only that user's rows are counted and an ``email`` column
        holding this value is appended to the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``origin``, ``destination``, ``count`` (and ``email`` when a
        user was requested); one row per observed transition pair.
    """
    # Identity test: `!= None` would compare element-wise for array-like values.
    filter_by_user = email is not None

    if filter_by_user:
        df = df[df.email == email]

    # One aggregation for both the per-user and the all-user case.
    gb = (
        df.groupby(['origin', 'destination'], as_index=False)['created']
        .count()
        .rename(columns={'created': 'count'})
    )

    if filter_by_user:
        gb['email'] = email

    return gb

In [None]:
# Transition counts for a single account.
# NOTE(review): the address is hard-coded - confirm it is a test account and
# not real user data before sharing the notebook.
calc_user_matrix(ca, 'testing@einstein.exchange')