In [None]:
%load_ext autoreload

In [None]:
%autoreload

import junodb, junoutils, junoplots
import time
import pandas as pd
import datetime

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import plotly.plotly as py
from plotly.tools import FigureFactory as ff

import numpy as np

init_notebook_mode(connected=True)

%matplotlib inline
import networkx as nx  # For the magic
import matplotlib.pyplot as plt
import graphviz
import pydot

from IPython.display import Image, display

def viewPydot(pdot):
    """Render a pydot graph as a PNG and show it inline in the notebook.

    `pdot` must provide `create_png()`; the bytes it returns are wrapped in
    an IPython `Image` and handed to `display`.
    """
    png_bytes = pdot.create_png()
    display(Image(png_bytes))

In [None]:
# read in the stored data
# 'all_events.csv' is resolved relative to the working directory. `edf` is the
# raw events frame that later cells clean (cleanUpEvents) and query by email.
edf = pd.read_csv('all_events.csv')

In [None]:
# Load 'all_users.csv' (relative to the working directory) into `udf`.
# NOTE(review): `udf` is not referenced by any other cell visible in this notebook.
udf = pd.read_csv('all_users.csv')

In [None]:
# Fetch the testing account's events from the last 10 days, run them through
# processEvents, and show the summary as the cell output.
lookback = datetime.timedelta(days=10)
date = datetime.datetime.now() - lookback
user_df = junoutils.processEvents(
    junodb.getUserEvents(email='testing@einstein.exchange', date=date)
)
junoutils.summarizeUserEvents(user_df=user_df)

In [None]:
# clean up the events
# `subsetdf` feeds both the user event summaries and the user-action
# diagrams built further down.
subsetdf = junoutils.cleanUpEvents(edf)

In [None]:
# Build the user event summaries from the cleaned events. The result is
# accessed by key below (e.g. 'summary_df').
events_summary = junoutils.createUserEventSummaries(clean_df=subsetdf)

In [None]:
# Persist the summaries under the name 'events_data.pickle' via junoutils.savePickle.
junoutils.savePickle(events_summary, 'events_data.pickle')

In [None]:
# Pull the 'summary_df' entry out of the summaries; it is the input to the
# simplification and clustering cells that follow.
summary_df = events_summary['summary_df']

In [None]:
# simplify the aggregates to remove columns that don't contain valuable info
# `simp` still carries the 'email' column; it is dropped separately before
# the numeric embedding step.
simp = junoutils.simplifyAgg(df=summary_df)

In [None]:
# Feature matrix for the embedding: every simplified column except 'email'.
simp_data = simp.drop(axis=1, labels=['email'])

# Index labels of rows with +/-inf in any feature. They are removed from both
# frames so `a` (with email) and `b` (features only) stay aligned row-for-row.
# `axis` is passed by keyword because pandas >= 2.0 rejects a positional axis.
dropidx = list(simp_data.index[np.isinf(simp_data).any(axis=1)])
a = simp.drop(dropidx)
b = simp_data.drop(dropidx)

# Embed the cleaned features and keep the first two embedding columns on `a`.
emb = junoutils.calculatetSNEEmbeddings(b)

a['tSNE_x'] = emb[:, 0]
a['tSNE_y'] = emb[:, 1]

In [None]:
# Run junoutils.calculatetKMeans on the cleaned features twice, once with
# pca=True and once with pca=False, and store both results as label columns on `a`.
kmeans_pca = junoutils.calculatetKMeans(df=b, pca=True)
kmeans = junoutils.calculatetKMeans(df=b, pca=False)
a['kmeans_pca_label'] = kmeans_pca
a['kmeans_label'] = kmeans

In [None]:
# Two-component PCA of the cleaned feature values; both components are stored on `a`.
# NOTE(review): PCA is reached through `junoutils.decomposition` - presumably
# scikit-learn's `decomposition` module as imported inside junoutils; confirm.
pca2 = junoutils.decomposition.PCA(n_components=2).fit_transform(b.values)
a['pca_1'] = pca2[:,0]
a['pca_2'] = pca2[:,1]

In [None]:
# Cluster again, this time on the two t-SNE coordinates instead of the full
# feature set (pca=False), and store the labels on `a`.
kmeans_on_tSNE = junoutils.calculatetKMeans(df=a[['tSNE_x','tSNE_y']], pca=False)
a['kmeans_tSNE_label'] = kmeans_on_tSNE

In [None]:
import pickle

# Serialize the labelled frame `a` to 'cluster_summary.pickle' using the
# highest pickle protocol available to this interpreter.
with open('cluster_summary.pickle', 'wb') as out_file:
    pickle.dump(a, out_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# Read back the file written by the previous cell.
# NOTE(review): presumably openPickle unpickles the file - only load pickles
# from trusted sources.
cluster_summary = junoutils.openPickle('cluster_summary.pickle')

# Creating User Action Layouts/Diagrams

In [None]:
# Per-event action columns, sorted by user and then by `created`, so that
# consecutive rows of one user form that user's action sequence.
# NOTE(review): assumes `created` sorts chronologically - confirm its dtype.
actions = subsetdf[['created',
                    'email',
                    'category_label_action',
                    'category_action']].sort_values(by=['email','created'])

# Keep only events that carry an email.
actions = actions[actions.email.notnull()]

# Single-user slice. `.copy()` makes `yah` an independent frame, so adding
# columns to it later does not raise SettingWithCopyWarning.
yah = actions[actions.email == 'put specific user email here'].copy()

In [None]:
# Previous action for each row of the single-user frame (shift by one row).
# `yah` comes from boolean filtering, and assigning columns to such a frame
# raises SettingWithCopyWarning, so work on an explicit copy. Re-running the
# cell is still idempotent.
yah = yah.copy()
yah['cla_lag1'] = yah.category_label_action.shift(1)
yah['ca_lag1'] = yah.category_action.shift(1)

In [None]:
# Directed transitions for the selected user: `source` is the previous
# category_action and `target` the current one. Rows with no previous action
# (e.g. the user's first event) are dropped. The explicit copy lets us add
# columns without SettingWithCopyWarning.
cla = yah[['ca_lag1', 'category_action']].rename(
    columns={'ca_lag1': 'source', 'category_action': 'target'})
cla = cla[cla.source.notnull()].copy()

print(cla.head())

# Alphabetically ordered endpoints of every transition, so that A->B and
# B->A share one undirected key. Keeping the endpoints as separate Series
# avoids re-splitting the joined key on ' ', which breaks when a label
# itself contains a space.
source_first = cla.source <= cla.target
pair_lo = cla.source.where(source_first, cla.target)
pair_hi = cla.target.where(source_first, cla.source)

cla['unordered_pair'] = pair_lo + " " + pair_hi
cla['ordered_pair'] = cla.source + " " + cla.target

# One row per undirected pair with the number of transitions between the two
# actions in either direction. Self-transitions are excluded.
undirected = pd.DataFrame({'pair': cla['unordered_pair'],
                           'source': pair_lo,
                           'target': pair_hi})
undirected = undirected[(cla.source != cla.target).values]
pairs = (undirected.groupby(['pair', 'source', 'target'])
                   .size()
                   .reset_index(name='count')
                   .reindex(columns=['pair', 'count', 'source', 'target']))

pairs['link_color'] = "rgba(0,0,96,0.2)"
pairs['node_color'] = "rgba(31, 119, 180, 0.8)"

pairs = pairs.sort_values(by=['source', 'target']).reset_index(drop=True)

# Integer id per node label, in sorted label order. The Sankey trace refers
# to nodes by their position in `node_labels`.
# Note: `source`/`target` in `pairs` are the alphabetically ordered endpoints,
# not the direction of travel.
node_labels = sorted(set(pairs.source) | set(pairs.target))
mapper = {label: position for position, label in enumerate(node_labels)}

pairs['src_idx'] = pairs.source.map(mapper)
pairs['target_idx'] = pairs.target.map(mapper)
pairs

In [None]:
# Sankey trace: nodes are the action labels, links are the rows of `pairs`
# (endpoints given as positions in `node_labels`, width given by `count`).
data_trace = go.Sankey(
    domain={'x': [0, 1], 'y': [0, 1]},
    orientation="h",
    valueformat=".0f",
    valuesuffix="",
    node={
        'pad': 15,
        'thickness': 15,
        'line': {'color': "black", 'width': 0.5},
        'label': node_labels,
    },
    link={
        'source': list(pairs['src_idx'].values),
        'target': list(pairs['target_idx'].values),
        'value': list(pairs['count'].values),
    },
)

layout = go.Layout(title="Consumer Events", font={'size': 10})

# validate=False is kept from the original call: the figure is rendered
# without plotly's property validation.
fig = go.Figure(data=[data_trace], layout=layout)
iplot(fig, validate=False)

In [None]:
# Directed transition graph over category_action for the selected user: one
# edge per observed (source -> target) pair, weighted and labelled with the
# number of times that transition occurred.
G = nx.MultiDiGraph()
edge_labels = {}
states = sorted(set(cla.source.unique()) | set(cla.target.unique()))

# A single groupby replaces re-filtering `cla` for every state pair. Groups
# come back sorted by (source, target), and only observed pairs are present,
# so no `count > 0` check is needed.
transition_counts = cla.groupby(['source', 'target']).size()
for (origin_state, destination_state), n_transitions in transition_counts.items():
    rate = int(n_transitions)
    edge_text = "{:.02f}".format(rate)
    G.add_edge(origin_state,
               destination_state,
               weight=rate,
               label=edge_text)
    edge_labels[(origin_state, destination_state)] = edge_text

dot = nx.nx_pydot.to_pydot(G)
viewPydot(dot)

In [None]:
# Same transition graph, built on the finer-grained category_label_action.
# rename() returns a new frame, so no columns are assigned onto a slice of `yah`.
cla1 = yah[['cla_lag1', 'category_label_action']].rename(
    columns={'cla_lag1': 'source', 'category_label_action': 'target'})
# Rows with no previous action (e.g. the user's first event) are not transitions.
cla1 = cla1[cla1.source.notnull()]


G = nx.MultiDiGraph()
edge_labels = {}
states = sorted(set(cla1.source.unique()) | set(cla1.target.unique()))

# A single groupby replaces re-filtering `cla1` for every state pair. Groups
# come back sorted by (source, target), and only observed pairs are present.
transition_counts = cla1.groupby(['source', 'target']).size()
for (origin_state, destination_state), n_transitions in transition_counts.items():
    rate = int(n_transitions)
    edge_text = "{:.02f}".format(rate)
    G.add_edge(origin_state,
               destination_state,
               weight=rate,
               label=edge_text)
    edge_labels[(origin_state, destination_state)] = edge_text

dot = nx.nx_pydot.to_pydot(G)
viewPydot(dot)

In [None]:
# Distinct category_label_action values among events that have no email.
subsetdf.loc[subsetdf.email.isnull(), 'category_label_action'].unique()

In [None]:
# Print the name of every cleaned-events column that contains 'url'.
for col in (name for name in subsetdf.columns if 'url' in name):
    print(col)

In [None]:
# Load a previously saved summary from the data/ directory.
# NOTE(review): the file name looks like a timestamped snapshot (2018-02-21),
# so this section runs on stored data rather than on `events_summary` built above.
summary = junoutils.openPickle('data/summary_data_20180221134012.pickle')

In [None]:
# Take the 'summary_df' entry of the loaded summary and simplify it in one step.
sdf = junoutils.simplifyAgg(summary['summary_df'])

In [None]:
# Numeric features only: drop the identifier column and 'value_std'.
sdf_data = sdf.drop(labels=['email','value_std'], axis=1)

# Index labels of rows with +/-inf in any feature. They are removed from both
# the full frame and the feature frame so the two stay aligned row-for-row.
# `axis` is passed by keyword because pandas >= 2.0 rejects a positional axis.
dropidx = list(sdf_data.index[np.isinf(sdf_data).any(axis=1)])
sdf_clean = sdf.drop(dropidx)
sdf_data_clean = sdf_data.drop(dropidx)

In [None]:
# Embed the *cleaned* feature rows. The result is assigned onto `sdf_clean`,
# so it must have one row per row of `sdf_clean`. Embedding the unfiltered
# `sdf_data` would feed inf values into t-SNE and produce a length mismatch
# whenever any row was dropped.
tSNE = junoutils.calculatetSNEEmbeddings(df=sdf_data_clean, pca=False)
sdf_clean['tSNE_x'] = tSNE[:, 0]
sdf_clean['tSNE_y'] = tSNE[:, 1]

In [None]:
# Save the cleaned summary frame, now carrying the t-SNE coordinates, under
# the name 'summary_withemb.pickle'.
junoutils.savePickle(event_dict=sdf_clean, name='summary_withemb.pickle')

In [None]:
# Number of events flagged `metadata.fraudulent` per user email.
# The counts come straight from `edf` with one groupby, so this cell does not
# depend on a separately defined list of users and does not rescan the whole
# events frame once per user. Events without an email are ignored by groupby.
# NOTE(review): if only a subset of users should be scanned, filter
# `flagged_events` by email here.
flagged_events = edf[edf['metadata.fraudulent'] == True]

# (email, count) for every user with at least one flagged event. This list is
# read by the cell that fills `fraud_count` on the summary frame.
fraud_counts = [(email, int(n))
                for email, n in flagged_events.groupby('email').size().items()]

for email, n in fraud_counts:
    print(email, n)

In [None]:
# Column 0 of the CSV holds either one email per row or several emails
# separated by commas inside a single field.
fraudsters = pd.read_csv('data/suspected_fraudsters.csv', header=None)
fraudsters['email'] = fraudsters[0]
fraudsters = fraudsters[['email']]

emails = []

for entry in fraudsters.email.dropna():
    # Apply the same cleanup (strip newlines and spaces) to single-address and
    # multi-address entries, and skip empty fragments such as those left by a
    # trailing comma.
    for piece in str(entry).split(','):
        piece = piece.replace('\n','').replace(' ','')
        if piece:
            emails.append(piece)

fraudsters = pd.DataFrame({'email': emails})

emails = fraudsters.email.unique()

sdf_clean['suspected_fraud'] = False
sdf_clean['fraud_count'] = 0

# Use .loc[mask, column] rather than df[column][mask] = value: the chained
# form assigns through an intermediate object and is not guaranteed to
# update `sdf_clean`.
sdf_clean.loc[sdf_clean.email.isin(emails), 'suspected_fraud'] = True

# NOTE(review): `fraud_counts` is not created in this cell. It must already
# exist as an iterable of (email, count) pairs.
for f in fraud_counts:
    sdf_clean.loc[sdf_clean.email == f[0], 'fraud_count'] = f[1]

# Any user with flagged events counts as suspected, whether or not listed in the CSV.
sdf_clean.loc[sdf_clean['fraud_count'] > 0, 'suspected_fraud'] = True

sdf_clean[sdf_clean['suspected_fraud'] == True]

In [None]:
# Save again now that `sdf_clean` carries the suspected_fraud / fraud_count
# columns. This uses the same name ('summary_withemb.pickle') as the earlier save.
junoutils.savePickle(event_dict=sdf_clean, name='summary_withemb.pickle')

In [None]:
# Suspected users only, reduced to the identifying columns and ordered from
# most to fewest flagged events; written to 'fraudsters.csv' (index included).
fraudsters = (
    sdf_clean.loc[sdf_clean.suspected_fraud.eq(True),
                  ['email', 'suspected_fraud', 'fraud_count']]
    .sort_values(by='fraud_count', ascending=False)
    .reset_index(drop=True)
)
fraudsters.to_csv('fraudsters.csv')

In [None]:
# Every distinct, non-null email in the cleaned summary frame. This is the
# pool that similarityWithAllUsers compares against.
distinct_emails = sdf_clean['email'].dropna().unique()
users = list(distinct_emails)
users

In [None]:
def similarityWithAllUsers(email, threshold=0.9):
    """Return the user emails that look like aliases of `email`.

    Every entry of the notebook-level `users` list is scored against `email`
    with `junoutils.similarityRatio`. Entries scoring at least `threshold`
    are returned, most similar first.

    `email` itself is excluded by value rather than by dropping the top match,
    so a genuine alias is not lost when `email` is absent from `users`.

    Parameters
    ----------
    email : str
        Address to look up aliases for.
    threshold : float, default 0.9
        Minimum similarity ratio for an entry to count as an alias.

    Returns
    -------
    list
        Matching entries of `users` in descending similarity order; empty
        when there are none.
    """
    scores = np.array([junoutils.similarityRatio(email, user) for user in users])
    order = scores.argsort()[::-1]
    ranked_users = np.array(users)[order]
    ranked_scores = scores[order]

    return [user
            for user, score in zip(ranked_users, ranked_scores)
            if score >= threshold and user != email]
    

In [None]:
# For each suspected user, store the list of look-alike emails found among all users.
fraudsters['alias_emails'] = fraudsters.email.apply(similarityWithAllUsers)

In [None]:
# Flatten the look-alike emails of all suspected users into one table. Each
# look-alike appears once, attributed to the first suspected user it was
# found from.
emails = []
found_from = []
seen = set()  # O(1) membership test instead of scanning `emails` each time

for suspect in fraudsters.email:
    for alias in similarityWithAllUsers(suspect):
        if alias in seen:
            continue
        seen.add(alias)
        found_from.append(suspect)
        emails.append(alias)

expanded_list = pd.DataFrame({'email': emails, 'similar_to': found_from})

In [None]:
# Write the look-alike table to 'similar_emails.html' in the working directory.
expanded_list.to_html('similar_emails.html')

In [None]:
# Show the look-alike table as the cell output.
expanded_list