In [86]:
import random
import re
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm, trange
from gensim.models.word2vec import Word2Vec
import numpy as np

import networkx as nx
from networkx.algorithms.components.connected import connected_components

from stellargraph import StellarGraph
from pyvis.network import Network

from IPython.core.display import display, HTML

import joblib
from sklearn import cluster
from sklearn import metrics

import copy
import time
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models.callbacks import CallbackAny2Vec

import stellargraph as sg
import multiprocessing

from sklearn.metrics import roc_auc_score
from sklearn.tree import _tree

from sklearn.metrics import confusion_matrix
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn import cluster
from sklearn import metrics

import os
import joblib
import pacmap

In [87]:

def getRandomWalk(graph,node,length_of_random_walk):
    """ This function takes NetworkX Graph and a Node and generate random walk for a given length 
    
    Returns the random walk (list of nodes traversed)

    Note: The same node may occcur more than once in a Random Walk.
    """
    start_node=node
    current_node=start_node
    random_walk=[node]
    for i in range(0,length_of_random_walk):
        ## Choose a random neighbour of the current node
        
        current_node_neighbours=list(graph.neighbors(current_node))
        chosen_node=random.choice(current_node_neighbours)
        current_node=chosen_node
        random_walk.append(current_node)
    return random_walk



def get_rules(tree, feature_names, categorical_names, class_names):
    '''Function to read the decitions tree and convert it into readbale format '''

    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    paths = []
    path = []
    
    def recurse(node, path, paths):
        
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            p1, p2 = list(path), list(path)
            raw_val = categorical_names[name][int(np.floor(threshold))]
            p1 += [f"({name} <= {raw_val})"]
            # p1 += [f"({name} <= {np.round(threshold, 3)})"]
            recurse(tree_.children_left[node], p1, paths)
            raw_val = categorical_names[name][int(np.floor(threshold))]
            p2 += [f"({name} > {raw_val})"]
            # p2 += [f"({name} > {np.round(threshold, 3)})"]
            recurse(tree_.children_right[node], p2, paths)
        else:
            path += [(tree_.value[node], tree_.n_node_samples[node])]
            paths += [path]
            
    recurse(0, path, paths)

    # sort by samples count
    samples_count = [p[-1][1] for p in paths]
    ii = list(np.argsort(samples_count))
    paths = [paths[i] for i in reversed(ii)]
    
    rules = []
    for path in paths:
        rule = "if "
        
        for p in path[:-1]:
            if rule != "if ":
                rule += " and "
            rule += str(p)
        rule += " then "
        if class_names is None:
            rule += "response: "+str(np.round(path[-1][0][0][0],3))
        else:
            classes = path[-1][0][0]
            l = np.argmax(classes)
            rule += f"class: {class_names[l]} (proba: {np.round(100.0*classes[l]/np.sum(classes),2)}%)"
        rule += f" | based on {path[-1][1]:,} samples"
        rules += [rule]
        
    return rules


# Using word2vec to convert the random walks into embeddings
class callback(CallbackAny2Vec):
    '''Callback for Word2vec with resetting loss on the end of each epoch.'''

    def __init__(self):
        self.epoch = 1
        self.losses = []
        self.cumu_loss = 0.0
        self.previous_epoch_time = time.time()

        self.best_model = None
        self.best_loss = 1e+30

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()

        norms = [np.linalg.norm(v) for v in model.wv.vectors]
        now = time.time()
        epoch_seconds = now - self.previous_epoch_time
        self.previous_epoch_time = now
        self.cumu_loss += float(loss)
        print(f"Loss after epoch {self.epoch}: {loss} (cumulative loss so far: {self.cumu_loss}) "+\
              f"-> epoch took {round(epoch_seconds, 2)} s - vector norms min/avg/max: "+\
              f"{round(float(min(norms)), 2)}, {round(float(sum(norms)/len(norms)), 2)}, {round(float(max(norms)), 2)}")
        self.epoch += 1

        self.losses.append(float(loss))

        # reset loss inside model
        model.running_training_loss = 0.0

        if loss < self.best_loss:
            self.best_model = copy.deepcopy(model)
            self.best_loss = loss

        if self.epoch % 10 == 0:
            self.plot(path="w2v_training_loss.png")

    def plot(self, path):
        fig, (ax1) = plt.subplots(ncols=1, figsize=(6, 6))
        ax1.plot(self.losses, label="loss per epoch")
        plt.legend()
        plt.savefig(path)
        plt.show()
        plt.close()
        print("Plotted loss.")        

# Get data

In [88]:
dataset_name = 'audience_imputed.csv'

In [89]:
# Load data
df = pd.read_csv(dataset_name) #, nrows=705)
df = df[df['Social network'] == 'Instagram']
print(df.shape)
# df = df[669:]
df = df[df.columns[0:57]]
df = df.reset_index(drop=True)
df.head()

(20651, 215)



Columns (31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,58,79,83,89,91,93,95,97,99,101,103,105,153) have mixed types.Specify dtype option on import or set low_memory=False.



Unnamed: 0,Social network,Username,Name,Country,City,Birthdate,Gender,Followers,Following,Posts,...,Profile interest 17,Profile interest 18,Profile interest 19,Profile interest 20,Profile interest 21,Profile interest 22,Profile interest 23,Profile interest 24,Profile interest 25,Audience credibility
0,Instagram,usher,Usher,United States,Los Angeles,,,9350055,1066,468,...,,,,,,,,,,0.754
1,Instagram,konstantinbaum_mw,Master of Wine🍷,Germany,,,Male,19722,6032,760,...,,,,,,,,,,0.8403
2,Instagram,eduardo.madeira8,Eduardo Madeira,Portugal,,25/02/72,Male,420384,1001,3719,...,,,,,,,,,,0.7831
3,Instagram,nunomarkl,Nuno Markl,Portugal,Lisboa,21/07/71,Male,799478,2353,11490,...,,,,,,,,,,0.884788
4,Instagram,paulmccartney,Paul McCartney,,,,Male,3626855,21,807,...,,,,,,,,,,0.7875


# Define data elements

In [90]:
numerical_columns = df._get_numeric_data().columns.values.tolist()
print(numerical_columns)
entity_column = 'Username'
target_column= 'Audience credibility'
datetime_columns = ''

['Followers', 'Following', 'Posts', 'Engagement', 'Estimated reach', 'Estimated impressions', 'Avg. posts per week', 'Avg. posts per month', 'Avg. likes per post', 'Avg. engagement per post', 'Avg. comments per post', 'Avg. views per video', 'Cost per post (MIN)', 'Cost per post (MAX)', 'Post CPM (MIN)', 'Post CPM (MAX)', 'Post CPE (MIN)', 'Post CPE (MAX)', 'Cost per story (MIN)', 'Cost per story (MAX)', 'Story CPM (MIN)', 'Story CPM (MAX)', 'Story CPE (MIN)', 'Story CPE (MAX)', 'Profile interest 24', 'Profile interest 25', 'Audience credibility']


In [91]:
if entity_column == '':
    df['entity_column']= df.index.tolist()
    entity_column = 'entity_column' 
    
if datetime_columns == '':
    df['datetime_columns']= df.index.tolist()    
    datetime_columns = 'datetime_columns'
    
df_new = df.copy()

In [92]:
categorical_columns = list(set(df.columns.values.tolist()) - set(numerical_columns))
categorical_columns[0:3]

['Profile interest 18', 'Profile interest 12', 'Profile interest 16']

# Create edge and node data

In [94]:
#Handel Regression
if df[target_column].nunique() > 50:
    df[target_column] = df[target_column].fillna(0).astype(float)
    df[target_column] = pd.cut(df[target_column],10,duplicates='drop')
    if target_column in categorical_columns:
        categorical_columns.remove(target_column)
    if target_column in numerical_columns:
        numerical_columns.remove(target_column)


In [95]:
    
df[numerical_columns] = df[numerical_columns].astype(float)
# Replace nans with Unknown
df[numerical_columns] = df[numerical_columns].fillna(0)
df[categorical_columns] = df[categorical_columns].fillna('Unknown')

    
if target_column in numerical_columns:
    numerical_columns.remove(target_column)
    
# Convert ID column and target column into strings 
# df[entity_column] = df[entity_column].apply(lambda x: 'entity_id_'+str(x)) # For entity nodes
df[entity_column] = 'entity_id_' + df[entity_column].astype(str) + '__' + df[datetime_columns].astype(str)
df[target_column] = df[target_column].apply(lambda x: 'label_'+str(x))  # For label nodes

# Converting numerical columns into categorical by performing
# 1. Binning the numerical columns into quartiles
# 2. Concat the values with column name
for column_name in numerical_columns:
    df[column_name] = pd.cut(df[column_name],10,duplicates='drop')
#     df[column_name] = df[column_name].fillna('Unknown')
    df[column_name] = column_name + '_' + df[column_name].astype(str)

for column_name in categorical_columns:
    if entity_column in column_name:
        pass
    else:
        df[column_name] = column_name + '_' + df[column_name].astype(str)

# Remove some columns
# if exclude_columns:
#     df = df.drop(columns=exclude_columns)    


def get_node_data():
    """Create Node data for the Graph using Categorical columns."""

    numeric_df = df[numerical_columns]
    numeric_df.index = df[entity_column]


    all_columns = df.columns.tolist()
    all_columns.remove(entity_column)


    # Setting default value to the nodes attributes, this iwll change in future
    numeric_df[:] = 100

    for col in all_columns:
        df[col] = df[col].replace(to_replace ="Unknown",value =f'{col}_Unknown')

    cat_features = pd.DataFrame(columns=numerical_columns)
    for col in all_columns:
        for cat_val in df[col].unique():
            cat_features.loc[str(cat_val)] = 100
            # cat_features.loc[cat_val] = df[df[col] == cat_val].drop(columns=[entity_column,col]).mean().round(2)[numerical_columns]

    node_data = numeric_df.append(cat_features)
    # TODO: This is hacky way to remove duplicate nodes
    node_data = node_data[~node_data.index.duplicated(keep='first')]
    return node_data

node_data = get_node_data()

def get_edge_data():
    """ Create edge list for the Graph using categorical columns """

    edges_data_all_cat = pd.DataFrame(columns = ['source','target'])

    all_columns = df.columns.tolist()
    all_columns.remove(entity_column)

    for col in all_columns:
        categorical_df = df[[entity_column,col]]
        categorical_df = categorical_df.rename(columns={entity_column: 'source', col: 'target'})
        edges_data_all_cat = pd.concat([edges_data_all_cat,categorical_df])
        edges_data_all_cat = edges_data_all_cat.reset_index(drop=True)

    edges_data_all_cat = edges_data_all_cat.astype('str')
    return edges_data_all_cat

edges_data = get_edge_data()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [96]:
df.head()

Unnamed: 0,Social network,Username,Name,Country,City,Birthdate,Gender,Followers,Following,Posts,...,Profile interest 18,Profile interest 19,Profile interest 20,Profile interest 21,Profile interest 22,Profile interest 23,Profile interest 24,Profile interest 25,Audience credibility,datetime_columns
0,Social network_Instagram,entity_id_usher__0,Name_Usher,Country_United States,City_Los Angeles,Birthdate_Unknown,Gender_Unknown,"Followers_(-406822.054, 40682205.4]","Following_(756.1, 1512.2]","Posts_(-22.98, 2298.0]",...,Profile interest 18_Unknown,Profile interest 19_Unknown,Profile interest 20_Unknown,Profile interest 21_Unknown,Profile interest 22_Unknown,Profile interest 23_Unknown,"Profile interest 24_(-0.0002, 0.0]","Profile interest 25_(-0.0002, 0.0]","label_(0.701, 0.788]",datetime_columns_0
1,Social network_Instagram,entity_id_konstantinbaum_mw__1,Name_Master of Wine🍷,Country_Germany,City_Unknown,Birthdate_Unknown,Gender_Male,"Followers_(-406822.054, 40682205.4]","Following_(5292.7, 6048.8]","Posts_(-22.98, 2298.0]",...,Profile interest 18_Unknown,Profile interest 19_Unknown,Profile interest 20_Unknown,Profile interest 21_Unknown,Profile interest 22_Unknown,Profile interest 23_Unknown,"Profile interest 24_(-0.0002, 0.0]","Profile interest 25_(-0.0002, 0.0]","label_(0.788, 0.874]",datetime_columns_1
2,Social network_Instagram,entity_id_eduardo.madeira8__2,Name_Eduardo Madeira,Country_Portugal,City_Unknown,Birthdate_25/02/72,Gender_Male,"Followers_(-406822.054, 40682205.4]","Following_(756.1, 1512.2]","Posts_(2298.0, 4596.0]",...,Profile interest 18_Unknown,Profile interest 19_Unknown,Profile interest 20_Unknown,Profile interest 21_Unknown,Profile interest 22_Unknown,Profile interest 23_Unknown,"Profile interest 24_(-0.0002, 0.0]","Profile interest 25_(-0.0002, 0.0]","label_(0.701, 0.788]",datetime_columns_2
3,Social network_Instagram,entity_id_nunomarkl__3,Name_Nuno Markl,Country_Portugal,City_Lisboa,Birthdate_21/07/71,Gender_Male,"Followers_(-406822.054, 40682205.4]","Following_(2268.3, 3024.4]","Posts_(9192.0, 11490.0]",...,Profile interest 18_Unknown,Profile interest 19_Unknown,Profile interest 20_Unknown,Profile interest 21_Unknown,Profile interest 22_Unknown,Profile interest 23_Unknown,"Profile interest 24_(-0.0002, 0.0]","Profile interest 25_(-0.0002, 0.0]","label_(0.874, 0.961]",datetime_columns_3
4,Social network_Instagram,entity_id_paulmccartney__4,Name_Paul McCartney,Country_Unknown,City_Unknown,Birthdate_Unknown,Gender_Male,"Followers_(-406822.054, 40682205.4]","Following_(-7.561, 756.1]","Posts_(-22.98, 2298.0]",...,Profile interest 18_Unknown,Profile interest 19_Unknown,Profile interest 20_Unknown,Profile interest 21_Unknown,Profile interest 22_Unknown,Profile interest 23_Unknown,"Profile interest 24_(-0.0002, 0.0]","Profile interest 25_(-0.0002, 0.0]","label_(0.701, 0.788]",datetime_columns_4


In [97]:
# df.to_csv('fraud_graph_processed_data.csv',index=False)

In [98]:
edges_data.head()

Unnamed: 0,source,target
0,entity_id_usher__0,Social network_Instagram
1,entity_id_konstantinbaum_mw__1,Social network_Instagram
2,entity_id_eduardo.madeira8__2,Social network_Instagram
3,entity_id_nunomarkl__3,Social network_Instagram
4,entity_id_paulmccartney__4,Social network_Instagram


In [99]:
node_data.head()

Unnamed: 0,Followers,Following,Posts,Engagement,Estimated reach,Estimated impressions,Avg. posts per week,Avg. posts per month,Avg. likes per post,Avg. engagement per post,...,Post CPE (MIN),Post CPE (MAX),Cost per story (MIN),Cost per story (MAX),Story CPM (MIN),Story CPM (MAX),Story CPE (MIN),Story CPE (MAX),Profile interest 24,Profile interest 25
entity_id_usher__0,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
entity_id_konstantinbaum_mw__1,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
entity_id_eduardo.madeira8__2,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
entity_id_nunomarkl__3,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
entity_id_paulmccartney__4,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100


# Generate Graph Embeddings

In [100]:
#%%time
G = sg.StellarGraph(node_data, edges_data)


In [101]:
print(G.info())

StellarGraph: Undirected multigraph
 Nodes: 63033, Edges: 1177107

 Node types:
  default: [63033]
    Features: float32 vector, length 26
    Edge types: default-default->default

 Edge types:
    default-default->default: [1177107]
        Weights: all 1 (default)
        Features: none


In [102]:
import multiprocessing
from joblib import Parallel, delayed

In [103]:
%%time

# Random walk

random_walks=[]
num_sampling=3
length_of_random_walk=80
retrain = True
experiment = 'socialtalk_onimputed'

if not os.path.isfile(f'{dataset_name}_{experiment}.pkl'):
    random_walks = Parallel(n_jobs=8,backend="multiprocessing")(delayed(getRandomWalk)(G,node,length_of_random_walk) for node in tqdm(G.nodes(),desc="Randomwalk Progress") for i in range(0,num_sampling))
#     for node in tqdm(G.nodes(),desc="Iterating Nodes"):
#         for i in range(0,num_sampling):
#             random_walks.append(getRandomWalk(G,node,length_of_random_walk))

    joblib.dump(random_walks, f'{dataset_name}_{experiment}.pkl')        
else:
    if retrain:
        random_walks = Parallel(n_jobs=8,backend="multiprocessing")(delayed(getRandomWalk)(G,node,length_of_random_walk) for node in tqdm(G.nodes(),desc="Randomwalk Progress") for i in range(0,num_sampling))
#         for node in tqdm(G.nodes(),desc="Iterating Nodes"):
#             for i in range(0,num_sampling):
#                 random_walks.append(getRandomWalk(G,node,length_of_random_walk))

        joblib.dump(random_walks, f'{dataset_name}_{experiment}.pkl')        
    else:
        random_walks=joblib.load(f'{dataset_name}_{experiment}.pkl')        

Randomwalk Progress:   0%|          | 0/63033 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



CPU times: user 53min 55s, sys: 1min 58s, total: 55min 53s
Wall time: 1h 40min 22s


In [104]:
%%time
EMBEDDING_DIM = 128
EPOCH = 1
N_GRAMS = 7

if not os.path.isfile(f'{dataset_name}_{experiment}.pkl'): 
    deepwalk_model = Word2Vec(sentences=random_walks, epochs= EPOCH, vector_size = EMBEDDING_DIM, workers=multiprocessing.cpu_count(), \
                        negative=6,min_count=1,sg=1, window=N_GRAMS, compute_loss=True, callbacks=[callback()])

    joblib.dump(deepwalk_model, f'{dataset_name}_{experiment}.pkl')
else:
    if retrain:
        deepwalk_model = Word2Vec(sentences=random_walks, epochs= EPOCH, vector_size = EMBEDDING_DIM, workers=multiprocessing.cpu_count(), \
                            negative=6,min_count=1,sg=1, window=N_GRAMS, compute_loss=True, callbacks=[callback()])

        joblib.dump(deepwalk_model, f'{dataset_name}_{experiment}.pkl')
    else:
        deepwalk_model = joblib.load(f'{dataset_name}_{experiment}.pkl')


Loss after epoch 1: 23599944.0 (cumulative loss so far: 23599944.0) -> epoch took 34.04 s - vector norms min/avg/max: 0.05, 1.25, 6.25
CPU times: user 3min 58s, sys: 20.1 ms, total: 3min 58s
Wall time: 35.1 s


In [105]:

# deepwalk_model=joblib.load(f'audience.csv_socialtalk_101.pkl')        

# Entity Community

In [106]:
# Get the embedding for each unique entity id
node_embedding = []
node_id_lookup = []
label_per_node = []
for index, row in df[[entity_column,target_column]].iterrows():
    if row[entity_column] in deepwalk_model.wv.index_to_key:
        node_id_lookup.append(row[entity_column])
        node_embedding.append(deepwalk_model.wv.get_vector(row[entity_column]))
        label_per_node.append(index)


In [107]:
node_embedding_df = pd.DataFrame(node_embedding,index=node_id_lookup)
node_embedding_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
entity_id_usher__0,0.051704,-0.441702,0.223179,-0.046023,0.096403,-0.069842,-0.00748,0.028501,-0.050415,-0.039778,...,0.070962,0.103695,-0.076198,-0.269681,-0.087614,0.082055,-0.097652,-0.000143,-0.139401,0.068005
entity_id_konstantinbaum_mw__1,0.067526,-0.401011,0.274923,-0.065153,0.046456,-0.005791,-0.021575,0.043332,-0.065683,-0.165094,...,0.021496,0.085596,-0.112072,-0.226378,-0.012699,0.159983,-0.133412,-0.01672,-0.126029,0.029561
entity_id_eduardo.madeira8__2,-0.108507,-0.379061,0.119555,0.066008,0.168304,-0.120885,0.105425,-0.122328,-0.157518,0.159991,...,0.028178,0.151696,-0.171078,-0.262288,-0.230272,-0.069914,-0.062065,0.07475,-0.168929,0.061306
entity_id_nunomarkl__3,-0.084522,-0.366864,0.073166,0.093429,0.189576,-0.01483,-0.002777,-0.177936,-0.171409,0.156992,...,0.093533,0.148324,-0.163257,-0.297181,-0.191129,-0.11694,-0.046851,0.090313,-0.099341,0.085255
entity_id_paulmccartney__4,0.039161,-0.412669,0.299866,-0.083511,0.105775,-0.104666,-0.086239,0.00921,-0.090153,-0.16363,...,0.019319,0.085932,-0.128984,-0.234475,0.011872,0.221023,-0.169408,0.059226,-0.114163,0.052051


In [108]:
node_embedding_df.to_csv(f'user_embeddings_{experiment}.csv')

In [109]:
dim_reduction = pacmap.PaCMAP(n_dims=3, n_neighbors=10, MN_ratio=0.5, FP_ratio=2.0,lr=0.05,num_iters=1000) 
# dim_reduction = pacmap.PaCMAP(n_dims=3, n_neighbors=None,lr=0.05,num_iters=450) 
transformed_2d = dim_reduction.fit_transform(node_embedding_df, init="pca")

coordinates_per_entity = pd.DataFrame(transformed_2d,columns=['x','y','z'], index = node_id_lookup)
coordinates_per_entity.head()

Unnamed: 0,x,y,z
entity_id_usher__0,9.996381,-8.413579,-0.143479
entity_id_konstantinbaum_mw__1,1.164275,-0.589899,0.971701
entity_id_eduardo.madeira8__2,6.720194,5.728224,-1.62641
entity_id_nunomarkl__3,7.061954,6.387723,-1.476747
entity_id_paulmccartney__4,-2.047309,-5.04505,3.182101


In [110]:
import plotly.express as px

In [111]:
px.scatter_3d(coordinates_per_entity, x='x', y='y', z='z', color = df['Audience credibility'],title="Embeddings",
             hover_name=df[entity_column]) 




# Lets look at the all Audience credibility users ⬆️ 

In [114]:
users= '''nunomarkl
vanesssamartins
acatmoreira
dulceida
mariapombo
dulceida
martabaceiredo
dulceida
_raquelsampaio_
saraetdsousa19
jessicavferreirasilva
lacadenasaludable'''.split()

similarvectorforuser = {}
for u in users:
    try:
        user = node_embedding_df.filter(like=u, axis=0) 
        print(f' For user {user.index[0]}')
        v = user.values
        similarvectorforuser[u] = deepwalk_model.wv.similar_by_vector(v[0], topn=20)
    except:
        print(f'No record for {user.index[0]}')
        
        
        

 For user entity_id_nunomarkl__3
 For user entity_id_vanesssamartins__13
 For user entity_id_acatmoreira__42
 For user entity_id_dulceida__46
 For user entity_id_mariapombo__47
 For user entity_id_dulceida__46
 For user entity_id_martabaceiredo__429
 For user entity_id_dulceida__46
 For user entity_id__raquelsampaio___96
 For user entity_id_saraetdsousa19__155
 For user entity_id_jessicavferreirasilva__213
 For user entity_id_lacadenasaludable__483


In [115]:
similarvectorforuser


{'nunomarkl': [('entity_id_nunomarkl__3', 0.9999999403953552),
  ('entity_id_sousacines__654', 0.9742760062217712),
  ('entity_id_miguelfersou__484', 0.973259687423706),
  ('entity_id_vanesssamartins__13', 0.970992386341095),
  ('entity_id_cucinaperte__370', 0.9702286124229431),
  ('entity_id_ruimoura_temple__191', 0.9702126383781433),
  ('entity_id_rifasricardo__206', 0.9696375727653503),
  ('entity_id_goncalocroque__227', 0.9689686894416809),
  ('entity_id_selmarosas___157', 0.9679760336875916),
  ('entity_id_ednabarross__638', 0.9679301977157593),
  ('entity_id_marisel.andriasevich__583', 0.9678745269775391),
  ('entity_id_rita_gdiniz__109', 0.9673539996147156),
  ('entity_id_catarinafurtadooficial__258', 0.9665971994400024),
  ('entity_id_ivocabaco__210', 0.9665578007698059),
  ('entity_id_katebyednenko__141', 0.9659880995750427),
  ('entity_id_carolgaillard__437', 0.9658284187316895),
  ('entity_id_nachurod__470', 0.9656393527984619),
  ('entity_id_maria_cerqueira_gomes__260', 0.9

# Appendix 

# Louvain based community

In [None]:
from cdlib import algorithms
import networkx as nx
G1 = G.to_networkx()

coms = algorithms.louvain(G1, weight='weight', resolution=1., randomize=False)
print(f'We detected {len(coms.communities)} communities in {coms.method_name}.')
for i in range(len(coms.communities)):
    print(f'Where Community {i} is of size {len(coms.communities[i])}')


In [None]:
## Get the entities mapped with com labels

In [None]:
community_df = pd.DataFrame(coms.to_node_community_map()).T
community_df=community_df.reset_index()

In [None]:
community_df

In [None]:
entity_community_df = community_df[community_df["index"].str.contains('entity_id',case=False)]
entity_community_df.index = entity_community_df['index']
entity_community_df=entity_community_df.drop(columns=["index"])
entity_community_df.head()

In [None]:
entity_community_df

In [None]:
entity_com_with_com_label = pd.merge(coordinates_per_entity, entity_community_df, left_index=True, right_index=True)
entity_com_with_com_label.rename(columns = {0:'comm_label'}, inplace = True)
entity_com_with_com_label.head()

In [None]:
entity_com_with_com_label

In [None]:
sns.scatterplot(x='x',
                y='y',
                hue='comm_label',
                data = entity_com_with_com_label.sample(1000),
#                 style='comm_label',
                palette='Set3')
plt.show()

In [None]:
entity_com_with_com_label

In [None]:
# Neigherst neighbour

In [None]:

labelwisedf.index.tolist()

In [None]:
nn_distrib={}
for label in range(entity_com_with_com_label['comm_label'].nunique()):
#     print(f'In label={label}')
    labelwisedf = entity_com_with_com_label[entity_com_with_com_label['comm_label']== label]
#     print(f'Size={labelwisedf.shape}')    
    common_nn_on_label=[]
    for key in labelwisedf.index.tolist():
        cnn = pd.DataFrame(deepwalk_model.wv.similar_by_key(key,topn=5000), columns=['keys','prob'])
        non_ent_keys = cnn[~cnn['keys'].str.contains('entity_id_|merchant_|step_', regex=True)]
        common_nn_on_label.append(non_ent_keys)
        
    com1_nn=pd.concat(common_nn_on_label)
#     print(com1_nn['keys'].value_counts()[0:10])
    nn_distrib[label] = com1_nn['keys'].value_counts()[0:10]
    print(f'We got community {label}, composed of {labelwisedf.shape[0]} entities with following top 3 properties {", ".join(com1_nn["keys"].value_counts().index[0:3].tolist())}')

In [None]:
com1_nn['prob'].value_counts().index[0:3].tolist()

In [None]:
com1_nn=pd.concat(common_nn_on_label)

In [None]:
com1_nn_distribution = com1_nn.value_counts()[0:10]

In [None]:
df_new[entity_column] = 'entity_id_' + df_new[entity_column].astype(str) + '__' + df_new[datetime_columns].astype(str)
df_new.index = df_new[entity_column]
df_new.head()

In [None]:
df_new_with_com = pd.merge(df_new, entity_community_df, left_index=True, right_index=True)
df_new_with_com = df_new_with_com.reset_index(drop=True)

In [None]:

corr_list = list()
column_l = []
for column in df_new_with_com.columns:
    try:
        b = df_new_with_com[column].corr(df_new_with_com[0],method='spearman')
        corr_list.append(b)
        column_l.append(column)
    except:
        pass

corr_df = pd.concat([pd.Series(column_l),pd.Series(corr_list)],axis=1).fillna(0)
corr_df.columns  = ['column_name','Correlation']
corr_df['direction'] = np.where(corr_df['Correlation']>0,'Positive','Negative')
corr_df=corr_df.sort_values(by='Correlation',ascending = False)
corr_df

In [None]:
for i in range(len(coms.communities)):
    print(f"******For community {i}*******")
    id= df_new_with_com[df_new_with_com[0] == i].index.tolist()
    row=[]
    for i in list(df_new_with_com.iloc[id]):
        try:
            row.append([df_new_with_com[i].iloc[id].value_counts()[df_new_with_com[i].iloc[id].value_counts()/len(df_new_with_com[i].iloc[id])>0.8].iloc[0]/len(id),df_new_with_com[i].iloc[id].value_counts()[df_new_with_com[i].iloc[id].value_counts()/len(df_new_with_com[i].iloc[id])>0.8].index[0],i])
        except:
            pass
    dis=pd.DataFrame(row,columns=["percentage","value","Column"])
    print(dis)

In [None]:
node_embedding_df_withcom = pd.merge(node_embedding_df, entity_community_df, left_index=True, right_index=True)
node_embedding_df_withcom = node_embedding_df_withcom.drop(columns='0_x')
node_embedding_df_withcom

In [None]:
pca = PCA(n_components=64)
transformed_df = pca.fit_transform(node_embedding_df_withcom.drop(columns=['0_y']))

tsne = TSNE(n_components=2,n_jobs=-1)
tsne2d = tsne.fit_transform(transformed_df)

coordinates_per_entity = pd.DataFrame(tsne2d,columns=['x','y'] )

sns.scatterplot(tsne2d[:,0],tsne2d[:,1],hue = node_embedding_df_withcom['0_y'],palette='Set3')
plt.show()

score = metrics.silhouette_score(coordinates_per_entity, node_embedding_df_withcom['0_y'] ,metric='euclidean')
print(score)

In [None]:
coordinates_per_entity.index =node_id_lookup
coordinates_per_entity

In [None]:
pca = PCA(n_components=32)
transformed_df = pca.fit_transform(node_embedding_df_withcom.drop(columns=['0_y']))

tsne = TSNE(n_components=2,n_jobs=-1)
tsne2d = tsne.fit_transform(transformed_df)

coordinates_per_entity = pd.DataFrame(tsne2d,columns=['x','y'] )

sns.scatterplot(tsne2d[:,0],tsne2d[:,1],hue = node_embedding_df_withcom['0_y'],palette='Set3')
plt.show()

score = metrics.silhouette_score(coordinates_per_entity, node_embedding_df_withcom['0_y'] ,metric='euclidean')
print(score)

In [None]:
pca = PCA(n_components=16)
transformed_df = pca.fit_transform(node_embedding_df_withcom.drop(columns=['0_y']))

tsne = TSNE(n_components=2,n_jobs=-1)
tsne2d = tsne.fit_transform(transformed_df)

coordinates_per_entity = pd.DataFrame(tsne2d,columns=['x','y'] )

sns.scatterplot(tsne2d[:,0],tsne2d[:,1],hue = node_embedding_df_withcom['0_y'],palette='Set3')
plt.show()

score = metrics.silhouette_score(coordinates_per_entity, node_embedding_df_withcom['0_y'] ,metric='euclidean')
print(score)

In [None]:
pca = PCA(n_components=8)
transformed_df = pca.fit_transform(node_embedding_df_withcom.drop(columns=['0_y']))

tsne = TSNE(n_components=2,n_jobs=-1)
tsne2d = tsne.fit_transform(transformed_df)

coordinates_per_entity = pd.DataFrame(tsne2d,columns=['x','y'] )

sns.scatterplot(tsne2d[:,0],tsne2d[:,1],hue = node_embedding_df_withcom['0_y'],palette='Set3')
plt.show()

score = metrics.silhouette_score(coordinates_per_entity, node_embedding_df_withcom['0_y'] ,metric='euclidean')
print(score)

In [None]:
pca = PCA(n_components=4)
transformed_df = pca.fit_transform(node_embedding_df_withcom.drop(columns=['0_y']))

tsne = TSNE(n_components=2,n_jobs=-1)
tsne2d = tsne.fit_transform(transformed_df)

coordinates_per_entity = pd.DataFrame(tsne2d,columns=['x','y'] )

sns.scatterplot(tsne2d[:,0],tsne2d[:,1],hue = node_embedding_df_withcom['0_y'],palette='Set3')
plt.show()

score = metrics.silhouette_score(coordinates_per_entity, node_embedding_df_withcom['0_y'] ,metric='euclidean')
print(score)

In [None]:
pca = PCA(n_components=2)
transformed_df = pca.fit_transform(node_embedding_df_withcom.drop(columns=['0_y']))

coordinates_per_entity = pd.DataFrame(transformed_df,columns=['x','y'] )

sns.scatterplot(transformed_df[:,0],transformed_df[:,1],hue = node_embedding_df_withcom['0_y'],palette='Set3')
plt.show()

score = metrics.silhouette_score(coordinates_per_entity, node_embedding_df_withcom['0_y'] ,metric='euclidean')
print(score)

In [None]:
tsne = TSNE(n_components=2,n_jobs=-1,perplexity=50)
tsne2d = tsne.fit_transform(node_embedding_df_withcom.drop(columns=['0_y']))

coordinates_per_entity = pd.DataFrame(tsne2d,columns=['x','y'] )

sns.scatterplot(tsne2d[:,0],tsne2d[:,1],hue = node_embedding_df_withcom['0_y'],palette='Set3')
plt.show()

score = metrics.silhouette_score(coordinates_per_entity, node_embedding_df_withcom['0_y'] ,metric='euclidean')
print(score)

In [None]:
tsne = TSNE(n_components=2,n_jobs=-1,perplexity=100,verbose=1,)
tsne2d = tsne.fit_transform(node_embedding_df_withcom.drop(columns=['0_y']))

coordinates_per_entity = pd.DataFrame(tsne2d,columns=['x','y'] )

sns.scatterplot(tsne2d[:,0],tsne2d[:,1],hue = node_embedding_df_withcom['0_y'],palette='Set3')
plt.show()

score = metrics.silhouette_score(coordinates_per_entity, node_embedding_df_withcom['0_y'] ,metric='euclidean')
print(score)

In [None]:
tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
tsne2d = tsne_model.fit_transform(node_embedding_df_withcom.drop(columns=['0_y']))

coordinates_per_entity = pd.DataFrame(tsne2d,columns=['x','y'] )

sns.scatterplot(tsne2d[:,0],tsne2d[:,1],hue = node_embedding_df_withcom['0_y'],palette='Set3')
plt.show()

score = metrics.silhouette_score(coordinates_per_entity, node_embedding_df_withcom['0_y'] ,metric='euclidean')
print(score)


In [None]:
node_embedding_df_withcom

In [None]:
x = tsne2d[:,0]
y = tsne2d[:,1]

plt.figure(figsize=(16, 16)) 
for i in range(len(node_embedding_df_withcom)):
    plt.scatter(x[i],y[i])
    plt.annotate(node_embedding_df_withcom.index[i],
                 xy=(x[i], y[i]),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')
plt.show()

In [None]:
### use this to optize https://towardsdatascience.com/how-to-use-dbscan-effectively-ed212c02e62
from sklearn.cluster import DBSCAN

fig = plt.figure(figsize=(20, 10))
fig.subplots_adjust(hspace=.5, wspace=.2)
i = 1
for x in range(10, 0, -1):
    eps = 1/(11-x)
    db = DBSCAN(eps=eps, min_samples=50).fit(node_embedding_df_withcom.drop(columns=['0_y']))
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    
    print(eps)
    ax = fig.add_subplot(2, 5, i)
    ax.text(1, 4, "eps = {}".format(round(eps, 1)), fontsize=25, ha="center")
    sns.scatterplot(tsne2d[:,0], tsne2d[:,1], hue=["cluster-{}".format(x) for x in labels])
    
    i += 1

In [None]:
# Load data
df_new = pd.read_csv(dataset_name)
entity_column='entity_column'
datetime_columns ='datetime_columns'
df_new['entity_column']= df_new.index.tolist()
df_new['datetime_columns']= df_new.index.tolist()    
df_new[entity_column] = 'entity_id_' + df_new[entity_column].astype(str) + '__' + df_new[datetime_columns].astype(str)
df_new.index = df_new[entity_column]
df_new.head()

# Rules using Decision Tree

In [None]:
df_new = pd.merge(df_new, entity_community_df, left_index=True, right_index=True)
df_new.head()

In [None]:
X = df_new.drop(columns=[entity_column,target_column,datetime_columns])
y = df_new[target_column]

# Label Encoding of Categorical Columns
categorical_names = {}
for feature in X.columns:
    X[feature] =  X[feature].fillna('') 
    le =  LabelEncoder()
    le.fit(X[feature])
    X[feature] = le.transform(X[feature])
    categorical_names[feature] = le.classes_

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.75, random_state=42,stratify=y)

rf = RandomForestClassifier(criterion='entropy', class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)

predictions = rf.predict_proba(X_test)
predictions_class = rf.predict(X_test)

if df_new[target_column].nunique()> 2:
    rules_score2 = roc_auc_score(y_test, predictions[:,1], multi_class = 'ovo')
else:
    rules_score2 = roc_auc_score(y_test, predictions[:,1])

print(rules_score2)
print(f'Change in the ROC-AUC {(rules_score2 - rules_score)*100}')

#TODO : Update the communities based on the no of clusters
rules_using_dt = get_rules(rf.estimators_[5], X.columns.tolist(),categorical_names, ['community_0','community_1','community_2','community_3'])

In [None]:

cf_matrix = confusion_matrix(y_test, predictions_class)
print(cf_matrix)
sns.heatmap(cf_matrix, annot=True)

In [None]:
[[923   2]
 [160  18]]

In [None]:
rules_using_dt

In [None]:
import sklearn
import graphviz

# draw model
dot_data = sklearn.tree.export_graphviz(rf.estimators_[5], out_file=None, filled=True, rounded=True, special_characters=True)  
graph = graphviz.Source(dot_data)  
graph

# The End