In [36]:
import pickle
import copy
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import networkx as nx

In [37]:
fp = open('./data/twitch_missing_edges_final.p', "rb")
missing_edges = pickle.load(fp)
type(missing_edges)

set

In [38]:
df_neg = pd.DataFrame(list(missing_edges), columns=['Source', 'Destination'])
print(df_neg.shape)
df_neg.head(1)

(35324, 2)


Unnamed: 0,Source,Destination
0,58556,32509


In [39]:
df_pos = pd.read_csv('./data/musae_ENGB_edges.csv')
df_pos = df_pos.rename(columns = {'from':'Source', 'to':'Destination'})
df_pos = df_pos.drop_duplicates()
print(df_pos.shape)
df_pos.head(1)

(35324, 2)


Unnamed: 0,Source,Destination
0,6194,255


In [40]:
df_pos['Class'] = 1
df_pos.head(5)

Unnamed: 0,Source,Destination,Class
0,6194,255,1
1,6194,980,1
2,6194,2992,1
3,6194,2507,1
4,6194,986,1


In [41]:
df_neg['Class'] = 0
df_neg.head(5)

Unnamed: 0,Source,Destination,Class
0,58556,32509,0
1,43637,73648,0
2,21645,56924,0
3,73623,64638,0
4,60693,64149,0


In [42]:
frames = [df_pos, df_neg]
df = pd.concat(frames)
print(df.shape)
df.head(5)

(70648, 3)


Unnamed: 0,Source,Destination,Class
0,6194,255,1
1,6194,980,1
2,6194,2992,1
3,6194,2507,1
4,6194,986,1


In [43]:
df.to_csv('./data/twitch_subset.csv')

In [44]:
df_copy = copy.deepcopy(df)

## Feature Extraction

In [45]:
g = nx.from_pandas_edgelist(df[['Source','Destination']], source='Source', target='Destination',create_using=nx.DiGraph())
print(nx.info(g))

Name: 
Type: DiGraph
Number of nodes: 50239
Number of edges: 70648
Average in degree:   1.4062
Average out degree:   1.4062


In [46]:
# Page Rank
pr = nx.pagerank(g)
df['Page_Rank_Src'] = df.Source.apply(lambda row: pr.get(row))
df['Page_Rank_Dst'] = df.Destination.apply(lambda row: pr.get(row))

In [47]:
# Shortest Path
def get_shortest_path(x, y):
    d = -1
    try:
        if g.has_edge(x, y):
            g.remove_edge(x, y)
            d = nx.shortest_path_length(g, source=x, target=y)
            g.add_edge(x, y)
        else:
            d = nx.shortest_path_length(g, source=x, target=y)
    except:
        d = -1
    return d

df['Shortest_Path'] = df.apply(lambda row: get_shortest_path(row['Source'], row['Destination']), axis = 1)

In [48]:
# Follow Features
followers_src, followers_dst, followees_src, followees_dst, int_followers, int_followees = [], [], [], [], [], []

for i, r in df.iterrows():
    pre_src = set(g.predecessors(r['Source'])) if set(g.predecessors(r['Source'])) else set()
    suc_src = set(g.successors(r['Source'])) if set(g.successors(r['Source'])) else set()

    pre_dst = set(g.predecessors(r['Destination'])) if set(g.predecessors(r['Destination'])) else set()
    suc_dst = set(g.successors(r['Destination'])) if set(g.successors(r['Destination'])) else set()

    followers_src.append(len(pre_src))
    followees_src.append(len(suc_src))

    followers_dst.append(len(pre_dst))
    followees_dst.append(len(suc_dst))

    int_followers.append(len(pre_src.intersection(pre_dst)))
    int_followees.append(len(suc_src.intersection(suc_dst)))
        
df['Followers_Src'] = followers_src
df['Followees_Src'] = followees_src
df['Followers_Dst'] = followers_dst
df['Followees_Dst'] = followees_dst
df['Int_Followers'] = int_followers
df['Int_Followees'] = int_followees

In [49]:
df.head(5)

Unnamed: 0,Source,Destination,Class,Page_Rank_Src,Page_Rank_Dst,Shortest_Path,Followers_Src,Followees_Src,Followers_Dst,Followees_Dst,Int_Followers,Int_Followees
0,6194,255,1,2.3e-05,1.3e-05,-1,0,3,0,3,0,0
1,6194,980,1,2.3e-05,1.4e-05,-1,0,3,0,15,0,1
2,6194,2992,1,2.3e-05,2.4e-05,4,0,3,2,0,0,0
3,6194,2507,1,2.3e-05,2e-05,3,0,3,8,13,0,0
4,6194,986,1,2.3e-05,2.9e-05,-1,0,3,0,20,0,0


In [50]:
df.to_csv('./data/twitch_final_dataset.csv')