In [None]:
import pickle

import networkx as nx
import numpy as np
import pandas as pd
# Let wide frames render every column (up to 1000) instead of being truncated.
pd.set_option('display.max_columns', 1000)

In [None]:
# Folder holding train.csv / test.csv. File paths are built by plain string
# concatenation, so the trailing slash is required.
DATA_DIR = "../hike/"

In [None]:
# Load only the two endpoint columns of each training edge.
# NOTE(review): 'is_chat' is not listed in usecols, so this dtype entry looks
# unused here -- confirm before removing it.
train = pd.read_csv(
    DATA_DIR + 'train.csv',
    usecols=['node1_id', 'node2_id'],
    dtype={"is_chat": np.int8},
)

In [None]:
# Load only the two endpoint columns of each test edge.
# NOTE(review): 'is_chat' is not listed in usecols, so this dtype entry looks
# unused here -- confirm before removing it.
test = pd.read_csv(
    DATA_DIR + 'test.csv',
    usecols=['node1_id', 'node2_id'],
    dtype={"is_chat": np.int8},
)

In [None]:
# Sanity check: (rows, columns) of the train and test edge lists.
(train.shape, test.shape)

In [None]:
# Stack the train edges on top of the test edges (row-wise, the default axis).
# The row labels of each input are kept as they are.
df = pd.concat((train, test))

In [None]:
# The combined frame `df` is all that is used from here on; drop the inputs.
del train
del test

In [None]:
%%time
# One graph edge per row of df, with node1_id / node2_id as the endpoints.
graph = nx.from_pandas_edgelist(
    df,
    source='node1_id',
    target='node2_id',
)

#### Creating Jaccard coefficient (JC), resource allocation (RSA), preferential attachment (PA) and Adamic-Adar (AA) values for the undirected graph

In [None]:
%%time
# For every (node1_id, node2_id) pair in df, compute four link-prediction
# scores on `graph` and stream them to jc_rsa_pa_aai.csv, one line per pair in
# df row order. Columns: Jaccard coefficient, resource allocation index,
# preferential attachment, Adamic-Adar index (left empty when undefined).
with open( "jc_rsa_pa_aai.csv", "w") as myfile:
    myfile.write("jc,rsa,pa,aa\n")
    # Walk the two id columns directly: same pairs in the same order as
    # df.iterrows(), without building a Series object for every row.
    for a, b in zip(df['node1_id'], df['node2_id']):
        pair = [(a, b)]
        # Each networkx call returns an iterator of (u, v, score) tuples.
        # The builtin next() works on Python 2.6+ and Python 3, whereas the
        # generator method .next() exists only on Python 2.
        jc = next(nx.jaccard_coefficient(graph, pair))[2]
        rsa = next(nx.resource_allocation_index(graph, pair))[2]
        pa = next(nx.preferential_attachment(graph, pair))[2]
        try:
            aai = next(nx.adamic_adar_index(graph, pair))[2]
        except ZeroDivisionError:
            # Adamic-Adar sums 1 / log(degree) over common neighbours, which
            # is undefined for a degree-1 neighbour (log(1) == 0). Only that
            # case is swallowed, so Ctrl-C and real errors still surface.
            aai = ''
        myfile.write("{},{},{},{}\n".format(jc, rsa, pa, aai))


#### Creating triangles features

In [None]:
%%time
# Per-node triangle counts for the whole graph, keyed by node.
triangles = nx.triangles(G=graph)

In [None]:
# Two-column lookup table: node id -> number of triangles through that node.
# items() is materialised as a list so the DataFrame constructor receives a
# plain sequence of (node, count) pairs on Python 3 as well, where items()
# is a view rather than a list.
tri = pd.DataFrame(list(triangles.items()), columns=['node_id', 'num_triangle'])

In [None]:
%%time
# Attach the triangle count of both endpoints to every edge in df.
# The lookup table is renamed so its key matches the edge column being joined;
# the left joins keep the rows of df in their original order.
# Counts are stored as int32: astype('int16') wraps around silently for any
# node with more than 32767 triangles.
df_triangles = (
    df
    .merge(
        tri.rename(columns={'node_id': 'node1_id', 'num_triangle': 'triangles_source'}),
        on='node1_id',
        how='left',
    )
    .merge(
        tri.rename(columns={'node_id': 'node2_id', 'num_triangle': 'triangles_target'}),
        on='node2_id',
        how='left',
    )
    .astype({'triangles_source': 'int32', 'triangles_target': 'int32'})
)

# Persist only the two feature columns (the node ids are not stored).
df_triangles[['triangles_source', 'triangles_target']].to_pickle('triangles.pkl')


In [48]:
# Preview the first five edges with their triangle features.
df_triangles.head(5)

Unnamed: 0,node1_id,node2_id,triangles_source,triangles_target
0,8446602,6636127,18,1
1,1430102,7433949,63,2
2,2803017,8372333,32,40
3,4529348,894645,62,193
4,5096572,4211638,45,5


#### Creating clustering-coefficient features

In [None]:
# Per-node clustering coefficient for the whole graph, keyed by node.
clusters = nx.clustering(G=graph)

In [40]:
# Two-column lookup table: node id -> clustering coefficient.
# items() is materialised as a list so the DataFrame constructor receives a
# plain sequence of (node, value) pairs on Python 3 as well, where items()
# is a view rather than a list.
clust = pd.DataFrame(list(clusters.items()), columns=['node_id', 'clust_coeff'])

In [41]:
%%time
# Attach the clustering coefficient of both endpoints to every edge in df.
# The lookup table is renamed so its key matches the edge column being joined;
# the left joins keep the rows of df in their original order.
df_cluster = (
    df
    .merge(
        clust.rename(columns={'node_id': 'node1_id', 'clust_coeff': 'clust_source'}),
        on='node1_id',
        how='left',
    )
    .merge(
        clust.rename(columns={'node_id': 'node2_id', 'clust_coeff': 'clust_target'}),
        on='node2_id',
        how='left',
    )
    # Half precision keeps the stored feature file small.
    .astype({'clust_source': 'float16', 'clust_target': 'float16'})
)

# Persist only the two feature columns (the node ids are not stored).
df_cluster[['clust_source', 'clust_target']].to_pickle('cluster_coeffs.pkl')

CPU times: user 3.63 s, sys: 1.49 s, total: 5.11 s
Wall time: 5.11 s


In [42]:
# Preview the first five edges with their clustering-coefficient features.
df_cluster.head(5)

Unnamed: 0,node1_id,node2_id,clust_source,clust_target
0,8446602,6636127,0.025604,0.035706
1,1430102,7433949,0.055847,0.030304
2,2803017,8372333,0.029602,0.173218
3,4529348,894645,0.037506,0.082275
4,5096572,4211638,0.01918,0.090881


#### Creating degree centrality feature

In [None]:
# Degree centrality of every node in the graph, keyed by node.
# NOTE(review): despite the variable name this is networkx's degree
# *centrality*, not the raw neighbour count -- confirm that is intended.
undirected_degree = nx.degree_centrality(graph)

In [None]:
# Persist the node -> degree-centrality mapping for later feature building.
# `pickle` must be imported in the notebook's import cell; without it this
# cell raises NameError on a fresh kernel.
with open('degrees_contact.pkl', 'wb') as fh:
    pickle.dump(undirected_degree, fh)