In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm

In [2]:
# Directory (relative to the notebook) that holds train.csv and test.csv.
DATADIR = '../hike/'

In [3]:
# Only the two endpoint columns are needed to build the full graph.
train = pd.read_csv(DATADIR + 'train.csv', usecols=['node1_id', 'node2_id'])

In [4]:
# Same two endpoint columns for the test edges.
test = pd.read_csv(DATADIR + 'test.csv', usecols=['node1_id', 'node2_id'])

In [5]:
# (rows, columns) of the train and test edge lists.
train.shape, test.shape

((70661802, 2), (11776968, 2))

In [6]:
# Stack the train edges on top of the test edges (row-wise concatenation);
# each frame keeps its own index labels.
edge_frames = [train, test]
df = pd.concat(edge_frames)

In [7]:
# Sanity check: row count should equal train rows + test rows.
df.shape

(82438770, 2)

In [8]:
# The separate frames are no longer needed once `df` holds every edge.
del train
del test

#### make graph for all nodes and create neighbours dict

In [None]:
%%time
graph = nx.from_pandas_edgelist(df=df, source='node1_id', target='node2_id')

In [9]:
%%time
neighbour = {node:[n for n in graph.neighbors(node)] for node in graph.nodes}

CPU times: user 6min 42s, sys: 2.5 s, total: 6min 45s
Wall time: 6min 45s


#### make graph for chat nodes only and create neighbours dict for the same

In [17]:
# Re-read train.csv with all columns: the `is_chat` column used below was not
# loaded the first time, and the earlier `train` frame has been deleted.
train = pd.read_csv(DATADIR + 'train.csv')

In [18]:
%%time
chat=train[train.is_chat==1]
chat_graph = nx.from_pandas_edgelist(df=chat.iloc[:,:2],source='node1_id',target='node2_id')

CPU times: user 27.5 s, sys: 1.21 s, total: 28.7 s
Wall time: 28.7 s


In [20]:
%%time
is_chat_neighbour = {node:[n for n in chat_graph.neighbors(node)] for node in chat_graph.nodes}

CPU times: user 18.4 s, sys: 90.8 ms, total: 18.5 s
Wall time: 18.5 s


In [75]:
# del train

In [17]:
# Number of nodes that take part in at least one chat edge.
print(len(is_chat_neighbour))

1899068


In [18]:
# Number of distinct nodes across all train and test edges.
print(len(neighbour))

8264276


#### saving neighbours dict for reloading purpose

In [9]:
# import pickle

In [14]:
# with open(r"neighbour_all_pat.pickle", "wb") as output_file:
#     pickle.dump(neighbour, output_file)

In [15]:
# with open(r"neighbour_chat_pat.pickle", "wb") as output_file:
#     pickle.dump(is_chat_neighbour, output_file)

#### clear graph from memory

In [16]:
# graph.clear()

In [None]:
# chat_graph.clear()

In [None]:
# del graph, chat_graph

#### load neighbours dict from pickle

In [11]:
# import pickle

In [12]:
# with open("neighbour_all_pat.pickle", "rb") as input_file:
#     neighbour = pickle.load(input_file)

In [13]:
# with open("neighbour_chat_pat.pickle", "rb") as input_file:
#     is_chat_neighbour = pickle.load(input_file)

The history saving thread hit an unexpected error (OperationalError('database is locked',)).History will not be written to the database.


####  creating neighbours variables for all edges (train+test)

In [None]:
%%time
with open('neigbours_vars.csv', 'w') as myfile:
    for i, row in df.iterrows():
        a, b = row['node1_id'], row['node2_id']
        neighbour_a, neighbour_b = set(neighbour[int(a)]), set(neighbour[int(b)])
        try:
            chat_neighbour_a=set(is_chat_neighbour[int(a)])
        except:
            chat_neighbour_a=set()
        try:
            chat_neighbour_b=set(is_chat_neighbour[int(b)])
        except:
            chat_neighbour_b=set()
            
        if a in neighbour_a: neighbour_a.remove(a)
        if b in neighbour_a: neighbour_a.remove(b)
        if a in neighbour_b:neighbour_b.remove(a)
        if b in neighbour_b:neighbour_b.remove(b)

        if a in chat_neighbour_a: chat_neighbour_a.remove(a)
        if b in chat_neighbour_a: chat_neighbour_a.remove(b)
        if a in chat_neighbour_b:chat_neighbour_b.remove(a)
        if b in chat_neighbour_b:chat_neighbour_b.remove(b)
            
        na_n_c_n = [len(is_chat_neighbour.get(node, [])) for node in neighbour_a]
        nb_n_c_n = [len(is_chat_neighbour.get(node, [])) for node in neighbour_b]
        
        mutual_n_c_n = [len(is_chat_neighbour.get(node, [])) for node in neighbour_a.intersection(neighbour_b)]
        all_n_c_n = [len(is_chat_neighbour.get(node, [])) for node in neighbour_a.union(neighbour_b)]
        
        mutual_chat_n_c_n = [len(is_chat_neighbour.get(node, [])) for node in chat_neighbour_a.intersection(chat_neighbour_b)]
        all_chat_n_c_n = [len(is_chat_neighbour.get(node, [])) for node in chat_neighbour_a.union(chat_neighbour_b)]
        
        myfile.write("{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}\n".format(
                len(neighbour_a),
                len(neighbour_b),
                len(chat_neighbour_a),
                len(chat_neighbour_b),
                len(neighbour_a.intersection(neighbour_b)),
                len(chat_neighbour_a.intersection(chat_neighbour_b)),
                len(chat_neighbour_a.intersection(neighbour_a.intersection(neighbour_b))),
                len(chat_neighbour_b.intersection(neighbour_a.intersection(neighbour_b))),
            
                sum(na_n_c_n),
                len(na_n_c_n),
                sum(nb_n_c_n),
                len(nb_n_c_n),
                sum(mutual_n_c_n),
                len(mutual_n_c_n),
                sum(all_n_c_n),
                len(all_n_c_n),
                sum(mutual_chat_n_c_n),
                len(mutual_chat_n_c_n),
                sum(all_chat_n_c_n),
                len(all_chat_n_c_n),
        ))


In [47]:
# The feature file has no header row, so columns are labelled 0..19 in the
# order they were written: 0-7 are neighbour / chat-neighbour counts, 8-19 are
# (sum, count) pairs of chat degrees over six node groups.
all_neighbour = pd.read_csv('neigbours_vars.csv', header=None)

In [67]:
# Each entry: (sum feature name, average feature name, sum column, count column).
# Columns 8-15 of the feature file are (sum, count) pairs of chat degrees over
# the neighbours of the source, of the target, their mutual neighbours and the
# union of their neighbours. An average is NaN where both sum and count are 0.
deg_2_specs = [
    ('degree_2_neighs_chat_sum_source', 'degree_2_neighs_chat_avg_source', 8, 9),
    ('degree_2_neighs_chat_sum_target', 'degree_2_neighs_chat_avg_target', 10, 11),
    ('mutual_neighs_avg_chat_sum', 'mutual_neighs_avg_chat_avg', 12, 13),
    ('union_neighs_avg_chat_sum', 'union_neighs_avg_chat_avg', 14, 15),
]

df_deg_2_neigh = pd.DataFrame()
for sum_name, avg_name, sum_col, count_col in deg_2_specs:
    df_deg_2_neigh[sum_name] = all_neighbour[sum_col]
    df_deg_2_neigh[avg_name] = all_neighbour[sum_col] / all_neighbour[count_col]
df_deg_2_neigh.head()


Unnamed: 0,degree_2_neighs_chat_sum_source,degree_2_neighs_chat_avg_source,degree_2_neighs_chat_sum_target,degree_2_neighs_chat_avg_target,mutual_neighs_avg_chat_sum,mutual_neighs_avg_chat_avg,union_neighs_avg_chat_sum,union_neighs_avg_chat_avg
0,26,0.702703,11,1.571429,1,1.0,36,0.837209
1,37,0.787234,10,0.909091,0,,47,0.810345
2,68,1.478261,44,2.095238,7,2.333333,105,1.640625
3,101,1.77193,122,1.794118,20,2.857143,203,1.720339
4,47,0.691176,7,0.7,0,0.0,54,0.701299


In [69]:
# Persist the second-degree chat features for later reuse.
df_deg_2_neigh.to_pickle('degree_2_neighbour_feats.pkl')

In [63]:
# Column 4: mutual-neighbour count; columns 6 and 7: chat neighbours of the
# first / second endpoint that are also mutual neighbours.
mutual_feature_cols = [4, 6, 7]
df_neighbours1 = all_neighbour[mutual_feature_cols]
df_neighbours1.head()

Unnamed: 0,4,6,7
0,1,0,0
1,0,0,0
2,3,0,0
3,7,0,0
4,1,0,0


In [71]:
# Written without the index column; the header row holds the column labels 4, 6, 7.
df_neighbours1.to_csv('neigbours_vars_sahil_1.csv', index=False)

In [72]:
# Columns 2 and 3: chat-neighbour counts of the first / second endpoint.
chat_count_cols = [2, 3]
df_neighbours2 = all_neighbour[chat_count_cols]
df_neighbours2.head()

Unnamed: 0,2,3
0,0,0
1,0,0
2,1,4
3,0,0
4,2,0


In [73]:
# Written without the index column; the header row holds the column labels 2, 3.
df_neighbours2.to_csv('neigbours_vars_sahil_2.csv', index=False)

In [55]:
# Remaining (sum, count) pairs of the feature file: chat degrees over the
# mutual chat neighbours (16, 17) and over the union of chat neighbours (18, 19).
# Output columns alternate sum, average: 0, 1 for the first pair and 2, 3 for
# the second. An average is NaN where both sum and count are 0.
leftover_pairs = [(16, 17), (18, 19)]

df_leftover = pd.DataFrame()
for position, (sum_col, count_col) in enumerate(leftover_pairs):
    df_leftover[2 * position] = all_neighbour[sum_col]
    df_leftover[2 * position + 1] = all_neighbour[sum_col] / all_neighbour[count_col]

In [56]:
# Preview; NaN averages come from rows where the node group is empty (0 / 0).
df_leftover.head()

Unnamed: 0,0,1,2,3
0,0,,0,
1,0,,0,
2,0,,23,4.6
3,0,,0,
4,0,,5,2.5


In [77]:
# Persist the remaining chat-neighbour features.
df_leftover.to_pickle('neigbours_vars_pat_leftover_2.pkl')