In [1]:
import nltk
from multiprocessing import cpu_count, Pool
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()
import pandas as pd
import numpy as np
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from utils import common, overlap, loop, parallel_loop, overlap_df
import igraph as ig
import networkx as nx
from keras.preprocessing import text, sequence
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rkroc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rkroc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### helper functions

In [None]:
def Jacc_Similarity(row, field):
    text1 = row[field+'_target'].lower()
    text2 = row[field+'_source'].lower()
    intersection = set(text1).intersection(set(text2))
    union = set(text1).union(set(text2))
    return float(len(intersection)) / len(union)

def overlap(row, field):
    text1 = row[field+'_target']
    text2 = row[field+'_source']
    text1 = stop_words_stems(text1)
    text2 = stop_words_stems(text2)
    overlap = len(set(text1).intersection(set(text2)))
    return overlap

def overlap_df(df, name='Overlap_title', field='Title'):
    df[name] = df.apply(lambda row : overlap(row, field))
    return df


def loop(df, f, field):
    l=[]
    le = len(df)
    for index, row in df.iterrows():
        l.append(f(row, field))
        if index%10000==0:
            print(index, le)
    return f

def parallel_loop(df, f):
    partitions = 2
    data_split = np.array_split(df, partitions)
    pool = Pool(2)
    df = pd.concat(pool.map(f, data_split))
    pool.close()
    pool.join()
    return df
    

def common(row, field='Authors'):
    text1 = row[field+'_target']
    text2 = row[field+'_source']
    if text1!=text1 or text2!=text2:
        return 0
    text1 = text1.split(",")
    text2 = text2.split(",")
    common = len(set(text1).intersection(set(text2)))
    return common
    
def stop_words_stems(txt):
    txt = txt.split(",")
    txt = [token for token in txt if token not in stpwds]
    txt = [stemmer.stem(token) for token in txt]

In [2]:
node_information = pd.read_csv('node_information.csv', header=None, names=['ID', 'Year', 'Title', 'Authors', 'Journal', 'Abstract'])
training_set = pd.read_csv('training_set.txt', header=None, names=['Target', 'Source', 'Edge'], delim_whitespace=True)
testing_set = pd.read_csv('testing_set.txt', header=None, names=['Target', 'Source', 'Edge'], delim_whitespace=True)

In [3]:
node_information.fillna('',inplace=True)
node_information.isnull().sum()

ID          0
Year        0
Title       0
Authors     0
Journal     0
Abstract    0
dtype: int64

In [4]:
testing_set

Unnamed: 0,Target,Source,Edge
0,9807076,9807139,
1,109162,1182,
2,9702187,9510135,
3,111048,110115,
4,9910176,9410073,
...,...,...,...
32643,9705209,9305083,
32644,9307023,9503118,
32645,9608095,9205058,
32646,9407008,106256,


##### basic text features already created can be ignored

In [30]:
def get_data():
    print("Get valid IDs")
    valid_ids=set()
    for element in training_set.values:
        valid_ids.add(element[0])
        valid_ids.add(element[1])
        
    print("Select valid indices from valid IDs")
    index_valid=[i for i, element in enumerate(node_information.values) if element[0] in valid_ids]
    node_info=node_information.iloc[index_valid]
    
    print("Get index for nodes")
    IDs = []
    ID_pos={}
    for element in node_info.values:
        ID_pos[element[0]]=len(IDs)
        IDs.append(element[0])
        
    print("Add ID column for merging")
    training_set['Target_ID']= training_set.apply(lambda row : ID_pos[row[0]], axis=1)#
    training_set['Source_ID']= training_set.apply(lambda row : ID_pos[row[1]], axis=1)#
    
    print("Merge")
    train = pd.merge(training_set, node_information, how='left', left_on='Target_ID', right_index=True)
    train = pd.merge(train, node_information, how='left', left_on='Source_ID', right_index=True, suffixes=['_target', '_source'])
    train.to_csv('train_basic_tfidfs.csv', index=False)
   
    
    train['Overlap_title'] = train.apply(lambda row :overlap(row, 'Title'), axis=1)#
 
    train.to_csv('train_basic_tfidfs.csv', index=False)
    train['Overlap_abstract'] = train.apply(lambda row :overlap(row, 'Abstract'), axis=1)#
    
    train.to_csv('train_basic_tfidfs.csv', index=False)
    train['Overlap_journal'] = train.apply(lambda row :overlap(row, 'Journal'), axis=1)#
    
    train.to_csv('train_basic_tfidfs.csv', index=False)
    train['Common_authors'] = train.apply(lambda row :common(row, 'Authors'), axis=1)

    train.to_csv('train_basic_tfidfs.csv', index=False)

    train['Common_authors_prob'] = train.apply(lambda row :Jacc_Similarity(row, 'Authors'), axis=1)
    print(time()-t)
    train.to_csv('train_basic_tfidfs.csv', index=False)
    train['Common_journal_prob'] = train.apply(lambda row :Jacc_Similarity(row, 'Journal'), axis=1)#
    train.to_csv('train_basic_tfidfs.csv', index=False)
    train['Date_diff'] = (train['Year_source']-train['Year_target']).abs()#
    train.to_csv('train_basic_tfidfs.csv', index=False)
   
    return train  

In [None]:
train_semantics = get_data()

In [39]:
def get_test_data():
    
    print("Get valid IDs")
    valid_ids=set()
    for element in training_set.values:
        valid_ids.add(element[0])
        valid_ids.add(element[1])
        
    print("Select valid indices from valid IDs")
    index_valid=[i for i, element in enumerate(node_information.values) if element[0] in valid_ids]
    node_info=node_information.iloc[index_valid]
    
    print("Get index for nodes")
    IDs = []
    ID_pos={}
    for element in node_info.values:
        ID_pos[element[0]]=len(IDs)
        IDs.append(element[0])
        
    print("Add ID column for merging")
    testing_set['Target_ID']= testing_set.apply(lambda row : ID_pos[row[0]], axis=1)#
    testing_set['Source_ID']= testing_set.apply(lambda row : ID_pos[row[1]], axis=1)#
    
    print("Merge")
    test = pd.merge(testing_set, node_information, how='left', left_on='Target_ID', right_index=True)
    test = pd.merge(test, node_information, how='left', left_on='Source_ID', right_index=True, suffixes=['_target', '_source'])
    test.to_csv('test_basic_tfidfs.csv', index=False)
    
    t = time()
    
    test['Overlap_title'] = test.apply(lambda row :overlap(row, 'Title'), axis=1)#
    test['Overlap_abstract'] = test.apply(lambda row :overlap(row, 'Abstract'), axis=1)#
    test['Overlap_journal'] = test.apply(lambda row :overlap(row, 'Journal'), axis=1)#
    
    test['Common_authors'] = test.apply(lambda row :common(row, 'Authors'), axis=1)
    test['Common_authors_prob'] = test.apply(lambda row :Jacc_Similarity(row, 'Authors'), axis=1)
    test['Common_journal_prob'] = test.apply(lambda row :Jacc_Similarity(row, 'Journal'), axis=1)#
    
    test['Date_diff'] = (test['Year_source']-test['Year_target']).abs()#
    print(time()-t)
    test.to_csv('test_basic_tfidfs.csv', index=False)
    
    

    return test

In [40]:
test_semantics = get_test_data()

Get valid IDs
Select valid indices from valid IDs
Get index for nodes
Add ID column for merging
Merge
14.434291362762451


In [45]:
test_semantics.columns

Index(['Target', 'Source', 'Edge', 'Target_ID', 'Source_ID', 'ID_target',
       'Year_target', 'Title_target', 'Authors_target', 'Journal_target',
       'Abstract_target', 'ID_source', 'Year_source', 'Title_source',
       'Authors_source', 'Journal_source', 'Abstract_source', 'Overlap_title',
       'Overlap_abstract', 'Overlap_journal', 'Common_authors',
       'Common_authors_prob', 'Common_journal_prob', 'Date_diff'],
      dtype='object')

In [50]:
train_semantics = train.copy()

# Additional_Features

In [78]:
def create_graph(X, y):
    graph = nx.Graph()
    edges=[]
    nodes=set()
    for i in range(len(X)):
        source = X[i][0]
        target = X[i][1]
        nodes.add(source)
        nodes.add(target)
        if y[i]==1:
            edges.append((source, target))
    graph.add_nodes_from(nodes)
    graph.add_edges_from(edges)
    return graph

def create_directed_graph(X, y):
    graph = nx.DiGraph()
    edges=[]
    nodes=set()
    for i in range(len(X)):
        source = X[i][0]
        target = X[i][1]
        nodes.add(source)
        nodes.add(target)
        if y[i]==1:
            edges.append((source, target))
    graph.add_nodes_from(nodes)
    graph.add_edges_from(edges)
    return graph

def vertex_degree(graph, v):
    return graph.degree(v)

def count(graph, nodes):
    c=0
    for node in nodes:
        for node_ in nodes:
            if graph.has_edge(node, node_):
                c+=1
    return c
def subgraphs_edge_number(graph, v):
    #neighbors = graph.neighbors(v)
    neighbors = list(graph[v].keys())
    neighbors_plus = neighbors + [v]
    subgraph = graph.subgraph(neighbors)
    subgraph_plus = graph.subgraph(neighbors_plus)
    sub_edge_num = subgraph.number_of_edges()
    #sub_edge_num = subgraph.size()
    sub_edge_num_plus = subgraph_plus.number_of_edges()
    return sub_edge_num, sub_edge_num_plus

def all_vertex(graph, v):
    neighbors = list(graph[v].keys())
    neighbors_plus = neighbors + [v]
    subgraph = graph.subgraph(neighbors)
    subgraph_plus = graph.subgraph(neighbors_plus)
    sub_edge_num = subgraph.number_of_edges()
    sub_edge_num_plus = subgraph_plus.number_of_edges()
    #sub_edge_num = count(graph, neighbors)
    #sub_edge_num_plus = count(graph, neighbors_plus)
    return graph.degree(v), sub_edge_num, sub_edge_num_plus

def common_friends(graph, u, v):
    return len(nx.common_neighbors(graph, u, v))

def total_friends(graph, u, v):
    neighbors_u = list(graph[u].keys())
    neighbors_v = list(graph[v].keys())
    total = list(set(neighbors_u).union(neighbors_v))
    return len(total)

def friends_measure(graph, u, v):
    neighbors_u = list(graph[u].keys())
    neighbors_v = list(graph[v].keys())
    c=0
    for n_u in neighbors_u:
        for n_v in neighbors_v:
            if graph.has_edge(n_u, n_v) or graph.has_edge(n_v, n_u):
                c+=1
                
def subgraph_features(graph, u, v):
    neighbors_u = list(graph[u].keys())
    neighbors_v = list(graph[v].keys())
    neighbors_u_plus = neighbors_u + [u]
    neighbors_v_plus = neighbors_v + [v]
    nh = list(set(neighbors_u).union(neighbors_v))
    nh_plus = list(set(neighbors_u_plus).union(neighbors_v_plus))
    sub_nh = graph.subgraph(nh)
    sub_nh_plus = graph.subgraph(nh_plus)
    return sub_nh.number_of_edges(), sub_nh_plus.number_of_edges()

def shortest_path(graph, u, v):
    return nx.shortest_path_length(graph, u, v)

def all_edges(graph, u, v):
    common_friends = len(list(nx.common_neighbors(graph, u, v)))
    neighbors_u = list(graph[u].keys())
    neighbors_v = list(graph[v].keys())
    nh = list(set(neighbors_u).union(neighbors_v))
    total_friends = len(nh)
    friends_measure=0
    for n_u in neighbors_u:
        for n_v in neighbors_v:
            if graph.has_edge(n_u, n_v) or graph.has_edge(n_v, n_u):
                friends_measure+=1
    neighbors_u_plus = neighbors_u + [u]
    neighbors_v_plus = neighbors_v + [v]
    nh_plus = list(set(neighbors_u_plus).union(neighbors_v_plus))
    sub_nh = graph.subgraph(nh)
    sub_nh_plus = graph.subgraph(nh_plus)
    if not nx.has_path(graph, v, u):
        len_path=-1
    else:
        len_path = nx.shortest_path_length(graph, v, u)
    return common_friends, total_friends, friends_measure, sub_nh.number_of_edges(), sub_nh_plus.number_of_edges(), len_path

def generate_vertex(graph, fs, X, len_fs=3):
    l = X.shape[0]
    feat_target = np.empty((l, len_fs))
    feat_source = np.empty((l, len_fs))
    t1 = time()
    for i, x in enumerate(X):
        t=x[0]
        s=x[1]
        feat_target[i]=fs(graph, t)
        feat_source[i]=fs(graph, s)
        if i%10000==0:
             print("{}/{} completed".format(i,len(X)))
    print("VERTEX generation time taken",time()-t)
    return feat_target, feat_source

def generate_edge(graph, fs, X, len_fs=6):
    l = X.shape[0]
    feat_edge = np.empty((l, len_fs))
    t1 = time()
    for i, x in enumerate(X):
        t=x[0]
        s=x[1]
        feat_edge[i]=fs(graph, t, s)
        if i%10000==0:
            print("{}/{} completed".format(i,len(X)))
    print("EDGE_genetration_time_taken :",time()-t1)
    return feat_edge

def generate_algo(graph, X):
    res_alloc_index=np.asarray(list(nx.resource_allocation_index(graph, X)))[:,2]
    jac_coef=np.asarray(list(nx.jaccard_coefficient(graph, X)))[:,2]
    ad_adar_idx = np.asarray(list(nx.adamic_adar_index(graph, X)))[:,2]
    pref_att = np.asarray(list(nx.preferential_attachment(graph, X)))[:,2]
    return list(res_alloc_index), list(jac_coef), list(ad_adar_idx), list(pref_att)

def generate_numbers(graph, X):
    num_target = np.empty((X.shape[0], 3))
    num_source = np.empty((X.shape[0], 3))
    core_num = nx.core_number(graph)
    clus = nx.clustering(graph)
    page_rank = nx.pagerank(graph)
    t1 = time()
    for i, x in enumerate(X):
        num_target[i, 0]=core_num[x[0]]
        num_target[i, 1]=clus[x[0]]
        num_target[i, 2]=page_rank[x[0]]
        num_source[i, 0]=core_num[x[1]]
        num_source[i, 1]=clus[x[1]]
        num_source[i, 2]=page_rank[x[1]]
        if i%10000==0:
            print("{}/{} completed".format(i,len(X)))
    print("number_genetration_time_taken :",time()-t1)
    return num_target, num_source

def all_oriented_vertex(graph, v):
    neighbors_in = graph.predecessors(v)
    neighbors_out = graph.successors(v)
    neighbors = list(set(neighbors_in).union(neighbors_out))
    neighbors_plus = neighbors + [v]
    subgraph = graph.subgraph(neighbors)
    subgraph_plus = graph.subgraph(neighbors_plus)
    scc = nx.number_strongly_connected_components(subgraph)
    wcc = nx.number_weakly_connected_components(subgraph)
    scc_plus = nx.number_strongly_connected_components(subgraph_plus)
    return graph.in_degree(v), graph.out_degree(v), scc, wcc, scc_plus, neighbors_in, neighbors_out, neighbors, neighbors_plus

def generate_oriented(graph, X):
    %%time
    target_feats=np.empty((X.shape[0], 5))
    source_feats=np.empty((X.shape[0], 5))
    edge_feats = np.empty((X.shape[0], 11))
    l = X.shape[0]
    for i, x in enumerate(X):
        t=x[0]
        s=x[1]
        in_d_t, out_d_t, scc_t, wcc_t, sccp_t, n_in_t, n_out_t, n_t, np_t = all_oriented_vertex(graph, t)
        in_d_s, out_d_s, scc_s, wcc_s, sccp_s, n_in_s, n_out_s, n_s, np_s = all_oriented_vertex(graph, s)
        com_in = len(set(n_in_t).intersection(n_in_s))
        com_on = len(set(n_out_t).intersection(n_out_s))
        trans_ts = len(set(n_out_t).intersection(n_in_s))
        trans_st = len(set(n_out_s).intersection(n_in_t))
        friends_measure_st=0
        friends_measure_ts=0
        for ns in n_s:
            for nt in n_t:
                if graph.has_edge(ns, nt):
                    friends_measure_st+=1
                if graph.has_edge(nt, ns):
                    friends_measure_ts+=1
        nh = list(set(n_t).union(n_s))            
        nh_plus = list(set(np_t).union(np_s))
        sub_nh = graph.subgraph(nh)
        sub_nh_plus = graph.subgraph(nh_plus)
        scc = nx.number_strongly_connected_components(sub_nh)
        wcc = nx.number_weakly_connected_components(sub_nh)
        scc_plus = nx.number_strongly_connected_components(sub_nh_plus)
        if not nx.has_path(graph, s, t):
            len_path_st=-1
        else:
            len_path_st = nx.shortest_path_length(graph, s, t)
        if not nx.has_path(graph, t, s):
            len_path_ts=-1
        else:
            len_path_ts = nx.shortest_path_length(graph, t, s)
        target_feats[i]=[in_d_t, out_d_t, scc_t, wcc_t, sccp_t]
        source_feats[i]=[in_d_s, out_d_s, scc_s, wcc_s, sccp_s]
        edge_feats[i]=[com_in, com_on, trans_ts, trans_st, friends_measure_st, friends_measure_ts, scc, wcc, scc_plus, len_path_st, len_path_ts]
        if i%10000==0:
            print(i, l)
            t2=time()
            print(t2-t1)
            t1=t2
    return target_feats, source_feats, edge_feats

In [79]:
def generate_graph_features(train, K=5):
    X = train[['Target', 'Source']].values
    y = train[['Edge']].values
    target_feats=np.empty((train.shape[0], 5))
    source_feats=np.empty((train.shape[0], 5))
    edge_feats=np.empty((train.shape[0], 11))
    np.random.seed(7)
    cv = KFold(n_splits = K, shuffle = True, random_state=1)
    for i, (idx_train, idx_val) in enumerate(cv.split(train)):
        print("CV ITERATION {}".format())
        X_train = X[idx_train]
        y_train = y[idx_train]
        X_valid = X[idx_val]
        y_valid = X[idx_val]
        print("Creating graph")
        graph = create_directed_graph(X_train, y_train)
        print("Generating vertex features")
        feat_target, feat_source, feat_edge = generate_oriented(graph, X_valid)
        target_feats[idx_val] = feat_target
        source_feats[idx_val] = feat_source
        edge_feats[idx_val]=feat_edge
   
    return target_feats, source_feats, edge_feats

def generate_graph_features_test(train, test):
    t=time()
    X = train[['Target', 'Source']].values
    y = train[['Edge']].values
    X_test = test[['Target', 'Source']].values
    X_train = X
    y_train = y
    print("Creating graph")
    graph = create_directed_graph(X_train, y_train)
    print("Generating vertex features")
    feat_target, feat_source, feat_edge = generate_oriented(graph, X_test)
    return feat_target, feat_source, feat_edge

def give_graph_features(df,feat_target, feat_source, feat_edge):
    df['Target_indegree'] = feat_target[:,0]
    df['Target_outdegree'] = feat_target[:,1]
    df['Target_scc'] = feat_target[:,2]
    df['Target_wcc'] = feat_target[:,3]
    df['Target_scc_plus'] = feat_target[:,4]
    
    df['Source_indegree'] = feat_source[:,0]
    df['Source_outdegree'] = feat_source[:,1]
    df['Source_scc'] = feat_source[:,2]
    df['Source_wcc'] = feat_source[:,3]
    df['Source_scc_plus'] = feat_source[:,4]
    
    df['Common_in'] = feat_edge[:,0]
    df['Common_out'] = feat_edge[:,1]
    df['Transitive_ts'] = feat_edge[:,2]
    df['Transitive_st'] = feat_edge[:,3]
    df['Friend_measure_st'] = feat_edge[:,4]
    df['Friend_measure_ts'] = feat_edge[:,5]
    df['Scc'] = feat_edge[:,6]
    df['Wcc'] = feat_edge[:,7]
    df['Scc_plus'] = feat_edge[:,8]
    df['Len_path_st'] = feat_edge[:,9]
    df['Len_path_ts'] = feat_edge[:,10]
    
    return df

In [66]:
tf,sf,ef = generate_graph_features_test(train_semantics,test_semantics)

Creating graph
Generating vertex features
0 32648
0.08083271980285645
10000 32648
411.36806178092957
20000 32648
421.66673278808594
30000 32648
404.797518491745


In [67]:
test_semantics = give_graph_features(test_semantics,tf,sf,ef)

In [68]:
test_semantics.to_csv('test_sem_graph1.csv',index=False)

In [87]:
test_semantics.shape

(32648, 45)

In [90]:
# train_semantics = pd.read_csv('train_sem_graph1.csv')
# train_semantics.head()

##### Features related to vertex and edges

In [100]:
def give_vertex(train,K=3):
    X = train[['Target', 'Source']].values
    y = train[['Edge']].values
    target_vertex=np.empty((train.shape[0], 3))
    source_vertex=np.empty((train.shape[0], 3))
    np.random.seed(7)
    cv = KFold(n_splits = K, shuffle = True, random_state=1)
    for i, (idx_train, idx_val) in enumerate(cv.split(train)):
        X_train = X[idx_train]
        y_train = y[idx_train]
        X_valid = X[idx_val]
        y_valid = X[idx_val]
        print("Creating graph")
        graph =create_graph(X_train, y_train)
        print("Generating vertex features")
        vertex_target, vertex_source = generate_vertex(graph, all_vertex, X_valid)
        target_vertex[idx_val] = vertex_target
        source_vertex[idx_val] = vertex_source
    return target_vertex,source_vertex 

def give_number(train,K=3):
    X = train[['Target', 'Source']].values
    y = train[['Edge']].values
    target_num=np.empty((train.shape[0], 3))
    source_num=np.empty((train.shape[0], 3))
    np.random.seed(7)
    cv = KFold(n_splits = K, shuffle = True, random_state=1)
    for i, (idx_train, idx_val) in enumerate(cv.split(train)):
        X_train = X[idx_train]
        y_train = y[idx_train]
        X_valid = X[idx_val]
        y_valid = X[idx_val]
        print("Creating graph")
        graph =create_graph(X_train, y_train)
        print("Generating Number features")
        num_target, num_source = generate_numbers(graph, X_valid)
        target_num[idx_val] = num_target
        source_num[idx_val] = num_source
    return target_num,source_num 

def give_edge(train,K=3):
    X = train[['Target', 'Source']].values
    y = train[['Edge']].values
    feat_edge=np.empty((train.shape[0], 6))
    np.random.seed(7)
    cv = KFold(n_splits = K, shuffle = True, random_state=1)
    for i, (idx_train, idx_val) in enumerate(cv.split(train)):
        X_train = X[idx_train]
        y_train = y[idx_train]
        X_valid = X[idx_val]
        y_valid = X[idx_val]
        print("Creating graph")
        graph =create_graph(X_train, y_train)
        print("Generating Edge features")
        edge_feat = generate_edge(graph,all_edges, X_valid)
        feat_edge[idx_val]=edge_feat
        
    return feat_edge 

def generate_train_features(train, K=3, fs=all_edges, len_fs=6):
    print("Generating vertex features")
    vertex_target, vertex_source = give_vertex(train,K)
    
    print("Generate numbers")
    number_target, number_source = give_number(train,K)
    print("Generate edges")
    feat_edge = give_edge(train,K)
    return (vertex_target,vertex_source,number_target,number_source,feat_edge)

def generate_test_features(train, test,fs=all_edges, len_fs=6):

    X = train[['Target', 'Source']].values
    X_test = test[['Target', 'Source']].values
    y = train[['Edge']].values

    X_train = X
    y_train = y
    print("Creating graph")
    graph = create_graph(X_train, y_train)
    print("Generating vertex features")
    vertex_target, vertex_source = generate_vertex(graph, all_vertex, X_test)
    print("Generate numbers")
    number_target, number_source = generate_numbers(graph, X_test)
    print("Generate edges")
    feat_edge = generate_edge(graph, all_edges, X_test)
    return (vertex_target,vertex_source,number_target,number_source,feat_edge)

def give_another_graph_features(df,vertex_target,vertex_source,number_target,number_source,feat_edge):
    df['Target_degree'] = vertex_target[:,0] 
    df['Target_nh_subgraph_edges'] = vertex_target[:,1]
    df['Target_nh_subgraph_edges_plus'] = vertex_target[:,2] 
    df['Source_degree'] = vertex_source[:,0] 
    df['Source_nh_subgraph_edges'] = vertex_source[:,1] 
    df['Source_nh_subgraph_edges_plus'] = vertex_source[:,2] 
    
    df['Target_core'] = number_target[:,0] 
    df['Target_clustering'] = number_target[:,1]
    df['Target_pagerank'] = number_target[:,2] 
    df['Source_core'] = number_source[:,0] 
    df['Source_clustering'] = number_source[:,1]
    df['Source_pagerank'] = number_source[:,2] 
    
    df['Common_friends'] = feat_edge[:,0]
    df['Total_friends'] = feat_edge[:,1]
    df['Friends_measure'] = feat_edge[:,2]
    df['Sub_nh_edges'] = feat_edge[:,3]
    df['Sub_nh_edges_plus'] = feat_edge[:,4]
    df['Len_path'] = feat_edge[:,5]

    return df

In [91]:
train_ftrs = generate_train_features(train_semantics)

Generating vertex features
Creating graph
Generating vertex features
0/205171 completed
10000/205171 completed
20000/205171 completed
30000/205171 completed
40000/205171 completed
50000/205171 completed
60000/205171 completed
70000/205171 completed
80000/205171 completed
90000/205171 completed
100000/205171 completed
110000/205171 completed
120000/205171 completed
130000/205171 completed
140000/205171 completed
150000/205171 completed
160000/205171 completed
170000/205171 completed
180000/205171 completed
190000/205171 completed
200000/205171 completed
VERTEX generation time taken 1647515010.7882771
Creating graph
Generating vertex features
0/205171 completed
10000/205171 completed
20000/205171 completed
30000/205171 completed
40000/205171 completed
50000/205171 completed
60000/205171 completed
70000/205171 completed
80000/205171 completed
90000/205171 completed
100000/205171 completed
110000/205171 completed
120000/205171 completed
130000/205171 completed
140000/205171 completed
15000

In [101]:
a,b,c,d,e = train_ftrs[0],train_ftrs[1],train_ftrs[2],train_ftrs[3],train_ftrs[4]
train_semantics = give_another_graph_features(train_semantics,a,b,c,d,e)

In [102]:
train_semantics.shape

(615512, 63)

In [103]:
train_semantics.to_csv('train_total.csv',index=False)

In [106]:
test_ftrs = generate_test_features(train_semantics,test_semantics)

Creating graph
Generating vertex features
0/32648 completed
10000/32648 completed
20000/32648 completed
30000/32648 completed
VERTEX generation time taken 1647523506.5052304
Generate numbers
0/32648 completed
10000/32648 completed
20000/32648 completed
30000/32648 completed
number_genetration_time_taken : 0.056294918060302734
Generate edges
0/32648 completed
10000/32648 completed
20000/32648 completed
30000/32648 completed
EDGE_genetration_time_taken : 379.2655680179596


In [107]:
a,b,c,d,e = test_ftrs[0],test_ftrs[1],test_ftrs[2],test_ftrs[3],test_ftrs[4]
test_semantics = give_another_graph_features(test_semantics,a,b,c,d,e)
print(test_semantics.shape)
test_semantics.to_csv('test_total.csv',index=False)

(32648, 63)
