In [None]:
project_name = "reco-tut-ffr"; branch = "main"; account = "sparsh-ai"

In [None]:
!cp /content/drive/MyDrive/mykeys.py /content
import mykeys
!rm /content/mykeys.py
path = "/content/" + project_name; 
!mkdir "{path}"
%cd "{path}"
import sys; sys.path.append(path)
!git config --global user.email "nb@recohut.com"
!git config --global user.name  "colab-sparsh"
!git init
!git remote add origin https://"{mykeys.git_token}":x-oauth-basic@github.com/"{account}"/"{project_name}".git
!git pull origin "{branch}"
!git checkout main

In [None]:
%cd "/content/reco-tut-ffr"

/content/reco-tut-ffr
/content/reco-tut-ffr


In [51]:
import os
import csv
import pickle
import random
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from scipy.sparse.linalg import svds, eigs
import networkx as nx
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [24]:
data_path_silver = './data/silver'

def read_df(name):
    return pd.read_parquet(os.path.join(data_path_silver,name+'.parquet.gzip'))

X_train_pos = read_df('X_train_pos')
X_train_neg = read_df('X_train_neg')
X_test_pos = read_df('X_test_pos')
X_test_neg = read_df('X_test_neg')

X_train = X_train_pos.append(X_train_neg, ignore_index=True)
X_test = X_test_pos.append(X_test_neg, ignore_index=True)
X_train.to_csv('/content/train_joined.csv', header=False, index=False)
X_test.to_csv('/content/test_joined.csv', header=False, index=False)

read_df('y_train').to_csv('/content/y_train.csv', header=False, index=False)
read_df('y_test').to_csv('/content/y_test.csv', header=False, index=False)

In [21]:
filename = "/content/train_joined.csv"
n_train = sum(1 for line in open(filename)) #number of records in file (excludes header)
s = 100000 #desired sample size
skip_train = sorted(random.sample(range(1,n_train+1),n_train-s))
#https://stackoverflow.com/a/22259008/4084039

In [22]:
filename = "/content/test_joined.csv"
n_test = sum(1 for line in open(filename)) #number of records in file (excludes header)
s = 50000 #desired sample size
skip_test = sorted(random.sample(range(1,n_test+1),n_test-s))

In [23]:
print("Number of rows in the train data file:", n_train)
print("Number of rows we are going to elimiate in train data are",len(skip_train))
print("Number of rows in the test data file:", n_test)
print("Number of rows we are going to elimiate in test data are",len(skip_test))

Number of rows in the train data file: 15100030
Number of rows we are going to elimiate in train data are 15000030
Number of rows in the test data file: 3775008
Number of rows we are going to elimiate in test data are 3725008


In [26]:
df_final_train = pd.read_csv('/content/train_joined.csv', skiprows=skip_train, names=['source_node', 'destination_node'])
df_final_train['indicator_link'] = pd.read_csv('/content/y_train.csv', skiprows=skip_train, names=['indicator_link'])
print("Our train matrix size ",df_final_train.shape)
df_final_train.head(2)

Our train matrix size  (100001, 3)


Unnamed: 0,source_node,destination_node,indicator_link
0,273084,1505602,1
1,536820,226930,1


In [27]:
df_final_test = pd.read_csv('/content/test_joined.csv', skiprows=skip_test, names=['source_node', 'destination_node'])
df_final_test['indicator_link'] = pd.read_csv('/content/y_test.csv', skiprows=skip_test, names=['indicator_link'])
print("Our test matrix size ",df_final_test.shape)
df_final_test.head(2)

Our test matrix size  (50001, 3)


Unnamed: 0,source_node,destination_node,indicator_link
0,848424,784690,1
1,1344580,342611,1


In [28]:
X_train_pos = read_df('X_train_pos')
train_graph = nx.from_pandas_edgelist(X_train_pos,
                            source='source_node',
                            target='destination_node',
                            create_using=nx.DiGraph())

In [7]:
data_path_gold = './data/gold'
if not os.path.exists(data_path_gold):
    os.makedirs(data_path_gold)

### Similarity measures

#### Jaccard distance

\begin{equation}
j = \frac{|X\cap Y|}{|X \cup Y|} 
\end{equation}

In [11]:
def jaccard_for_followees(a,b):
    try:
        if len(set(train_graph.successors(a))) == 0  | len(set(train_graph.successors(b))) == 0:
            return 0
        sim = (len(set(train_graph.successors(a)).intersection(set(train_graph.successors(b)))))/\
                                    (len(set(train_graph.successors(a)).union(set(train_graph.successors(b)))))
    except:
        return 0
    return sim

In [12]:
def jaccard_for_followers(a,b):
    try:
        if len(set(train_graph.predecessors(a))) == 0  | len(set(g.predecessors(b))) == 0:
            return 0
        sim = (len(set(train_graph.predecessors(a)).intersection(set(train_graph.predecessors(b)))))/\
                                 (len(set(train_graph.predecessors(a)).union(set(train_graph.predecessors(b)))))
        return sim
    except:
        return 0

#### Cosine distance

\begin{equation}
CosineDistance = \frac{|X\cap Y|}{|X|\cdot|Y|} 
\end{equation}

In [13]:
def cosine_for_followees(a,b):
    try:
        if len(set(train_graph.successors(a))) == 0  | len(set(train_graph.successors(b))) == 0:
            return 0
        sim = (len(set(train_graph.successors(a)).intersection(set(train_graph.successors(b)))))/\
                                    (math.sqrt(len(set(train_graph.successors(a)))*len((set(train_graph.successors(b))))))
        return sim
    except:
        return 0

In [14]:
def cosine_for_followers(a,b):
    try:
        
        if len(set(train_graph.predecessors(a))) == 0  | len(set(train_graph.predecessors(b))) == 0:
            return 0
        sim = (len(set(train_graph.predecessors(a)).intersection(set(train_graph.predecessors(b)))))/\
                                     (math.sqrt(len(set(train_graph.predecessors(a))))*(len(set(train_graph.predecessors(b)))))
        return sim
    except:
        return 0

### Ranking measures

#### Pagerank

In [None]:
pr = nx.pagerank(train_graph, alpha=0.85)
pickle.dump(pr,open(os.path.join(data_path_gold,'page_rank.p'),'wb'))

In [43]:
print('min',pr[min(pr, key=pr.get)])
print('max',pr[max(pr, key=pr.get)])
#for imputing to nodes which are not there in Train data
print('mean_pr',float(sum(pr.values())) / len(pr))

min 1.6556497245737814e-07
max 2.7098251341935827e-05
mean_pr 5.615699699389075e-07


### Other graph features

#### Shortest path

Getting Shortest path between two nodes, and if any 2 given nodes have a direct path i.e directly connected then we are removing that edge and calculating path.

In [15]:
def compute_shortest_path_length(a,b):
    p=-1
    try:
        if train_graph.has_edge(a,b):
            train_graph.remove_edge(a,b)
            p= nx.shortest_path_length(train_graph,source=a,target=b)
            train_graph.add_edge(a,b)
        else:
            p= nx.shortest_path_length(train_graph,source=a,target=b)
        return p
    except:
        return -1

In [None]:
# unit test 1
compute_shortest_path_length(77697, 826021)

10

In [None]:
# unit test 2
compute_shortest_path_length(669354, 1635354)

-1

#### Same community

In [29]:
wcc = list(nx.weakly_connected_components(train_graph))

In [16]:
def belongs_to_same_wcc(a,b):
    index = []
    if train_graph.has_edge(b,a):
        return 1
    if train_graph.has_edge(a,b):
            for i in wcc:
                if a in i:
                    index= i
                    break
            if (b in index):
                train_graph.remove_edge(a,b)
                if compute_shortest_path_length(a,b)==-1:
                    train_graph.add_edge(a,b)
                    return 0
                else:
                    train_graph.add_edge(a,b)
                    return 1
            else:
                return 0
    else:
            for i in wcc:
                if a in i:
                    index= i
                    break
            if(b in index):
                return 1
            else:
                return 0

#### Admaic/Adar index

Adamic/Adar measures is defined as inverted sum of degrees of common neighbours for given two vertices: $A(x,y)=\sum_{u \in N(x) \cap N(y)}\frac{1}{log(|N(u)|)}$

In [17]:
def calc_adar_in(a,b):
    sum=0
    try:
        n=list(set(train_graph.successors(a)).intersection(set(train_graph.successors(b))))
        if len(n)!=0:
            for i in n:
                sum=sum+(1/np.log10(len(list(train_graph.predecessors(i)))))
            return sum
        else:
            return 0
    except:
        return 0

### Is person following back?

In [18]:
def follows_back(a,b):
    if train_graph.has_edge(b,a):
        return 1
    else:
        return 0

#### Katz centrality

Katz centrality computes the centrality for a node based on the centrality of its neighbors. It is a generalization of the eigenvector centrality. The Katz centrality for node i is: $x_i = \alpha \sum_{j} A_{ij} x_j + \beta$

In [None]:
katz = nx.katz.katz_centrality(train_graph,alpha=0.005,beta=1)
pickle.dump(katz,open(os.path.join(data_path_gold,'katz.p'),'wb'))

In [44]:
print('min',katz[min(katz, key=katz.get)])
print('max',katz[max(katz, key=katz.get)])
print('mean',float(sum(katz.values())) / len(katz))

min 0.0007313532484065916
max 0.003394554981699122
mean 0.0007483800935562018


## Adding a set of features
we will create these each of these features for both train and test data points:
- jaccard_followers
- jaccard_followees
- cosine_followers
- cosine_followees
- num_followers_s
- num_followees_s
- num_followers_d
- num_followees_d
- inter_followers
- inter_followees

In [30]:
#mapping jaccrd followers to train and test data
df_final_train['jaccard_followers'] = df_final_train.apply(lambda row:
                                        jaccard_for_followers(row['source_node'],row['destination_node']),axis=1)
df_final_test['jaccard_followers'] = df_final_test.apply(lambda row:
                                        jaccard_for_followers(row['source_node'],row['destination_node']),axis=1)

#mapping jaccrd followees to train and test data
df_final_train['jaccard_followees'] = df_final_train.apply(lambda row:
                                        jaccard_for_followees(row['source_node'],row['destination_node']),axis=1)
df_final_test['jaccard_followees'] = df_final_test.apply(lambda row:
                                        jaccard_for_followees(row['source_node'],row['destination_node']),axis=1)


#mapping jaccrd followers to train and test data
df_final_train['cosine_followers'] = df_final_train.apply(lambda row:
                                        cosine_for_followers(row['source_node'],row['destination_node']),axis=1)
df_final_test['cosine_followers'] = df_final_test.apply(lambda row:
                                        cosine_for_followers(row['source_node'],row['destination_node']),axis=1)

#mapping jaccrd followees to train and test data
df_final_train['cosine_followees'] = df_final_train.apply(lambda row:
                                        cosine_for_followees(row['source_node'],row['destination_node']),axis=1)
df_final_test['cosine_followees'] = df_final_test.apply(lambda row:
                                        cosine_for_followees(row['source_node'],row['destination_node']),axis=1)

In [31]:
def compute_features_stage1(df_final):
    #calculating no of followers followees for source and destination
    #calculating intersection of followers and followees for source and destination
    num_followers_s=[]
    num_followees_s=[]
    num_followers_d=[]
    num_followees_d=[]
    inter_followers=[]
    inter_followees=[]
    for i,row in df_final.iterrows():
        try:
            s1=set(train_graph.predecessors(row['source_node']))
            s2=set(train_graph.successors(row['source_node']))
        except:
            s1 = set()
            s2 = set()
        try:
            d1=set(train_graph.predecessors(row['destination_node']))
            d2=set(train_graph.successors(row['destination_node']))
        except:
            d1 = set()
            d2 = set()
        num_followers_s.append(len(s1))
        num_followees_s.append(len(s2))

        num_followers_d.append(len(d1))
        num_followees_d.append(len(d2))

        inter_followers.append(len(s1.intersection(d1)))
        inter_followees.append(len(s2.intersection(d2)))
    
    return num_followers_s, num_followers_d, num_followees_s, num_followees_d, inter_followers, inter_followees

In [32]:
if not os.path.isfile(os.path.join(data_path_gold, 'storage_sample_stage1.h5')):
    df_final_train['num_followers_s'], df_final_train['num_followers_d'], \
    df_final_train['num_followees_s'], df_final_train['num_followees_d'], \
    df_final_train['inter_followers'], df_final_train['inter_followees']= compute_features_stage1(df_final_train)
    
    df_final_test['num_followers_s'], df_final_test['num_followers_d'], \
    df_final_test['num_followees_s'], df_final_test['num_followees_d'], \
    df_final_test['inter_followers'], df_final_test['inter_followees']= compute_features_stage1(df_final_test)
    
    hdf = pd.HDFStore(os.path.join(data_path_gold, 'storage_sample_stage1.h5'))
    hdf.put('train_df',df_final_train, format='table', data_columns=True)
    hdf.put('test_df',df_final_test, format='table', data_columns=True)
    hdf.close()
else:
    df_final_train = pd.read_hdf(os.path.join(data_path_gold, 'storage_sample_stage1.h5'), 'train_df',mode='r')
    df_final_test = pd.read_hdf(os.path.join(data_path_gold, 'storage_sample_stage1.h5'), 'test_df',mode='r')

## Adding new set of features
we will create these each of these features for both train and test data points:
- adar index
- is following back
- belongs to same weakly connect components
- shortest path between source and destination

In [35]:
if not os.path.isfile(os.path.join(data_path_gold, 'storage_sample_stage2.h5')):
    #mapping adar index on train
    df_final_train['adar_index'] = df_final_train.apply(lambda row: calc_adar_in(row['source_node'],row['destination_node']),axis=1)
    #mapping adar index on test
    df_final_test['adar_index'] = df_final_test.apply(lambda row: calc_adar_in(row['source_node'],row['destination_node']),axis=1)

    #--------------------------------------------------------------------------------------------------------
    #mapping followback or not on train
    df_final_train['follows_back'] = df_final_train.apply(lambda row: follows_back(row['source_node'],row['destination_node']),axis=1)

    #mapping followback or not on test
    df_final_test['follows_back'] = df_final_test.apply(lambda row: follows_back(row['source_node'],row['destination_node']),axis=1)

    #--------------------------------------------------------------------------------------------------------
    #mapping same component of wcc or not on train
    df_final_train['same_comp'] = df_final_train.apply(lambda row: belongs_to_same_wcc(row['source_node'],row['destination_node']),axis=1)

    ##mapping same component of wcc or not on train
    df_final_test['same_comp'] = df_final_test.apply(lambda row: belongs_to_same_wcc(row['source_node'],row['destination_node']),axis=1)
    
    #--------------------------------------------------------------------------------------------------------
    #mapping shortest path on train 
    df_final_train['shortest_path'] = df_final_train.apply(lambda row: compute_shortest_path_length(row['source_node'],row['destination_node']),axis=1)
    #mapping shortest path on test
    df_final_test['shortest_path'] = df_final_test.apply(lambda row: compute_shortest_path_length(row['source_node'],row['destination_node']),axis=1)

    hdf = pd.HDFStore(os.path.join(data_path_gold, 'storage_sample_stage2.h5'))
    hdf.put('train_df',df_final_train, format='table', data_columns=True)
    hdf.put('test_df',df_final_test, format='table', data_columns=True)
    hdf.close()
else:
    df_final_train = pd.read_hdf(os.path.join(data_path_gold, 'storage_sample_stage2.h5'), 'train_df',mode='r')
    df_final_test = pd.read_hdf(os.path.join(data_path_gold, 'storage_sample_stage2.h5'), 'test_df',mode='r')

## Adding new set of features
we will create these each of these features for both train and test data points:
- Weight Features
    - weight of incoming edges
    - weight of outgoing edges
    - weight of incoming edges + weight of outgoing edges
    - weight of incoming edges * weight of outgoing edges
    - 2*weight of incoming edges + weight of outgoing edges
    - weight of incoming edges + 2*weight of outgoing edges
- Page Ranking of source
- Page Ranking of dest
- katz of source
- katz of dest
- hubs of source
- hubs of dest
- authorities_s of source
- authorities_s of dest

### Weight Features
In order to determine the similarity of nodes, an edge weight value was calculated between nodes. Edge weight decreases as the neighbor count goes up. Intuitively, consider one million people following a celebrity on a social network then chances are most of them never met each other or the celebrity. On the other hand, if a user has 30 contacts in his/her social network, the chances are higher that many of them know each other. credit - Graph-based Features for Supervised Link Prediction William Cukierski, Benjamin Hamner, Bo Yang

$W = \frac{1}{\sqrt{1+|X|}}$

it is directed graph so calculated Weighted in and Weighted out differently.



In [37]:
#weight for source and destination of each link
Weight_in = {}
Weight_out = {}
for i in  tqdm(train_graph.nodes()):
    s1=set(train_graph.predecessors(i))
    w_in = 1.0/(np.sqrt(1+len(s1)))
    Weight_in[i]=w_in
    
    s2=set(train_graph.successors(i))
    w_out = 1.0/(np.sqrt(1+len(s2)))
    Weight_out[i]=w_out
    
#for imputing with mean
mean_weight_in = np.mean(list(Weight_in.values()))
mean_weight_out = np.mean(list(Weight_out.values()))

HBox(children=(FloatProgress(value=0.0, max=1780722.0), HTML(value='')))




In [38]:
#mapping to pandas train
df_final_train['weight_in'] = df_final_train.destination_node.apply(lambda x: Weight_in.get(x,mean_weight_in))
df_final_train['weight_out'] = df_final_train.source_node.apply(lambda x: Weight_out.get(x,mean_weight_out))

#mapping to pandas test
df_final_test['weight_in'] = df_final_test.destination_node.apply(lambda x: Weight_in.get(x,mean_weight_in))
df_final_test['weight_out'] = df_final_test.source_node.apply(lambda x: Weight_out.get(x,mean_weight_out))

#some features engineerings on the in and out weights
df_final_train['weight_f1'] = df_final_train.weight_in + df_final_train.weight_out
df_final_train['weight_f2'] = df_final_train.weight_in * df_final_train.weight_out
df_final_train['weight_f3'] = (2*df_final_train.weight_in + 1*df_final_train.weight_out)
df_final_train['weight_f4'] = (1*df_final_train.weight_in + 2*df_final_train.weight_out)

#some features engineerings on the in and out weights
df_final_test['weight_f1'] = df_final_test.weight_in + df_final_test.weight_out
df_final_test['weight_f2'] = df_final_test.weight_in * df_final_test.weight_out
df_final_test['weight_f3'] = (2*df_final_test.weight_in + 1*df_final_test.weight_out)
df_final_test['weight_f4'] = (1*df_final_test.weight_in + 2*df_final_test.weight_out)

In [46]:
pr = pickle.load(open(os.path.join(data_path_gold,'page_rank.p'),'rb'))
mean_pr = float(sum(pr.values())) / len(pr)

katz = pickle.load(open(os.path.join(data_path_gold,'katz.p'),'rb'))
mean_katz = float(sum(katz.values())) / len(katz)

In [47]:
if not os.path.isfile(os.path.join(data_path_gold, 'storage_sample_stage3.h5')):
    
    #page rank for source and destination in Train and Test
    #if anything not there in train graph then adding mean page rank 
    df_final_train['page_rank_s'] = df_final_train.source_node.apply(lambda x:pr.get(x,mean_pr))
    df_final_train['page_rank_d'] = df_final_train.destination_node.apply(lambda x:pr.get(x,mean_pr))

    df_final_test['page_rank_s'] = df_final_test.source_node.apply(lambda x:pr.get(x,mean_pr))
    df_final_test['page_rank_d'] = df_final_test.destination_node.apply(lambda x:pr.get(x,mean_pr))
    #================================================================================

    #Katz centrality score for source and destination in Train and test
    #if anything not there in train graph then adding mean katz score
    df_final_train['katz_s'] = df_final_train.source_node.apply(lambda x: katz.get(x,mean_katz))
    df_final_train['katz_d'] = df_final_train.destination_node.apply(lambda x: katz.get(x,mean_katz))

    df_final_test['katz_s'] = df_final_test.source_node.apply(lambda x: katz.get(x,mean_katz))
    df_final_test['katz_d'] = df_final_test.destination_node.apply(lambda x: katz.get(x,mean_katz))
    #================================================================================

    hdf = pd.HDFStore(os.path.join(data_path_gold, 'storage_sample_stage3.h5'))
    hdf.put('train_df',df_final_train, format='table', data_columns=True)
    hdf.put('test_df',df_final_test, format='table', data_columns=True)
    hdf.close()
else:
    df_final_train = pd.read_hdf(os.path.join(data_path_gold, 'storage_sample_stage3.h5'), 'train_df',mode='r')
    df_final_test = pd.read_hdf(os.path.join(data_path_gold, 'storage_sample_stage3.h5'), 'test_df',mode='r')

### Adding new feature Preferential Attachement
One well-known concept in social networks is that users with many friends tend to create more connections in the future. This is due to the fact that in some social networks, like in finance, the rich get richer. We estimate how ”rich” our two vertices are by calculating the multiplication between the number of friends (|Γ(x)|) or followers each vertex has.

In [48]:
# Preferential Attachement for followers
#for train dataset
nfs=np.array(df_final_train['num_followers_s'])
nfd=np.array(df_final_train['num_followers_d'])
preferential_followers=[]
for i in range(len(nfs)):
    preferential_followers.append(nfd[i]*nfs[i])
df_final_train['prefer_Attach_followers']= preferential_followers

#for test dataset
nfs=np.array(df_final_test['num_followers_s'])
nfd=np.array(df_final_test['num_followers_d'])
preferential_followers=[]
for i in range(len(nfs)):
    preferential_followers.append(nfd[i]*nfs[i])
df_final_test['prefer_Attach_followers']= preferential_followers

# Preferential Attachement for followers
#for train dataset
nfs=np.array(df_final_train['num_followees_s'])
nfd=np.array(df_final_train['num_followees_d'])
preferential_followees=[]
for i in range(len(nfs)):
    preferential_followees.append(nfd[i]*nfs[i])
df_final_train['prefer_Attach_followees']= preferential_followees

#for test dataset
nfs=np.array(df_final_test['num_followees_s'])
nfd=np.array(df_final_test['num_followees_d'])
preferential_followees=[]
for i in range(len(nfs)):
    preferential_followees.append(nfd[i]*nfs[i])
df_final_test['prefer_Attach_followees']= preferential_followees

### SVD features for both source and destination

In [49]:
def svd(x, S):
    try:
        z = sadj_dict[x]
        return S[z]
    except:
        return [0,0,0,0,0,0]

In [52]:
#for svd features to get feature vector creating a dict node val and index in svd vector
sadj_col = sorted(train_graph.nodes())
sadj_dict = { val:idx for idx,val in enumerate(sadj_col)}

Adj = nx.adjacency_matrix(train_graph,nodelist=sorted(train_graph.nodes())).asfptype()

U, s, V = svds(Adj, k = 6)
print('Adjacency matrix Shape',Adj.shape)
print('U Shape',U.shape)
print('V Shape',V.shape)
print('s Shape',s.shape)

Adjacency matrix Shape (1780722, 1780722)
U Shape (1780722, 6)
V Shape (6, 1780722)
s Shape (6,)


In [53]:
df_final_train[['svd_u_s_1', 'svd_u_s_2','svd_u_s_3', 'svd_u_s_4', 'svd_u_s_5', 'svd_u_s_6']] = \
df_final_train.source_node.apply(lambda x: svd(x, U)).apply(pd.Series)

df_final_train[['svd_u_d_1', 'svd_u_d_2', 'svd_u_d_3', 'svd_u_d_4', 'svd_u_d_5','svd_u_d_6']] = \
df_final_train.destination_node.apply(lambda x: svd(x, U)).apply(pd.Series)
#===================================================================================================

df_final_train[['svd_v_s_1','svd_v_s_2', 'svd_v_s_3', 'svd_v_s_4', 'svd_v_s_5', 'svd_v_s_6',]] = \
df_final_train.source_node.apply(lambda x: svd(x, V.T)).apply(pd.Series)

df_final_train[['svd_v_d_1', 'svd_v_d_2', 'svd_v_d_3', 'svd_v_d_4', 'svd_v_d_5','svd_v_d_6']] = \
df_final_train.destination_node.apply(lambda x: svd(x, V.T)).apply(pd.Series)
#===================================================================================================

df_final_test[['svd_u_s_1', 'svd_u_s_2','svd_u_s_3', 'svd_u_s_4', 'svd_u_s_5', 'svd_u_s_6']] = \
df_final_test.source_node.apply(lambda x: svd(x, U)).apply(pd.Series)

df_final_test[['svd_u_d_1', 'svd_u_d_2', 'svd_u_d_3', 'svd_u_d_4', 'svd_u_d_5','svd_u_d_6']] = \
df_final_test.destination_node.apply(lambda x: svd(x, U)).apply(pd.Series)

#===================================================================================================

df_final_test[['svd_v_s_1','svd_v_s_2', 'svd_v_s_3', 'svd_v_s_4', 'svd_v_s_5', 'svd_v_s_6',]] = \
df_final_test.source_node.apply(lambda x: svd(x, V.T)).apply(pd.Series)

df_final_test[['svd_v_d_1', 'svd_v_d_2', 'svd_v_d_3', 'svd_v_d_4', 'svd_v_d_5','svd_v_d_6']] = \
df_final_test.destination_node.apply(lambda x: svd(x, V.T)).apply(pd.Series)

In [54]:
df_final_train.columns

Index(['source_node', 'destination_node', 'indicator_link',
       'jaccard_followers', 'jaccard_followees', 'cosine_followers',
       'cosine_followees', 'num_followers_s', 'num_followers_d',
       'num_followees_s', 'num_followees_d', 'inter_followers',
       'inter_followees', 'adar_index', 'follows_back', 'same_comp',
       'shortest_path', 'weight_in', 'weight_out', 'weight_f1', 'weight_f2',
       'weight_f3', 'weight_f4', 'page_rank_s', 'page_rank_d', 'katz_s',
       'katz_d', 'prefer_Attach_followers', 'prefer_Attach_followees',
       'svd_u_s_1', 'svd_u_s_2', 'svd_u_s_3', 'svd_u_s_4', 'svd_u_s_5',
       'svd_u_s_6', 'svd_u_d_1', 'svd_u_d_2', 'svd_u_d_3', 'svd_u_d_4',
       'svd_u_d_5', 'svd_u_d_6', 'svd_v_s_1', 'svd_v_s_2', 'svd_v_s_3',
       'svd_v_s_4', 'svd_v_s_5', 'svd_v_s_6', 'svd_v_d_1', 'svd_v_d_2',
       'svd_v_d_3', 'svd_v_d_4', 'svd_v_d_5', 'svd_v_d_6'],
      dtype='object')

### Adding feature svd_dot
svd_dot is Dot product between sourse node svd and destination node svd features

In [55]:
#for train datasets
s1,s2,s3,s4,s5,s6=df_final_train['svd_u_s_1'],df_final_train['svd_u_s_2'],df_final_train['svd_u_s_3'],df_final_train['svd_u_s_4'],df_final_train['svd_u_s_5'],df_final_train['svd_u_s_6']
s7,s8,s9,s10,s11,s12=df_final_train['svd_v_s_1'],df_final_train['svd_v_s_2'],df_final_train['svd_v_s_3'],df_final_train['svd_v_s_4'],df_final_train['svd_v_s_5'],df_final_train['svd_v_s_6']

d1,d2,d3,d4,d5,d6=df_final_train['svd_u_d_1'],df_final_train['svd_u_d_2'],df_final_train['svd_u_d_3'],df_final_train['svd_u_d_4'],df_final_train['svd_u_d_5'],df_final_train['svd_u_d_6']
d7,d8,d9,d10,d11,d12=df_final_train['svd_v_d_1'],df_final_train['svd_v_d_2'],df_final_train['svd_v_d_3'],df_final_train['svd_v_d_4'],df_final_train['svd_v_d_5'],df_final_train['svd_v_d_6']

In [56]:
svd_dot=[]
for i in range(len(np.array(s1))):
    a=[]
    b=[]
    a.append(np.array(s1[i]))
    a.append(np.array(s2[i]))
    a.append(np.array(s3[i]))
    a.append(np.array(s4[i]))
    a.append(np.array(s5[i]))
    a.append(np.array(s6[i]))
    a.append(np.array(s7[i]))
    a.append(np.array(s8[i]))
    a.append(np.array(s9[i]))
    a.append(np.array(s10[i]))
    a.append(np.array(s11[i]))
    a.append(np.array(s12[i]))
    b.append(np.array(d1[i]))
    b.append(np.array(d2[i]))
    b.append(np.array(d3[i]))
    b.append(np.array(d4[i]))
    b.append(np.array(d5[i]))
    b.append(np.array(d6[i]))
    b.append(np.array(d7[i]))
    b.append(np.array(d8[i]))
    b.append(np.array(d9[i]))
    b.append(np.array(d10[i]))
    b.append(np.array(d11[i]))
    b.append(np.array(d12[i]))
    svd_dot.append(np.dot(a,b))
    
df_final_train['svd_dot']=svd_dot   

In [57]:
df_final_train.head()

Unnamed: 0,source_node,destination_node,indicator_link,jaccard_followers,jaccard_followees,cosine_followers,cosine_followees,num_followers_s,num_followers_d,num_followees_s,num_followees_d,inter_followers,inter_followees,adar_index,follows_back,same_comp,shortest_path,weight_in,weight_out,weight_f1,weight_f2,weight_f3,weight_f4,page_rank_s,page_rank_d,katz_s,katz_d,prefer_Attach_followers,prefer_Attach_followees,svd_u_s_1,svd_u_s_2,svd_u_s_3,svd_u_s_4,svd_u_s_5,svd_u_s_6,svd_u_d_1,svd_u_d_2,svd_u_d_3,svd_u_d_4,svd_u_d_5,svd_u_d_6,svd_v_s_1,svd_v_s_2,svd_v_s_3,svd_v_s_4,svd_v_s_5,svd_v_s_6,svd_v_d_1,svd_v_d_2,svd_v_d_3,svd_v_d_4,svd_v_d_5,svd_v_d_6,svd_dot
0,273084,1505602,1,0,0.0,0,0,11,6,15,8,0,0,0.0,0,1,4,0.377964,0.25,0.627964,0.094491,1.005929,0.877964,2.04529e-06,3.459963e-07,0.000773,0.000756,66,120,-1.666376e-13,4.61367e-13,1.043022e-05,6.677503e-13,2.451075e-13,3.584762e-12,-2.038018e-11,5.397735e-13,1.06894e-06,1.19241e-12,2.279486e-12,3.58102e-12,-7.21611e-13,3.926207e-13,1.983688e-06,1.545078e-13,8.108382e-13,1.719699e-14,-1.355354e-12,4.675181e-13,1.128577e-06,6.616539e-14,9.771045e-13,4.159814e-14,1.338803e-11
1,536820,226930,1,0,0.0,0,0,3,8,8,8,0,0,0.0,0,1,6,0.333333,0.333333,0.666667,0.111111,1.0,1.0,2.55556e-07,7.900776e-07,0.000743,0.000761,24,64,-1.000683e-13,2.416715e-13,1.567253e-12,6.207936e-12,3.122976e-13,1.859408e-12,-5.872395e-15,3.07529e-12,1.076685e-13,6.914915e-14,2.051733e-12,9.657654e-16,-3.796453e-13,1.341099e-13,6.45503e-12,2.98853e-12,1.671777e-13,9.261643e-14,-2.839895e-13,4.529009e-12,4.620106e-13,1.273286e-13,8.783099e-14,1.604342e-14,6.07855e-24
2,53024,470337,1,0,0.1,0,0,5,6,5,6,1,1,1.0,1,1,2,0.377964,0.408248,0.786213,0.154303,1.164177,1.194461,1.711219e-06,9.779807e-07,0.00075,0.000754,30,30,-5.739548e-11,1.508032e-10,8.17449e-12,1.178467e-10,2.465994e-11,1.264218e-12,-3.662229e-13,2.117166e-11,1.974638e-11,1.799748e-11,1.650662e-11,7.129315e-13,-1.200755e-12,1.549722e-12,1.029112e-11,1.103299e-11,2.430948e-11,4.198859e-11,-2.078009e-12,3.596208e-12,3.750138e-11,1.377155e-11,2.457679e-11,4.199957e-11,8.810981e-21
3,1417605,1230379,1,0,0.588235,0,0,22,29,36,45,17,30,18.915117,1,1,2,0.182574,0.164399,0.346973,0.030015,0.529547,0.511372,4.447768e-07,5.524609e-07,0.00083,0.000861,638,1620,-9.182749e-18,4.8644700000000004e-17,1.293195e-15,1.701462e-17,3.4022420000000004e-17,-2.861893e-18,-1.092711e-17,6.139269000000001e-17,1.663824e-15,2.0991970000000003e-17,4.2537490000000005e-17,-2.691626e-18,-2.423041e-18,2.2645680000000003e-17,6.880258e-16,9.507225e-18,2.1060430000000003e-17,-3.1315699999999995e-19,-3.476597e-18,3.792054e-17,1.187459e-15,1.661522e-17,3.8200820000000003e-17,-3.1363769999999996e-19,2.975379e-30
4,896938,1135133,1,0,0.0,0,0,1,10,2,10,0,0,0.0,1,1,10,0.301511,0.57735,0.878862,0.174078,1.180373,1.456212,2.132984e-07,1.394701e-06,0.000735,0.000769,10,20,-2.077938e-15,6.781134e-15,2.228023e-15,6.800924000000001e-17,1.920533e-16,1.2104720000000001e-17,-2.163061e-15,1.936589e-14,9.044099e-14,6.999872e-14,1.347763e-15,1.197486e-16,-2.1911160000000002e-17,1.929638e-16,8.779132e-16,6.690169e-16,1.0768530000000002e-17,8.507622999999999e-19,-7.727098e-16,4.763126e-15,1.038215e-13,6.545172e-15,1.729064e-14,1.693182e-15,4.389916e-28


In [58]:
#for test dataset
s1,s2,s3,s4,s5,s6=df_final_test['svd_u_s_1'],df_final_test['svd_u_s_2'],df_final_test['svd_u_s_3'],df_final_test['svd_u_s_4'],df_final_test['svd_u_s_5'],df_final_test['svd_u_s_6']
s7,s8,s9,s10,s11,s12=df_final_test['svd_v_s_1'],df_final_test['svd_v_s_2'],df_final_test['svd_v_s_3'],df_final_test['svd_v_s_4'],df_final_test['svd_v_s_5'],df_final_test['svd_v_s_6']

d1,d2,d3,d4,d5,d6=df_final_test['svd_u_d_1'],df_final_test['svd_u_d_2'],df_final_test['svd_u_d_3'],df_final_test['svd_u_d_4'],df_final_test['svd_u_d_5'],df_final_test['svd_u_d_6']
d7,d8,d9,d10,d11,d12=df_final_test['svd_v_d_1'],df_final_test['svd_v_d_2'],df_final_test['svd_v_d_3'],df_final_test['svd_v_d_4'],df_final_test['svd_v_d_5'],df_final_test['svd_v_d_6']

In [59]:
svd_dot=[]
for i in range(len(np.array(s1))):
    a=[]
    b=[]
    a.append(np.array(s1[i]))
    a.append(np.array(s2[i]))
    a.append(np.array(s3[i]))
    a.append(np.array(s4[i]))
    a.append(np.array(s5[i]))
    a.append(np.array(s6[i]))
    a.append(np.array(s7[i]))
    a.append(np.array(s8[i]))
    a.append(np.array(s9[i]))
    a.append(np.array(s10[i]))
    a.append(np.array(s11[i]))
    a.append(np.array(s12[i]))
    b.append(np.array(d1[i]))
    b.append(np.array(d2[i]))
    b.append(np.array(d3[i]))
    b.append(np.array(d4[i]))
    b.append(np.array(d5[i]))
    b.append(np.array(d6[i]))
    b.append(np.array(d7[i]))
    b.append(np.array(d8[i]))
    b.append(np.array(d9[i]))
    b.append(np.array(d10[i]))
    b.append(np.array(d11[i]))
    b.append(np.array(d12[i]))
    svd_dot.append(np.dot(a,b))
    
df_final_test['svd_dot']=svd_dot 

In [60]:
df_final_test.head()

Unnamed: 0,source_node,destination_node,indicator_link,jaccard_followers,jaccard_followees,cosine_followers,cosine_followees,num_followers_s,num_followers_d,num_followees_s,num_followees_d,inter_followers,inter_followees,adar_index,follows_back,same_comp,shortest_path,weight_in,weight_out,weight_f1,weight_f2,weight_f3,weight_f4,page_rank_s,page_rank_d,katz_s,katz_d,prefer_Attach_followers,prefer_Attach_followees,svd_u_s_1,svd_u_s_2,svd_u_s_3,svd_u_s_4,svd_u_s_5,svd_u_s_6,svd_u_d_1,svd_u_d_2,svd_u_d_3,svd_u_d_4,svd_u_d_5,svd_u_d_6,svd_v_s_1,svd_v_s_2,svd_v_s_3,svd_v_s_4,svd_v_s_5,svd_v_s_6,svd_v_d_1,svd_v_d_2,svd_v_d_3,svd_v_d_4,svd_v_d_5,svd_v_d_6,svd_dot
0,848424,784690,1,0,0.0,0,0,6,14,6,9,1,0,0.0,1,1,2,0.258199,0.377964,0.636163,0.09759,0.894362,1.014128,6.557971e-07,2e-06,0.000754,0.000786,84,54,-9.98798e-12,2.283706e-13,1.439885e-10,6.136172e-13,4.188169e-13,5.983462e-15,-1.026186e-11,5.31656e-13,5.843125e-10,1.166046e-13,2.253352e-11,3.220432e-15,-2.14885e-13,1.883264e-13,5.904787e-11,2.701538e-12,4.341613e-13,5.535503e-14,-9.994077e-10,5.791927e-10,3.512349e-07,2.48666e-09,2.77114e-09,1.727694e-12,2.083217e-17
1,1344580,342611,1,0,0.0125,0,0,50,26,56,25,1,1,1.285097,1,1,2,0.19245,0.132453,0.324903,0.025491,0.517353,0.457357,3.740596e-06,3e-06,0.000934,0.000835,1300,1400,-1.041012e-10,2.45091e-10,1.636445e-06,1.744189e-10,2.403377e-10,1.761182e-12,-2.12738e-11,1.010533e-09,1.199321e-08,1.693617e-09,3.864187e-11,1.945994e-12,-3.144714e-11,3.702433e-10,1.013765e-07,6.055064e-11,2.34681e-09,1.941724e-11,-4.075337e-12,8.025003e-12,4.438231e-10,1.99594e-11,2.923692e-10,1.085116e-13,1.967248e-14
2,1294891,1447581,1,0,0.0,0,0,6,11,5,13,0,0,0.0,1,1,3,0.288675,0.408248,0.696923,0.117851,0.985599,1.105172,4.271048e-07,1e-06,0.000755,0.000774,66,65,-2.936992e-11,9.718611e-12,5.022905e-11,6.553552e-12,1.321321e-11,3.85484e-14,-5.969443e-14,7.834291e-11,3.476005e-12,1.887946e-13,1.554217e-11,1.678442e-15,-1.587702e-12,3.182815e-11,1.402614e-11,7.735058e-12,4.660135e-12,8.114806e-16,-2.966961e-13,3.001932e-11,5.479215e-12,1.40304e-12,2.755603e-10,1.063588e-15,3.472117e-21
3,1802128,1634556,1,0,0.037736,0,0,46,14,38,17,1,2,2.308604,1,1,2,0.258199,0.160128,0.418327,0.041345,0.676526,0.578455,2.738619e-06,1e-06,0.000921,0.000786,644,646,-2.62313e-12,7.000651e-08,4.281443e-06,4.331846e-09,3.224032e-11,8.991486e-12,-1.146981e-11,9.191004e-11,9.061251e-06,2.877213e-10,1.644661e-11,5.363448e-12,-1.485085e-11,7.087963e-09,4.194574e-05,2.988299e-08,1.145909e-11,5.002974e-10,-2.22502e-12,3.04886e-12,5.197992e-07,5.928659e-12,1.166038e-12,1.579162e-13,6.05986e-11
4,1505897,1828850,1,0,0.020408,0,0,41,19,23,27,1,1,0.574593,1,1,2,0.223607,0.204124,0.427731,0.045644,0.651338,0.631855,3.403581e-06,3e-06,0.000898,0.000809,779,621,-2.507284e-11,1.413282e-11,2.109213e-05,1.787929e-12,5.404061e-12,1.931283e-12,-4.699055e-11,3.834546e-11,2.136433e-05,1.609714e-12,5.680389e-11,1.977754e-12,-2.147159e-11,6.603848e-12,2.194823e-05,5.864301e-12,1.323759e-09,7.844954e-13,-1.830513e-11,5.91176e-12,6.59057e-05,3.453278e-12,1.231581e-11,7.614798e-13,1.897133e-09


In [61]:
hdf = pd.HDFStore(os.path.join(data_path_gold,'storage_sample_stage4.h5'))
hdf.put('train_df',df_final_train, format='table', data_columns=True)
hdf.put('test_df',df_final_test, format='table', data_columns=True)
hdf.close()

In [62]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31mdata/gold/[m

nothing added to commit but untracked files present (use "git add" to track)


In [63]:
!git add .
!git commit -m 'added gold data layer'

[main da0d36f] added gold data layer
 6 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 data/gold/katz.p
 create mode 100644 data/gold/page_rank.p
 create mode 100644 data/gold/storage_sample_stage1.h5
 create mode 100644 data/gold/storage_sample_stage2.h5
 create mode 100644 data/gold/storage_sample_stage3.h5
 create mode 100644 data/gold/storage_sample_stage4.h5


In [64]:
!git push origin main

Counting objects: 10, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (10/10), done.
Writing objects: 100% (10/10), 102.44 MiB | 5.19 MiB/s, done.
Total 10 (delta 4), reused 0 (delta 0)
remote: Resolving deltas: 100% (4/4), completed with 1 local object.[K
To https://github.com/sparsh-ai/reco-tut-ffr.git
   9ebae1e..da0d36f  main -> main
