In [1]:
%matplotlib inline
import os
import sys
import matplotlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import cmath
import pickle
import time
from collections import defaultdict

from datetime import datetime, date

from sktensor import dtensor, cp_als
# matplotlib.style.use('ggplot')

from sklearn.utils.extmath import randomized_svd
from sklearn.metrics import precision_recall_curve, average_precision_score
from sktensor import dtensor, cp_als

matplotlib.style.use('ggplot')

### Create the tensor

In [2]:
df = pickle.load( open( "dblp_inproceeding_10.p", "rb" ) )

### Choose only 10 years

In [3]:
df_train = df[df.year>=1991]
df_train = df[df.year<=2000]

In [4]:
df_test = df[df.year==2001]

In [5]:
df_train.shape, df_test.shape

((572596, 3), (64233, 3))

### More than 20 publications

In [6]:
author_list = pd.unique(df_train.iloc[:,0])

In [7]:
# Get the indexes of each author

temp = time.time()
count = 0

authors_indexes = [(element, index) for index, element in enumerate(df_train.author)]
author_index = defaultdict(list)

for k, v in authors_indexes:
    if count % 10000 == 1:
        print('%d/%d lines treated - %.0f seconds elapsed') %(count, len(author_list), time.time() - temp)
  
    author_index[k].append(v)
    
    count += 1

1/53454 lines treated - 0 seconds elapsed
10001/53454 lines treated - 0 seconds elapsed
20001/53454 lines treated - 0 seconds elapsed
30001/53454 lines treated - 1 seconds elapsed
40001/53454 lines treated - 1 seconds elapsed
50001/53454 lines treated - 1 seconds elapsed
60001/53454 lines treated - 1 seconds elapsed
70001/53454 lines treated - 1 seconds elapsed
80001/53454 lines treated - 1 seconds elapsed
90001/53454 lines treated - 1 seconds elapsed
100001/53454 lines treated - 1 seconds elapsed
110001/53454 lines treated - 1 seconds elapsed
120001/53454 lines treated - 1 seconds elapsed
130001/53454 lines treated - 1 seconds elapsed
140001/53454 lines treated - 1 seconds elapsed
150001/53454 lines treated - 1 seconds elapsed
160001/53454 lines treated - 1 seconds elapsed
170001/53454 lines treated - 1 seconds elapsed
180001/53454 lines treated - 1 seconds elapsed
190001/53454 lines treated - 1 seconds elapsed
200001/53454 lines treated - 1 seconds elapsed
210001/53454 lines treated 

In [8]:
# Keep the authors with more than 20 publications
author_index_sorted = []

for author in author_list : 
    if len(author_index[author]) > 20 :
        author_index_sorted += author_index[author]

In [9]:
df_train = df_train[df_train.index.isin(author_index_sorted)]

In [10]:
df_test = df_test[df_test.author.isin(df_train.author)]
df_test = df_test[df_test.crossref.isin(df_train.crossref)]

In [11]:
df_train.shape, df_test.shape

((122244, 3), (4821, 3))

In [12]:
del df

### Create the training tensor

In [13]:
author_list = pd.unique(df_train.iloc[:, 0])
conf_list = pd.unique(df_train.iloc[:, 2])
year_list = pd.unique(df_train.iloc[:, 1])

year_list.sort()

author_dic = {element : index for index, element in enumerate(author_list)}
conf_dic =   {element : index for index, element in enumerate(conf_list)}
year_dic =   {element : index for index, element in enumerate(year_list)}

T_train = np.zeros((len(author_list), len(conf_list), len(year_list)))
T_train = dtensor(T_train)

In [14]:
len(author_list)

39827

In [15]:
# Going through the dataframe
t_temp = time.time()
i = 0

L = len(zip(df_train.author, df_train.crossref, df_train.year))

for a, c, y in zip(df_train.author, df_train.crossref, df_train.year):
    if i % 100000 == 1:
        print('%d/%d (df.author, df.crossref, df.year) treated - %.0f seconds elapsed') %(i, L, time.time() - t_temp)
        
    # Finding the corresponding index in the tensor
    a_ind = author_dic[a]
    c_ind = conf_dic[c]
    y_ind = year_dic[y]
    
    # Modifying the tensor value for the tuple (i_ind, j_ind, k_ind)
    T_train[a_ind, c_ind, y_ind] += 1
    
    i +=1

print time.time()-t_temp    

1/122244 (df.author, df.crossref, df.year) treated - 0 seconds elapsed
100001/122244 (df.author, df.crossref, df.year) treated - 1 seconds elapsed
1.00237202644


In [16]:
# Logarithmic Transformation
nonz = T_train.nonzero()
for ind in range(len(nonz[0])):
    i_ind = nonz[0][ind]
    j_ind = nonz[1][ind] 
    k_ind = nonz[2][ind]
    
    T_train[i_ind, j_ind, k_ind] = 1 + np.log(T_train[i_ind, j_ind, k_ind]) 

In [17]:
del year_dic, df_train

### Create the test tensor

In [18]:
author_list_train = author_list
conf_list_train = conf_list
author_dic_train = author_dic
conf_dic_train = conf_dic

In [19]:
author_list = pd.unique(df_test.iloc[:, 0])
conf_list = pd.unique(df_test.iloc[:, 2])

author_dic = {element : index for index, element in enumerate(author_list)}
conf_dic =   {element : index for index, element in enumerate(conf_list)}

T_test = np.zeros((len(author_list_train), len(conf_list_train)))

In [20]:
# Going through the dataframe
t_temp = time.time()
i = 0

L = len(zip(df_test.author, df_test.crossref))

for a, c in zip(df_test.author, df_test.crossref):
    if i % 1000 == 1:
        print('%d/%d (df.author, df.crossref, df.year) treated - %.0f seconds elapsed') %(i, L, time.time() - t_temp)

    # Finding the corresponding index in the tensor
    a_ind = author_dic_train[a]
    c_ind = conf_dic_train[c]

    # Modifying the tensor value for the tuple (i_ind, j_ind, k_ind)
    T_test[a_ind, c_ind] = 1

    i +=1

print time.time()-t_temp    

1/4821 (df.author, df.crossref, df.year) treated - 0 seconds elapsed
1001/4821 (df.author, df.crossref, df.year) treated - 0 seconds elapsed
2001/4821 (df.author, df.crossref, df.year) treated - 0 seconds elapsed
3001/4821 (df.author, df.crossref, df.year) treated - 0 seconds elapsed
4001/4821 (df.author, df.crossref, df.year) treated - 0 seconds elapsed
0.261805057526


In [21]:
del author_list_train, conf_list_train, author_list, conf_list, author_dic, conf_dic, df_test

### Collapsed weighted tensor

In [22]:
CT = np.zeros((T_train[:,:,0].shape[0], T_train[:,:,0].shape[1]))
theta = 0.2
K = T_train.shape[2]

for k in range(T_train.shape[2]):
    CT = CT + ((1-theta)**(K - k))*T_train[:,:,k]

In [23]:
# Create a np.array matrix instead of the sklearn tensor
C = np.zeros((T_train[:,:,0].shape[0], T_train[:,:,0].shape[1]))

In [24]:
for i in range(CT.shape[0]) :
    for j in range(CT.shape[1]) :
        C[i,j] = CT[i,j]

In [25]:
CT = C
del C

In [26]:
CT.shape

(39827, 1457)

### Adjacency matrix

In [27]:
above_adj_matrix = np.concatenate((np.zeros((CT.shape[0],CT.shape[0])),CT), axis = 1)

In [28]:
above_adj_matrix.shape

(39827, 41284)

In [29]:
below_adj_matrix = np.concatenate((CT.transpose(), np.zeros((CT.shape[1],CT.shape[1]))), axis = 1)

In [30]:
adj_matrix = np.concatenate((above_adj_matrix, below_adj_matrix), axis = 0)

In [31]:
adj_matrix.shape

(41284, 41284)

In [32]:
del above_adj_matrix, below_adj_matrix

### Adjacency matrix

In [33]:
U, Sigma, Vt = randomized_svd(CT, n_components=15, n_iter=5, random_state=None)

In [34]:
gamma = []

In [35]:
beta = 0.001

In [36]:
for i in range(len(Sigma)) :
    gamma += [1/(1-beta*Sigma[i]) - 1]

In [37]:
CT_SVD = np.dot(np.dot(U,np.diag(gamma)),Vt)

### Katz score

In [38]:
S = np.concatenate((np.zeros((CT_SVD.shape[0], CT_SVD.shape[0])), CT_SVD), axis = 1)

In [39]:
S = np.concatenate((S,np.concatenate((CT_SVD.transpose(), np.zeros((CT_SVD.shape[1],CT_SVD.shape[1]))), axis = 1)), axis = 0)

In [40]:
S

array([[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          1.59264018e-08,   2.85687424e-09,   2.56362229e-09],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          5.60121077e-09,   6.71010489e-09,   1.26121270e-08],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          7.44229876e-09,   8.91796242e-09,   8.13899854e-09],
       ..., 
       [  1.59264018e-08,   5.60121077e-09,   7.44229876e-09, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  2.85687424e-09,   6.71010489e-09,   8.91796242e-09, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  2.56362229e-09,   1.26121270e-08,   8.13899854e-09, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00]])

In [41]:
S.shape

(41284, 41284)

In [42]:
S.min()

-0.00013575868010089808

In [43]:
S.max()

0.0056649002616706284

In [44]:
#Converting S into S_pred, a binary matrix
#thres = 1  # If the score above thres, we predict an email

#S_pred = (1*(S >= thres))
#S_test = (1*np.array(T_test[:, :, 0] >= 1))

### Precision-Recall

In [None]:
#beta_list = [0.001, 0.01, 0.1, 0.3, 0.5, 0.8]
beta= 0.001

# Katz Scoring
y_score = 1*(S >= 1).flatten()
y_test = T_test.flatten()

# Precision-Recall
precision, recall, _ = precision_recall_curve(y_test,y_score)
average_precision = average_precision_score(y_test, y_score)    

# Plotting the results
plt.clf()
plt.figure(figsize=(16, 10))

plt.plot(recall, precision, label='AUC={0:0.2f} Beta=%f' .format(average_precision) %(beta))

plt.axhline(mail_rate, label='random', color='black')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.legend(loc="upper right")
plt.title('Precision-Recall for Katz \nCollapsed Weighted Tensor')
plt.savefig('cp_roc_cut_%d.png' %(tau))     

### TKatz

In [None]:
# Rank of the adjacency matrix
#np.linalg.matrix_rank(CT)

In [None]:
# Eigendecomposition
#D, W = np.linalg.eig(Adj_matrix)

In [None]:
#print W.shape, D.shape

In [None]:
# Check orthogonality
#P = np.dot(np.linalg.inv(W),W)
#np.allclose(P, np.eye(len(P)))

In [None]:
# Only real numbers
#count = 0
#for i in D :
#    if i.imag == 0 :
#        count += 1

#count

In [None]:
#count = 0
#for i in range(1,len(D)) :
#    if abs(D[i-1]) >= abs(D[i]) :
#        count += 1

#print count

In [None]:
#for value in range(len(D)) :
#    D[value] = D[value].real

In [None]:
#alpha = []
#for i in D :
#    alpha += [1./(1-i)] 

In [None]:
#TKatzScore = np.dot(W,np.dot(np.diag(alpha),W.transpose()))

In [None]:
#for value in range(len(TKatzScore)) :
#    TKatzScore[value] = TKatzScore[value].real

In [None]:
# Only real numbers
#count = 0
#for i in range(TKatzScore.shape[0]) :
#    for j in range(TKatzScore.shape[1]) :
#        if TKatzScore[i,j].imag == 0 :
#            count += 1

#count

In [None]:
#TKatzScore.shape[0]*TKatzScore.shape[1]

In [None]:
#TKatzScore.min()

In [None]:
#TKatzScore.max()

In [None]:
#beta_list = [0.001, 0.01, 0.1, 0.3, 0.5, 0.8]
#precision = dict()
#recall = dict()
#average_precision = dict()

#for beta in beta_list :
#    alpha = []
#    for i in D :
#        alpha += [1./(1-beta*i) - 1] 
#    # TKatz Scoring
#    S = np.dot(W,np.dot(np.diag(alpha),W.transpose()))
#    y_score = S.flatten()
#    y_test = S_test.flatten()    
#    # Precision-Recall
#    precision[beta], recall[beta], _ = precision_recall_curve(y_test,
#                                                        y_score)
#    average_precision[beta] = average_precision_score(y_test, y_score)    

#    # Plotting the results
#    plt.clf()
#    plt.figure(figsize=(16, 10))
    
#for beta in beta_list :
#    plt.plot(recall[beta], precision[beta], label='AUC={0:0.2f} Beta=%f' .format(average_precision[beta]) %(beta))

#plt.axhline(mail_rate, label='random', color='black')
#plt.xlabel('Recall')
#plt.ylabel('Precision')
#plt.ylim([0.0, 1.05])
#plt.xlim([0.0, 1.0])
#plt.legend(loc="upper right")
#plt.title('Precision-Recall for TKatz \nCollapsed Weighted Tensor')
#plt.savefig('cp_roc_cut_%d.png' %(tau))     