In [21]:
import sklearn as sk
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import norm
from sklearn.metrics.pairwise import cosine_similarity as cosine
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
def diff_gaussian(year1, year2, var=3.408):
    return norm(0,var).pdf(abs(year2-year1))*(1/norm(0,var).pdf(0))

In [3]:
data = pd.read_csv('~/datamining/DBLP_labeled_data .tsv', sep='\t')

In [4]:
data.head()

Unnamed: 0,Author name,Unique Author ID,Paper ID,Coauthors,Year,Venue,Title
0,ajay k. gupta,agupta-10,1543,ajay k. gupta|kurt maly|irwin b. levinstein|ra...,1996,ifip world conference on it tools,pme privileg mangag enforc system secur distri...
1,ajay k. gupta,agupta-10,2598,ajay k. gupta|kurt maly|hussein m. abdel-wahab...,1997,webnet,coprocess java base environ collabor process m...
2,ajay k. gupta,agupta-10,2613,ajay k. gupta|kurt maly|bjorn kvande|irwin b. ...,1996,wetice,privileg manag enforc system distribut resourc...
3,ajay k. gupta,agupta-10,2610,ajay k. gupta|hussein m. abdel-wahab|kurt maly...,1996,wetice,softwar architectur interprocess commun iri in...
4,ajay k. gupta,agupta-10,2872,ajay k. gupta|kurt maly|hussein m. abdel-wahab...,1995,computer networks and isdn systems,mosaic xtv coreview


In [5]:
title_list = [i for i in data["Title"]]

In [6]:
vectorizer = TfidfVectorizer()

In [7]:
title_vectors = vectorizer.fit_transform(title_list)

In [8]:
print(title_vectors[0])

  (0, 2846)	0.26054009235401854
  (0, 2665)	0.2805216177531512
  (0, 884)	0.20430256436375643
  (0, 2802)	0.2774081295584946
  (0, 3113)	0.1524187244889741
  (0, 999)	0.40088171371412695
  (0, 1812)	0.43504035700237015
  (0, 2439)	0.41505883160323753
  (0, 2357)	0.43504035700237015


In [9]:
venue_list = [i for i in data["Venue"]]

In [10]:
venue_list[0]

'ifip world conference on it tools'

In [11]:
venue_vectors = vectorizer.fit_transform(venue_list)

In [12]:
print(venue_vectors[0])

  (0, 902)	0.3976181502067804
  (0, 525)	0.5208270698540904
  (0, 672)	0.2693809335040437
  (0, 159)	0.271173479694462
  (0, 975)	0.46843142967163187
  (0, 445)	0.4528905190043382


In [13]:
author_list = [i for i in data["Coauthors"]]

In [14]:
author_list = [i.replace('|',' ') for i in author_list]

In [15]:
author_list = [i.replace('.','') for i in author_list]

In [16]:
author_list[0]

'ajay k gupta kurt maly irwin b levinstein ravi mukkamala bjorn kvande s nanjangud margrethe h olson roy whitney rita chambers'

In [17]:
author_vectors = vectorizer.fit_transform(author_list)

In [18]:
X = []
y = []
for i in range(1000):
    if i%100==0:
            print(i)
    for j in range(i):
        if data.loc[i,"Unique Author ID"] == data.loc[j, "Unique Author ID"]:
            y.append(1)
        else:
            y.append(0)
        similarity_vector = []
        similarity_vector.append(cosine(author_vectors[i], author_vectors[j])[0][0])
        similarity_vector.append(cosine(title_vectors[i], title_vectors[j])[0][0])
        similarity_vector.append(cosine(venue_vectors[i], venue_vectors[j])[0][0])
        similarity_vector.append(diff_gaussian(data["Year"][i], data["Year"][j]))
        X.append(similarity_vector)

0
100
200
300
400
500
600
700
800
900


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)
clf = LR(solver='liblinear', penalty='l1', multi_class='ovr', n_jobs=2)
clf.fit(X_train, y_train)
y_ans = clf.predict(X_test)
print(classification_report(y_test, y_ans))

  " = {}.".format(effective_n_jobs(self.n_jobs)))


              precision    recall  f1-score   support

           0       0.98      0.99      0.99     96385
           1       0.76      0.49      0.59      3515

    accuracy                           0.98     99900
   macro avg       0.87      0.74      0.79     99900
weighted avg       0.97      0.98      0.97     99900



In [36]:
X_0 = [[i[1],i[2],i[3]] for i in X]

In [38]:
X_1 = [[i[0],i[2],i[3]] for i in X]

In [39]:
X_2 = [[i[0],i[1],i[3]] for i in X]

In [40]:
X_3 = [[i[0],i[1],i[2]] for i in X]

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X_0, y, test_size=0.2, random_state=51)
clf = LR(solver='liblinear', penalty='l1', multi_class='ovr', n_jobs=2)
clf.fit(X_train, y_train)
y_ans = clf.predict(X_test)
print(classification_report(y_test, y_ans))

  " = {}.".format(effective_n_jobs(self.n_jobs)))


              precision    recall  f1-score   support

           0       0.97      1.00      0.98     96385
           1       0.65      0.06      0.10      3515

    accuracy                           0.97     99900
   macro avg       0.81      0.53      0.54     99900
weighted avg       0.96      0.97      0.95     99900



In [42]:
X_train, X_test, y_train, y_test = train_test_split(X_1, y, test_size=0.2, random_state=51)
clf = LR(solver='liblinear', penalty='l1', multi_class='ovr', n_jobs=2)
clf.fit(X_train, y_train)
y_ans = clf.predict(X_test)
print(classification_report(y_test, y_ans))

  " = {}.".format(effective_n_jobs(self.n_jobs)))


              precision    recall  f1-score   support

           0       0.98      0.99      0.99     96385
           1       0.74      0.47      0.58      3515

    accuracy                           0.98     99900
   macro avg       0.86      0.73      0.78     99900
weighted avg       0.97      0.98      0.97     99900



In [43]:
X_train, X_test, y_train, y_test = train_test_split(X_2, y, test_size=0.2, random_state=51)
clf = LR(solver='liblinear', penalty='l1', multi_class='ovr', n_jobs=2)
clf.fit(X_train, y_train)
y_ans = clf.predict(X_test)
print(classification_report(y_test, y_ans))

  " = {}.".format(effective_n_jobs(self.n_jobs)))


              precision    recall  f1-score   support

           0       0.98      0.99      0.99     96385
           1       0.76      0.49      0.59      3515

    accuracy                           0.98     99900
   macro avg       0.87      0.74      0.79     99900
weighted avg       0.97      0.98      0.97     99900



In [44]:
X_train, X_test, y_train, y_test = train_test_split(X_3, y, test_size=0.2, random_state=51)
clf = LR(solver='liblinear', penalty='l1', multi_class='ovr', n_jobs=2)
clf.fit(X_train, y_train)
y_ans = clf.predict(X_test)
print(classification_report(y_test, y_ans))

  " = {}.".format(effective_n_jobs(self.n_jobs)))


              precision    recall  f1-score   support

           0       0.98      0.99      0.99     96385
           1       0.76      0.49      0.59      3515

    accuracy                           0.98     99900
   macro avg       0.87      0.74      0.79     99900
weighted avg       0.97      0.98      0.97     99900



In [85]:
def check_venue(u,v):
	if data.loc[u,"Venue"]==data.loc[v,"Venue"]:
		return True
	return False

def check_coauthors(u,v):
	u_co=data.loc[u,"Coauthors"]
	v_co=data.loc[v,"Coauthors"]
	for i in u_co:
		if i not in v_co:
			return False
	return True

def check_name(u,v):
	u_author=data.loc[u,"Author name"]
	v_author=data.loc[v,"Author name"]
	return u_author==v_author

def isPositive(u,v):
    if check_venue(u,v) and check_coauthors(u,v) and check_name(u,v):
        return True
    return False

def Find(par,u):
    if par[u]==u:
        return u
    par[u]=Find(par,par[u])
    return par[u]

def Union(par,u,v):
	u=Find(par,u)
	v=Find(par,v)
	if u!=v:
		par[u]=v

new_y=[]
par=[i for i in range(1000)]
for i in range(1000):
    for j in range(i):
        if isPositive(i,j):
            Union(par,i,j)

neg=[]
index=0
n=0
for i in range(1000):
    for j in range(i):
        if Find(par,i)==Find(par,j):
            new_y.append(1)
        else:
            n+=1
            new_y.append(0)

p=len(new_y)-n
extra=0
if n>p:
    extra=n-p

In [89]:
#print(p,n)
#print(extra)
X_new = []
y_new=[]
print(len(X))
for i in range(len(X)):
    if new_y[i]==0 and extra!=0:
        extra-=1
    elif new_y[i]==0:
        y_new.append(0)
        X_new.append(X[i])
    else:
        y_new.append(1)
        X_new.append(X[i])

249949


In [80]:
y_new.count(1),y_new.count(0)

(397, 0)

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, new_y, test_size=0.2, random_state=51)
clf = LR(solver='liblinear', penalty='l1', multi_class='ovr')
clf.fit(X_train, y_train)
y_ans = clf.predict(X_test)
print(classification_report(y_test, y_ans))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     99808
           1       0.64      0.48      0.55        92

    accuracy                           1.00     99900
   macro avg       0.82      0.74      0.77     99900
weighted avg       1.00      1.00      1.00     99900



In [73]:
for i in range(1000):
    if i%100==0:
            print(i)
    for j in range(i):
        if data.loc[i,"Unique Author ID"] == data.loc[j, "Unique Author ID"]:
            y.append(1)
        else:
            y.append(0)

0.9992692692692693