## Training Logistic Regression Model Using Trained Embeddings

In [1]:
# Loading model
def get_model():
    import pickle
    import os
    os.chdir(r'/home/qxl/')
    
    # Loading trained embeddings
    dim = 32
    length = 40
    method = 'deepwalk'
    directed = False
    emb = f'embedding/dim{dim}-len{length}/{"directed_" if directed else ""}{method}_full.pkl'
    model = pickle.load(open(emb, 'rb'))
    return model

In [2]:
# Generating input vectors
def get_datavec(model):
    
    mapping = model.graph.name2id
    context_emb = model.solver.context_embeddings
    vertex_emb = model.solver.vertex_embeddings
    
    # Reading file and generating input vectors
    file_name = r'data/test2.txt'
    X, Y = [], []
    with open(file_name, 'r') as f:
        for line in f:
            h, t, y = line.split()
            X_vec = list(vertex_emb[mapping[h]])
            X_vec.extend(context_emb[mapping[t]])
            X.append(X_vec)
            Y.append(int(y))
    return X, Y

In [3]:
# Generating test vectors
def get_test_datavec(model):

    mapping = model.graph.name2id
    context_emb = model.solver.context_embeddings
    vertex_emb = model.solver.vertex_embeddings
    
    # Reading file and generating input vectors
    file_name = r'data/test-public.txt'
    X  = []
    with open(file_name, 'r') as f:
        next(f)
        for line in f:
            _, h, t = line.split()
            X_vec = list(vertex_emb[mapping[h]])
            X_vec.extend(context_emb[mapping[t]])
            X.append(X_vec)
    return X

In [4]:
from sklearn.model_selection import train_test_split
import pickle
import pandas as pd
from sklearn.metrics import classification_report

# Loading full data
model = get_model()
X, Y =  get_datavec(model)
# Spliting data into training set and testing set
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size = 0.2)

In [5]:
# Changing data to ndarray format
import numpy as np
Xtrain = np.array(Xtrain)
Xtest = np.array(Xtest)

### Logistic Regression Model

In [6]:
# Setting LR model and training the model
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(solver = 'sag')
log_reg.fit(Xtrain, Ytrain)

LogisticRegression(solver='sag')

In [7]:
# Making prediction on the testing set
prediction = log_reg.predict(Xtest)

In [12]:
# Evaluating the model
report = classification_report(Ytest, prediction, zero_division = 0, digits = 4)

from sklearn.metrics import roc_curve, auc, roc_auc_score
false_positive_rate, true_positive_rate, thresholds = roc_curve(Ytest, prediction)
auc = auc(false_positive_rate, true_positive_rate)

report += ('auc = ' + str(auc))
print(report)

              precision    recall  f1-score   support

           0     0.9986    0.9965    0.9976    734726
           1     0.9969    0.9988    0.9979    831841

    accuracy                         0.9977   1566567
   macro avg     0.9978    0.9977    0.9977   1566567
weighted avg     0.9977    0.9977    0.9977   1566567
auc = 0.9976653860278812


In [13]:
# Saving report
results_path = 'eval/results/Logistic_Reg.txt'
with open(results_path, 'w') as results_f:
    results_f.write(report)
results_f.close()

In [14]:
# Saving LR classifier
pickle.dump(log_reg, open('eval/classifiers/Logitic_Reg.pkl', 'wb'))

### Generating Prediction for Kaggle Competition

In [19]:
import math
# Making prediction for Kaggle test set
Kaggle_test = np.array(get_test_datavec(model))
# Making prediction on the testing set
Kaggle_prediction = log_reg.decision_function(Kaggle_test)
est_prob = []
for i in range(len(Kaggle_prediction)):
    est_prob.append(1/(1 + math.exp(-Kaggle_prediction[i])))

In [21]:
# Saving the output
out = f'output/logistic_reg_score.csv'
with open(out, 'w') as o:
    o.write('Id,Predicted\n')
    for i, s in enumerate(est_prob):
        o.write(f'{i+1},{s}\n')
print('Write output to', out)

Write output to output/logistic_reg_score.csv
