In [9]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.autograd import Variable
import torch.nn.functional as F
from numpy.fft import fft, ifft
import random
import itertools
import pandas as pd
import tqdm
from random import uniform
import numpy as np
import pandas as pd

## TransE Embedding Implementation

class TransE(nn.Module):
    """TransE knowledge-graph embedding model.

    Each entity and each relation owns one ``emb_dim``-wide vector.  A triple
    (head, tail, relation) is scored with the L1 distance ``sum(|h + r - t|)``,
    so a *lower* score means a better fit.
    """

    def __init__(self,num_entity,num_rel,emb_dim):
        """Build and initialise the two embedding tables.

        Args:
            num_entity: number of rows in the entity table.
            num_rel: number of rows in the relation table.
            emb_dim: width of every embedding vector.
        """
        super(TransE,self).__init__()
        self.ent_embeddings=nn.Embedding(num_entity,emb_dim)
        self.rel_embeddings=nn.Embedding(num_rel,emb_dim)
        self.init_weights()

    def init_weights(self):
        """Fill both embedding tables with Xavier-uniform values."""
        nn.init.xavier_uniform_(self.ent_embeddings.weight.data)
        nn.init.xavier_uniform_(self.rel_embeddings.weight.data)

    def _calc(self,h,t,r):
        """Return the element-wise residual ``|h + r - t|`` (not yet reduced)."""
        return torch.abs(h + r - t)

    # margin-based loss
    def loss_func(self,p_score,n_score):
        """Margin ranking loss with margin 1 between positive and negative scores.

        Args:
            p_score: scores of the positive triples.
            n_score: scores of the negative triples (same shape as ``p_score``).

        Returns:
            The mean of ``max(0, p_score - n_score + 1)``.
        """
        criterion = nn.MarginRankingLoss(1.)
        # Target -1 asks for p_score to rank below n_score.  It is created with
        # the scores' dtype/device so the loss also works once the model has
        # been moved off the CPU.
        y = torch.tensor([-1.0], dtype=p_score.dtype, device=p_score.device)
        return criterion(p_score,n_score,y)

    def forward(self,pos_inputs,neg_inputs):
        """Compute the training loss for a batch.

        Args:
            pos_inputs: index tensor indexed as ``[:, c]``; column 0 is the
                head id, column 1 the tail id, column 2 the relation id.
            neg_inputs: index tensor indexed as ``[:, :, c]`` with the same
                column meaning; axis 1 enumerates the negative samples of
                each positive triple.

        Returns:
            Scalar margin ranking loss.
        """
        # [batch_size]
        pos_h = pos_inputs[:,0]
        pos_t = pos_inputs[:,1]
        pos_r = pos_inputs[:,2]
        # [batch_size,num_neg_sample]
        neg_h = neg_inputs[:,:,0]
        neg_t = neg_inputs[:,:,1]
        neg_r = neg_inputs[:,:,2]

        # [batch_size,embedding_size]
        pos_residual = self._calc(self.ent_embeddings(pos_h),
                                  self.ent_embeddings(pos_t),
                                  self.rel_embeddings(pos_r))
        # [batch_size,num_neg_sample,embedding_size]
        neg_residual = self._calc(self.ent_embeddings(neg_h),
                                  self.ent_embeddings(neg_t),
                                  self.rel_embeddings(neg_r))

        # L1 distance of every positive triple -> [batch_size]
        pos_score = torch.sum(pos_residual,1)
        # Average the residual over the negative samples first, then take the
        # L1 norm -> [batch_size]
        neg_score = torch.sum(torch.mean(neg_residual,1),1)
        return self.loss_func(pos_score,neg_score)

    def predict(self, predict_h, predict_t, predict_r):
        """Score triples given as numpy index arrays.

        Args:
            predict_h: integer array (any shape) of head entity ids.
            predict_t: integer array of tail entity ids, same number of
                elements as ``predict_h``.
            predict_r: integer array of relation ids, same number of elements.

        Returns:
            1-D tensor holding one score per triple (lower is better).
        """
        # Take the width from the model itself rather than from a notebook
        # global, so the method works before/without any `emb_dim` variable and
        # cannot disagree with the loaded weights.
        emb_dim = self.ent_embeddings.embedding_dim
        device = self.ent_embeddings.weight.device

        def to_index(ids):
            # nn.Embedding needs int64 indices on the same device as its weights.
            return torch.from_numpy(np.asarray(ids)).long().to(device)

        pred_h = self.ent_embeddings(to_index(predict_h))
        pred_t = self.ent_embeddings(to_index(predict_t))
        pred_r = self.rel_embeddings(to_index(predict_r))
        # Flatten whatever leading shape the ids had to one row per triple.
        p_score  = self._calc(pred_h.view(-1,emb_dim),pred_t.view(-1,emb_dim),pred_r.view(-1,emb_dim))
        p_score = torch.sum(p_score,1)
        return p_score

In [10]:
## Loading the model
# Table sizes and embedding width used to rebuild the network before the saved
# weights are copied in.
# NOTE(review): presumably these must match the shapes stored in the
# checkpoint -- confirm against the training notebook.
Total_Entities, Total_Relations = 400, 4
num_entity, num_relation = Total_Entities, Total_Relations
emb_dim = 15

transe = TransE(num_entity, num_relation, emb_dim)

# The identity map_location hands every storage back unchanged instead of
# moving it to the device it was saved from.
# NOTE(review): torch.load is pickle-based; only open checkpoints you trust.
checkpoint = torch.load(
    "./Data/transe_model.pth.tar",
    map_location=lambda storage, loc: storage,
)
transe.load_state_dict(checkpoint['state_dict'])

In [11]:
# prediction function
def transe_prediction(model,fact,num_relations=4):
    """Return the relation id that ``model`` scores lowest for an entity pair.

    Args:
        model: object exposing ``eval()`` and ``predict(h, t, r)``.  ``predict``
            is called with three ``(1, 1)`` int64 numpy arrays and its result
            must expose ``.data``.
        fact: mapping / row with the keys ``'entity_a'`` (head id) and
            ``'entity_b'`` (tail id).
        num_relations: number of candidate relation ids to try
            (``0 .. num_relations - 1``).  Defaults to 4, the value that used
            to be hard-coded.

    Returns:
        ``(score, prediction)`` where ``score`` is the ``.data`` of the lowest
        score seen and ``prediction`` is the matching relation id as a
        ``(1, 1)`` numpy array.  If no candidate yields a comparable score
        (for example ``num_relations == 0``), returns ``(float('inf'), None)``.
    """
    model.eval()
    # Wrap the ids once, outside the loop.  Re-wrapping them on every pass
    # added two array dimensions per iteration.
    head = np.array([[fact['entity_a']]], dtype=np.int64)
    tail = np.array([[fact['entity_b']]], dtype=np.int64)

    score = float('inf')   # lower is better, so start from +infinity
    prediction = None
    for k in range(num_relations):
        relation = np.array([[k]], dtype=np.int64)
        pred_score = model.predict(head,tail,relation).data
        if score > pred_score:
            prediction = relation
            score = pred_score
    return score, prediction

In [12]:
## Training / Validation / To_use Data Stats

def _evaluate_split(csv_path, split_name, model):
    """Predict a relation for every row of one CSV split and print its accuracy.

    Args:
        csv_path: path of the CSV file; rows containing NaN are dropped.  The
            file must provide the columns read here and by
            ``transe_prediction``: 'entity_a', 'entity_b' and 'relation'.
        split_name: lower-case label used in the printed report
            (e.g. 'training').
        model: model handed to ``transe_prediction``.

    Returns:
        ``(frame, acc_count, acc)``: the frame with a new 'transe_prediction'
        column, the number of rows whose prediction equals 'relation', and
        that number divided by the row count (NaN for an empty split).
    """
    frame = pd.read_csv(csv_path, delimiter=',').dropna()

    predictions = []
    for _, row in tqdm.tqdm_notebook(frame.iterrows(), total=len(frame)):
        _, predicted = transe_prediction(model, row)
        # The relation id comes back wrapped in an array; unwrap it to a plain
        # int so the column compares cleanly with 'relation'.
        predictions.append(int(np.asarray(predicted).reshape(-1)[0]))

    # Attach the whole column at once.  The previous element-wise
    # `frame[col].iloc[idx] = ...` was chained indexing, which raises
    # SettingWithCopyWarning and is not guaranteed to write into `frame`.
    frame = frame.assign(transe_prediction=predictions)

    total = frame.shape[0]
    acc_count = (frame['transe_prediction'] == frame['relation']).sum()
    acc = float(acc_count)/total if total else float('nan')

    # Printing results
    print("\n- - - - - - - - - - - - - STATISTICS ON {} DATASET - - - - - - - - - - - - - - \n".format(split_name.upper()))
    print("Total number of Green links in {} dataset : {} ".format(split_name, total))
    print(" Number of Green predicted correctly: {}".format(acc_count))
    print(" Accuracy of Green predicted correctly: {}".format(acc))
    return frame, acc_count, acc


# As before, `acc_count` / `acc` end up holding the figures of the last split.
train_data, acc_count, acc = _evaluate_split('./Data/train.csv', 'training', transe)
valid_data, acc_count, acc = _evaluate_split('./Data/valid.csv', 'validation', transe)
test_data, acc_count, acc = _evaluate_split('./Data/to_use.csv', 'to_use', transe)

A Jupyter Widget

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)




- - - - - - - - - - - - - STATISTICS ON TRAINING DATASET - - - - - - - - - - - - - - 

Total number of Green links in training dataset : 384 
 Number of Green predicted correctly: 383
 Accuracy of Green predicted correctly: 0.997395833333


A Jupyter Widget



- - - - - - - - - - - - - STATISTICS ON VALIDATION DATASET - - - - - - - - - - - - - - 

Total number of Green links in training dataset : 126 
 Number of Green predicted correctly: 88
 Accuracy of Green predicted correctly: 0.698412698413


A Jupyter Widget



- - - - - - - - - - - - - STATISTICS ON TO_USE DATASET - - - - - - - - - - - - - - 

Total number of Green links in training dataset : 381 
 Number of Green predicted correctly: 266
 Accuracy of Green predicted correctly: 0.698162729659


In [13]:
from sklearn.externals import joblib

In [None]:
# Load the model from the file
# NOTE(review): 'filename.pkl' looks like a placeholder path -- confirm the
# real artefact name before running this cell.
# NOTE(review): joblib.load is pickle-based, so loading can execute arbitrary
# code; only open files from a trusted source.
# NOTE(review): `joblib` is imported from `sklearn.externals` in this notebook;
# newer scikit-learn releases no longer ship that module -- verify the
# installed version.
clf_from_joblib = joblib.load('filename.pkl') 