In [3]:
import numpy as np
import pandas as pd
import os 
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.utils import shuffle
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

import torch.optim as optim

In [6]:
from dedup_lib.datasets import dedupDataset
from dedup_lib.models import NN

In [7]:
# Prefer the GPU when PyTorch can see one; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Using {} device'.format(device))

Using cuda device


In [8]:
# Util Function

def binary_acc(y_pred, y_ground):
    """Return the percentage of correct binary predictions, rounded to a whole number.

    Args:
        y_pred: tensor of predicted probabilities; each value is rounded with
            ``torch.round`` to obtain the predicted class (0 or 1).
        y_ground: tensor of ground-truth labels holding the same number of
            elements as ``y_pred``.

    Returns:
        A 0-dim float tensor with the accuracy in percent (0-100), rounded to
        the nearest integer. Empty inputs yield ``nan`` (0 / 0).

    Raises:
        ValueError: if the two tensors hold a different number of elements.
    """
    # Flatten both sides so that e.g. (N, 1) predictions compared with (N,)
    # labels are matched element-by-element instead of being broadcast into an
    # N x N comparison, which would silently inflate the count of "correct" hits.
    y_pred_flat = y_pred.reshape(-1)
    y_ground_flat = y_ground.reshape(-1)

    if y_pred_flat.shape[0] != y_ground_flat.shape[0]:
        raise ValueError(
            "y_pred and y_ground must have the same number of elements, got {} and {}".format(
                y_pred_flat.shape[0], y_ground_flat.shape[0]
            )
        )

    y_pred_tag = torch.round(y_pred_flat)

    correct_results_sum = (y_pred_tag == y_ground_flat).sum().float()
    acc = correct_results_sum / y_ground_flat.shape[0]
    acc = torch.round(acc * 100)

    return acc

## Data Preparation
### Remove all entries whose words do not have GloVe embeddings

### Read Glove Embedding Model

The GloVe model weights have to be downloaded from https://nlp.stanford.edu/projects/glove/ (this notebook reads `glove.6B.100d.txt`).


Update the path in `GLOVE_DIR` accordingly.

In [10]:
# Directory that holds the downloaded GloVe text files.
GLOVE_DIR="../pre-trained-models/glove.6B/"
# Embedding size; it also selects which glove.6B.<dimension>d.txt file is read.
dimension=100
# Maps each GloVe token to its float32 vector.
embeddings_index = {}

# Derive the file name from `dimension` so the two cannot drift apart.
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.{}d.txt'.format(dimension))

# `with` guarantees the file is closed even if parsing a line raises.
with open(glove_path, encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline) instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

### Read Data

In [11]:
data_path = "../data-extraction/1AllDuplicates_5NoDuplicates.csv"

# Fixed seed so the shuffle below, and therefore the later train/test split,
# is identical on every Restart & Run All.
SEED = 42

# Load only the word pair and its label, then shuffle the rows once.
df = (
    pd.read_csv(data_path, usecols=['w1', 'w2', 'isDuplicate'])
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

# Force both word columns to str so the string methods used later work on every row.
# NOTE(review): missing values become the literal string "nan" here — confirm
# the CSV has no empty w1/w2 cells, or drop them before this cast.
df['w1'] = df['w1'].astype(str)
df['w2'] = df['w2'].astype(str)

### Remove words without embeddings

In [12]:
def hasEmbedding(w, embeddings=None):
    """Tell whether a word has an entry in the embedding table.

    The word is lower-cased and stripped of surrounding whitespace before the
    lookup.

    Args:
        w: the word to look up. Non-string values (e.g. ``None`` or a float
            NaN) are reported as having no embedding.
        embeddings: optional mapping of token -> vector to search. Defaults to
            the notebook-level ``embeddings_index``.

    Returns:
        ``True`` if the normalised word is a key of the table, else ``False``.
    """
    if embeddings is None:
        embeddings = embeddings_index
    if not isinstance(w, str):
        return False
    return w.lower().strip() in embeddings

# Flag, for each side of the pair, whether the word is present in the embedding table.
for word_col in ('w1', 'w2'):
    df[word_col + '_has_embedding'] = df[word_col].apply(hasEmbedding)

In [13]:
# Keep only the pairs where both words were flagged as having an embedding,
# then discard the helper flag columns and renumber the rows from 0.
both_embedded = df['w1_has_embedding'] & df['w2_has_embedding']
df = (
    df[both_embedded]
    .drop(columns=['w1_has_embedding', 'w2_has_embedding'])
    .reset_index(drop=True)
)

In [14]:
# First 70% of the rows become the training split, the remainder the test split.
split_at = int(0.7 * len(df))
df_train = df.iloc[:split_at].reset_index(drop=True)
df_test = df.iloc[split_at:].reset_index(drop=True)

# Train PyTorch NN

In [15]:
# Wrap each split in the project dataset class, handing it the GloVe lookup table.
train_set = dedupDataset(df_train, embeddings_index)
test_set = dedupDataset(df_test, embeddings_index)

# Training iterates in mini-batches of 100 rows; the test loader's batch size is
# the row count of df_test.
train_loader = DataLoader(dataset=train_set, batch_size=100)
test_loader = DataLoader(dataset=test_set, batch_size=len(df_test))

In [16]:
# Build the network for `dimension`-sized embeddings, then move it to the selected device.
model = NN(d=dimension)
model = model.to(device)

In [17]:
# Binary cross-entropy loss on the model outputs, optimised with Adam over all model parameters.
loss_fn = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)

### Train Model

In [18]:
epochs=75

# History of the metrics logged every 5th epoch, stored as plain Python floats
# so they hold no reference to the autograd graph or to device memory.
losses = []
accur = []

# Make sure training-mode behaviour is active even if this cell is re-run
# after the evaluation cell has switched the model to eval mode.
model.train()

for i in range(epochs):
    for x_train, y_train in train_loader:
        x_train, y_train = x_train.to(device), y_train.to(device)
        # The loss is computed against a column vector of labels.
        target = y_train.reshape(-1, 1)

        # forward pass
        output = model(x_train)

        # calculate loss
        loss = loss_fn(output, target)

        # Accuracy is a monitoring metric only; keep it out of the autograd graph.
        acc = binary_acc(output.detach(), target)

        # backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if i%5 == 0:
        # NOTE: these are the metrics of the LAST batch of the epoch, not an epoch average.
        losses.append(loss.item())
        accur.append(acc.item())
        print("epoch {}\tloss : {}\t accuracy : {}".format(i,loss.item(),acc.item()))



epoch 0	loss : 0.6001343727111816	 accuracy : 86.0
epoch 5	loss : 0.3651868999004364	 accuracy : 86.0
epoch 10	loss : 0.29831990599632263	 accuracy : 86.0
epoch 15	loss : 0.23527204990386963	 accuracy : 86.0
epoch 20	loss : 0.16881319880485535	 accuracy : 98.0
epoch 25	loss : 0.11710987240076065	 accuracy : 99.0
epoch 30	loss : 0.08522161841392517	 accuracy : 99.0
epoch 35	loss : 0.06379342824220657	 accuracy : 99.0
epoch 40	loss : 0.04789978265762329	 accuracy : 99.0
epoch 45	loss : 0.03635437786579132	 accuracy : 99.0
epoch 50	loss : 0.02817995660007	 accuracy : 100.0
epoch 55	loss : 0.02230760268867016	 accuracy : 100.0
epoch 60	loss : 0.01765982061624527	 accuracy : 100.0
epoch 65	loss : 0.014102832414209843	 accuracy : 100.0
epoch 70	loss : 0.011316223070025444	 accuracy : 100.0


### Test Model

In [19]:
# Per-batch labels and model outputs, concatenated after the loop.
y_truth_list = []
y_pred_list = []

# Switch to inference mode and disable gradient tracking for evaluation.
model.eval()
with torch.no_grad():
    for x_test, y_test in test_loader:
        x_test, y_test = x_test.to(device), y_test.to(device)
        y_pred_list.append(model(x_test))
        y_truth_list.append(y_test.reshape(-1, 1))

# Merge every batch so that `output`, `y_test` and `acc` describe the WHOLE
# test set regardless of the loader's batch size (previously only the last
# batch survived the loop, which was correct only for a single-batch loader).
output = torch.cat(y_pred_list)
y_test = torch.cat(y_truth_list).reshape(-1)

acc = binary_acc(output, y_test.reshape(-1, 1))

In [20]:
# Display the test accuracy computed by the evaluation cell (binary_acc returns a rounded percentage).
acc

tensor(97., device='cuda:0')

In [21]:
# Flatten the model outputs to 1-D and round each value to its predicted class.
y_pred = output.reshape(-1).round()

In [22]:
# Bring predictions and labels back to host memory as NumPy arrays for scikit-learn.
y_pred_arr = y_pred.cpu().numpy()
y_test_arr = y_test.cpu().numpy()

### Stats about results

In [23]:
# roc_curve and classification_report are used by the result cells that follow.
from sklearn.metrics import roc_curve, classification_report
# NOTE(review): the saved output of this cell is
# "ModuleNotFoundError: No module named 'utils'", so makeCFwithStats is not
# available unless a `utils` module is on the import path — confirm where it lives.
from utils import makeCFwithStats

ModuleNotFoundError: No module named 'utils'

In [None]:
# Build scikit-learn's text report for true vs. predicted labels, then print it.
report_text = classification_report(y_test_arr, y_pred_arr)
print(report_text)

In [None]:
# A ROC curve needs continuous scores: feeding it the already-rounded 0/1
# predictions collapses the curve to a single operating point. Use the model's
# raw outputs for the evaluated samples instead.
y_score_arr = output.detach().reshape(-1).cpu().numpy()

fpr, tpr, thresholds = roc_curve(y_test_arr, y_score_arr)

plt.plot(fpr, tpr, label='model')
# Diagonal = performance of a random classifier, for reference.
plt.plot([0, 1], [0, 1], linestyle='--', label='chance')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve (test set)')
plt.legend()
plt.show()

In [None]:
# Project helper imported from `utils`; presumably renders a confusion matrix with
# summary statistics for true vs. predicted labels — TODO confirm against its definition.
makeCFwithStats(y_test_arr, y_pred_arr)