In [None]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np
from matplotlib.pyplot import plot as plt


torch.manual_seed(1)
import cleaningtool as ct
from helpers import *

from model import *
from data import *
import sys
import nltk
from nltk.corpus import stopwords

In [None]:
# Toggle for the text-cleaning step applied after the splits are merged.
cleaning = True
## Define paths
DATA_FOLDER = './data_2/'
TRAIN_PATH = DATA_FOLDER + 'train.tsv'
TEST_PATH = DATA_FOLDER + 'test.tsv'
VALID_PATH = DATA_FOLDER + 'valid.tsv'

train_data = load_data_(TRAIN_PATH)
test_data = load_data_(TEST_PATH)
valid_data = load_data_(VALID_PATH)

# Keep only the columns used downstream. The explicit .copy() makes every
# split an independent frame, so the in-place edits performed by later cells
# (fillna, new column, rename) do not act on a selection of the loaded frame
# and do not raise SettingWithCopyWarning.
train_data = train_data[["statement","justification", "label"]].copy()
test_data = test_data[["statement","justification", "label"]].copy()
valid_data = valid_data[["statement","justification", "label"]].copy()

### Replacing NaN with an empty string

In [None]:
# Missing values become empty strings, in place, in each of the three splits.
for _split_frame in (train_data, test_data, valid_data):
    _split_frame.fillna('', inplace=True)

## Merging statement and justification

In [None]:
# Build the model input text: statement, a single space, then justification.
for _split_frame in (train_data, test_data, valid_data):
    _merged_text = _split_frame['statement'] + ' ' + _split_frame['justification']
    _split_frame['input'] = _merged_text

In [None]:
# From here on only the merged text and its label are needed.
train_data, test_data, valid_data = [
    _split_frame[["input", "label"]]
    for _split_frame in (train_data, test_data, valid_data)
]

In [None]:
# The merged text takes over the "statement" column name in every split.
_input_to_statement = {'input': 'statement'}
for _split_frame in (train_data, test_data, valid_data):
    _split_frame.rename(columns=_input_to_statement, inplace=True)

In [None]:
# Stack train, test and validation into one frame. reset_index() is called
# without drop, so the previous per-split index is kept as an "index" column.
df_raw = pd.concat(
    [train_data, test_data, valid_data],
    axis=0,
    sort=False,
).reset_index()

if cleaning == True:
    # Show the first statement before and after cleaning as a quick sanity check.
    print("before :-",df_raw["statement"][0])
    train_data, test_data, valid_data = [
        clean_data(_split_frame, "statement")
        for _split_frame in (train_data, test_data, valid_data)
    ]
    df_raw = clean_data(df_raw,'statement')
    print()
    print("after :-", df_raw["statement"][0])
    

# sentence to words

In [None]:
# sent_words is called twice per frame: once as-is and once with label=True.
# NOTE(review): presumably the first call yields the tokenised text and the
# second the labels; confirm against the helper's definition.
df_raw_x = sent_words(df_raw)
df_raw_y = sent_words(df_raw, label=True)

x_train = sent_words(train_data)
y_train = sent_words(train_data, label=True)
x_val = sent_words(valid_data)
y_val = sent_words(valid_data, label=True)

x_test = sent_words(test_data)
y_test = sent_words(test_data, label=True)

# Merging x and y

In [None]:
# Reshape inputs and labels of every split into (n_samples, 1) column arrays.
x_train = np.array(x_train).reshape(len(x_train), 1)
y_train = np.array(y_train).reshape(x_train.shape[0], 1)

x_test = np.array(x_test).reshape(len(x_test), 1)
y_test = np.array(y_test).reshape(x_test.shape[0], 1)

x_val = np.array(x_val).reshape(len(x_val), 1)
y_val = np.array(y_val).reshape(x_val.shape[0], 1)

# Pair each input with its label: column 0 is the input, column 1 the label.
train = np.concatenate((x_train, y_train), axis=1)
val = np.concatenate((x_val, y_val), axis=1)
test = np.concatenate((x_test, y_test), axis=1)

data_ = [train, val, test]

# creating word_to_ix and label_to_ix dict and vice versa

In [None]:
# Forward lookups are built from the merged (train + test + valid) data.
word_to_ix = word_to_ix_(df_raw_x)
label_to_ix = label_to_ix_(df_raw_y)
VOCAB_SIZE = len(word_to_ix)

# Reverse lookups: index -> word and index -> label.
ix_to_word = OrderedDict(
    (index, word) for word, index in word_to_ix.items()
)
ix_to_label = OrderedDict(
    (index, label) for label, index in label_to_ix.items()
)

# Breaking them into tokens

In [None]:
# Run every split through into_token with the shared vocabulary.
train_, test_, val_ = [
    into_token(_split_rows, word_to_ix)
    for _split_rows in (train, test, val)
]

In [None]:
# Training rows followed by validation rows, stacked along the first axis.
total_data_ = np.concatenate([train_, val_], axis=0)

# importing embeddings

In [None]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load 'embed.p' when it comes from a trusted source.
with open('embed.p', 'rb') as _embed_file:
    embed = pickle.load(_embed_file)
# Wrap the unpickled object in an OrderedDict for word -> vector lookups.
embed = OrderedDict(embed)

In [None]:
# Build the initial embedding matrix: one row of size emb_dim per vocabulary
# entry. Words present in `embed` get their pretrained vector; all other words
# get a random normal vector (std 0.6).
emb_dim = 300
matrix_len = len(word_to_ix.keys())
weights_matrix = np.zeros((matrix_len, emb_dim))
words_found = 0

# Dedicated seeded generator so the vectors drawn for out-of-vocabulary words
# are the same on every run (only torch is seeded at the top of the notebook).
oov_rng = np.random.RandomState(1)

# NOTE(review): rows are filled by enumeration position, which assumes
# word_to_ix maps its i-th key to index i; confirm against word_to_ix_.
for i, word in enumerate(word_to_ix.keys()):
    try:
        weights_matrix[i] = embed[word]
    except KeyError:
        weights_matrix[i] = oov_rng.normal(scale=0.6, size=(emb_dim, ))
    else:
        words_found += 1
print("words found :-",words_found)

In [None]:
# Convert to a float32 tensor: the NumPy matrix is float64, whereas torch
# modules use float32 parameters by default. as_tensor also makes this cell
# safe to re-run, because an already-converted tensor is passed through
# without the copy-construct warning torch.tensor() would emit.
weights_matrix = torch.as_tensor(weights_matrix, dtype=torch.float32)

# Creating Model

In [None]:
# Both values are handed to EncoderRNN further down: directions_ is 2 when
# the bidirectional flag is on and 1 otherwise.
bidirectional_ = True
directions_ = 2 if bidirectional_ == True else 1

# Change its value as required by the classification task

In [None]:
# Number of output classes; passed to EncoderRNN through its `out` argument.
out_classes = 6

In [None]:
# Embedding layer initialised from the weight matrix and left trainable
# (non_trainable=False); the print confirms the requires_grad flag.
embedding = create_emb_layer(weights_matrix, non_trainable=False)
print("embed layer is trainable or no :----",embedding[0].weight.requires_grad)

# Single-layer recurrent encoder with hidden size 512 on top of the embedding.
model = EncoderRNN(
    embedding,
    hidden_size=512,
    num_layers=1,
    directions=directions_,
    bidirectonal=bidirectional_,
    out=out_classes,
)

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
import os  # local import: only needed here, to create the checkpoint folder

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

#training
# Samples are fed one at a time. Their gradients are accumulated and applied
# with a single optimizer step every `update_weights` samples. After each epoch
# the weights are checkpointed and accuracy on the validation split is printed.
update_weights = 512   # samples accumulated per optimizer step
loss_t = []            # running-loss trace, one entry per processed sample
acc = 0
epoch_ = 15            # number of passes over the training split

# torch.save does not create missing folders, so make sure this one exists.
os.makedirs("weights", exist_ok=True)

for epoch in range(epoch_):
    running_loss = 0
    train_skipped = 0
    pending_grads = False   # True while accumulated gradients are not yet applied
    optimizer.zero_grad()
    print("epoch number :",epoch+1)
    model.train()           # validation below switches to eval mode; switch back
    for i,(x,y) in enumerate(train_):
        h = model.init_hidden(1).to(device)
        y = torch.LongTensor(y).to(device)
        inp = torch.tensor(x.T,dtype = torch.long).to(device)

        try:
            out = model(inp,h)
            loss = loss_function(out,y)
            loss.backward()
        except Exception as err:
            # Best effort: a sample the model cannot process is skipped, but it
            # is counted and the first failure is shown instead of being hidden.
            train_skipped += 1
            if train_skipped == 1:
                print("skipping training sample", i, "after error:", repr(err))
        else:
            # Add the current loss first so the trace includes this sample.
            running_loss += loss.item()
            loss_t.append(running_loss)
            pending_grads = True

        if i % update_weights == update_weights - 1:    # update weights as defined

            print('[%d, %5d] loss: %.3f' %(epoch + 1, i + 1, running_loss/update_weights ))
            running_loss = 0
            optimizer.step()
            optimizer.zero_grad()
            pending_grads = False

    # Apply gradients of the last, incomplete accumulation window; otherwise
    # they would be discarded by zero_grad() at the start of the next epoch.
    if pending_grads:
        optimizer.step()
        optimizer.zero_grad()
    if train_skipped:
        print("training samples skipped this epoch :", train_skipped)

    torch.save(model.state_dict(),"weights/embed_just"+str(epoch)+".pth")
    with torch.no_grad():
        model.eval()
        num = 0          # correctly classified validation samples
        length = 0       # validation samples that were scored
        val_skipped = 0
        for i,(x,y) in enumerate(val_):
            h = model.init_hidden(1).to(device)
            y = torch.LongTensor(y).to(device)
            inp = torch.tensor(x.T,dtype = torch.long).to(device)
            try:
                out = model(inp,h)
                out,pred = torch.max(out,1)
                if y == pred.item():
                    num = num+1
                length = length + 1
            except Exception:
                val_skipped += 1
        if val_skipped:
            print("validation samples skipped :", val_skipped)
        # Report NaN instead of dividing by zero when nothing could be scored.
        accuracy = (num/length)*100 if length else float("nan")

        print("accuray while evaluating is :",accuracy ,"%.")
    

In [None]:
#testing
# Score every per-epoch checkpoint written by the training cell on the test
# split and print its accuracy.
for j in range(epoch_):
    # map_location lets checkpoints saved on a GPU load on the current device.
    checkpoint = torch.load("weights/embed_just"+str(j)+".pth", map_location=device)
    model.load_state_dict(checkpoint, strict = True)
    with torch.no_grad():
        model.eval()
        num = 0          # correctly classified test samples
        length = 0       # test samples that were scored
        test_skipped = 0
        for i,(x,y) in enumerate(test_):
            h = model.init_hidden(1).to(device)
            y = torch.LongTensor(y).to(device)
            inp = torch.tensor(x.T,dtype = torch.long).to(device)
            try:
                out = model(inp,h)
                out,pred = torch.max(out,1)
                if y == pred.item():
                    num = num+1
                length = length + 1
            except Exception:
                # Best effort: count samples the model cannot process instead
                # of silently dropping them.
                test_skipped += 1
        if test_skipped:
            print("test samples skipped :", test_skipped)
        # Report NaN instead of dividing by zero when nothing could be scored.
        accuracy = (num/length)*100 if length else float("nan")
        print("accuray while evaluating at"+str(j)+" is :",accuracy,"%.")