In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np
from matplotlib.pyplot import plot as plt


torch.manual_seed(1)
import cleaningtool as ct
from helpers import *

from model import *
from data import *
import sys
import nltk
from nltk.corpus import stopwords

In [2]:
cleaning = True
## Define paths
DATA_FOLDER = './data_1/'
TRAIN_PATH = DATA_FOLDER + 'train.tsv'
TEST_PATH = DATA_FOLDER + 'test.tsv'
VALID_PATH = DATA_FOLDER + '/valid.tsv'

train_data = load_data(TRAIN_PATH)
test_data = load_data(TEST_PATH)
valid_data = load_data(VALID_PATH)


train_data = train_data[["statement", "label"]]
test_data = test_data[["statement", "label"]]
valid_data = valid_data[["statement", "label"]]

df_raw = pd.concat([train_data, test_data, valid_data], axis=0, sort=False).reset_index()

In [3]:
if cleaning == True:
    print("before :-",df_raw["statement"][0])
    train_data = clean_data(train_data,"statement")
    test_data = clean_data(test_data,"statement")
    valid_data = clean_data(valid_data,"statement")
    df_raw = clean_data(df_raw,'statement')
    print()
    print("after :-", df_raw["statement"][0])


before :- Says the Annies List political group supports third-trimester abortions on demand.

after :- say list political group support abortion demand


# sentence to words

In [4]:
df_raw_x, df_raw_y = sent_words(df_raw), sent_words(df_raw,label=True)
x_train, y_train, x_val, y_val = sent_words(train_data), sent_words(train_data,label=True), sent_words(valid_data), sent_words(valid_data,label=True)
x_test, y_test = sent_words(test_data), sent_words(test_data,label=True)

In [5]:
x_train = np.array(x_train).reshape(len(x_train),1)
y_train = np.array(y_train).reshape(len(x_train),1)
x_test = np.array(x_test).reshape(len(x_test),1)
y_test = np.array(y_test).reshape(len(x_test),1)
x_val = np.array(x_val).reshape(len(x_val),1)
y_val = np.array(y_val).reshape(len(x_val),1)

train = np.concatenate((x_train,y_train),axis = 1)
val = np.concatenate((x_val,y_val),axis = 1)
test = np.concatenate((x_test,y_test),axis = 1)
data_ = [train,val,test]

In [6]:
word_to_ix = word_to_ix_(df_raw_x)
label_to_ix = label_to_ix_(df_raw_y)

In [7]:
len(word_to_ix),len(label_to_ix)

(3199, 6)

In [8]:
ix_to_word = OrderedDict((v,k) for k,v in word_to_ix.items())
ix_to_label = OrderedDict((v,k) for k,v in label_to_ix.items())
VOCAB_SIZE = len( word_to_ix )
NUM_LABELS = len(set(label_to_ix.keys()))

In [9]:
train_ = into_token(train,word_to_ix)
test_ = into_token(test,word_to_ix)
val_ = into_token(val,word_to_ix)

In [10]:
total_data_ = np.concatenate((train_,val_),axis=0)

In [11]:
import pickle
with open('embed.p', 'rb') as fp:
    embed = OrderedDict(pickle.load(fp))

In [12]:
emb_dim = 300
matrix_len = len(word_to_ix.keys())
weights_matrix = np.zeros((matrix_len, emb_dim))
words_found = 0

for i, word in enumerate(word_to_ix.keys()):
    try: 
        weights_matrix[i] = embed[word]
        words_found += 1
    except KeyError:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, ))
print("words found :-",words_found)

words found :- 3142


In [13]:
weights_matrix = torch.tensor(weights_matrix)

In [14]:
bidirectional_ = True
directions_ = 1

if bidirectional_ == True:
    directions_ = 2

# change it's value as per classification task requirement

In [15]:
out_classes = 6

In [16]:
embedding = create_emb_layer(weights_matrix, non_trainable=False)
print("embed layer is trainable or no :----",embedding[0].weight.requires_grad)
model = EncoderRNN(embedding,hidden_size=512,num_layers=1,directions=directions_,bidirectonal=bidirectional_,out=out_classes)
model

embed layer is trainable or no :---- True


EncoderRNN(
  (embedding): Embedding(3199, 300)
  (gru): GRU(300, 512, batch_first=True, bidirectional=True)
  (linear1): Linear(in_features=1024, out_features=256, bias=True)
  (linear2): Linear(in_features=256, out_features=128, bias=True)
  (linear3): Linear(in_features=128, out_features=6, bias=True)
  (drop): Dropout(p=0.5)
)

In [23]:
device = torch.device("cuda")
model.to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

#training
update_weights = 1024
loss_t = []
acc = 0
epoch_ = 15
for epoch in range(epoch_):
    running_loss = 0
    optimizer.zero_grad()
    print("epoch number :",epoch+1)
    for i,(x,y) in enumerate(total_data_):
        model.train()
        
        h = model.init_hidden(1).to(device)
        y = torch.LongTensor(y).to(device)
        inp = torch.tensor(x.T,dtype = torch.long).to(device)
        
        try:
            out = model(inp,h)
            loss = loss_function(out,y)
            loss.backward()
            loss_t.append(running_loss)
            running_loss += loss.item()
        except: pass

        if i % update_weights == update_weights - 1:    # update weights as defined	
            print('[%d, %5d] loss: %.3f' %(epoch + 1, i + 1, running_loss/update_weights ))
            running_loss = 0
            optimizer.step()
            optimizer.zero_grad()
    torch.save(model.state_dict(),"weights/embed_"+str(epoch)+".pth")
    with torch.no_grad():
        model.eval()
        num = 0
        length = 0
        for i,(x,y) in enumerate(val_):
            h = model.init_hidden(1).to(device)
            y = torch.LongTensor(y).to(device)
            inp = torch.tensor(x.T,dtype = torch.long).to(device)
            try:
                out = model(inp,h)
                out,pred = torch.max(out,1)
                if y == pred.item():
                    num = num+1
                length = length + 1
            except: pass
        accuracy = (num/length)*100
        print("accuray while evaluating is :",accuracy ,"%.")
        

epoch number : 1
[1,  1024] loss: 1.848
[1,  2048] loss: 1.613
[1,  3072] loss: 1.452
[1,  4096] loss: 1.504
[1,  5120] loss: 1.397
[1,  6144] loss: 1.406
[1,  7168] loss: 1.338
[1,  8192] loss: 1.390
[1,  9216] loss: 1.345
[1, 10240] loss: 1.333
[1, 11264] loss: 2.065
accuray while evaluating is : 21.081504702194355 %.
epoch number : 2
[2,  1024] loss: 1.376
[2,  2048] loss: 1.195
[2,  3072] loss: 1.147
[2,  4096] loss: 1.207
[2,  5120] loss: 1.206
[2,  6144] loss: 1.174
[2,  7168] loss: 1.164
[2,  8192] loss: 1.206
[2,  9216] loss: 1.169
[2, 10240] loss: 1.152
[2, 11264] loss: 1.903
accuray while evaluating is : 26.489028213166144 %.
epoch number : 3
[3,  1024] loss: 1.215
[3,  2048] loss: 1.079
[3,  3072] loss: 1.031
[3,  4096] loss: 1.053
[3,  5120] loss: 1.126
[3,  6144] loss: 1.067
[3,  7168] loss: 1.053
[3,  8192] loss: 1.121
[3,  9216] loss: 1.069
[3, 10240] loss: 1.017
[3, 11264] loss: 1.842
accuray while evaluating is : 27.11598746081505 %.
epoch number : 4
[4,  1024] loss: 1

In [25]:
#testing
for j in range(epoch_):
    try:model.load_state_dict(torch.load("weights/embed_"+str(j)+".pth"), strict = True)
    except: pass
    with torch.no_grad():
            model.eval()
            num = 0
            length = 0
            for i,(x,y) in enumerate(test_):
                h = model.init_hidden(1).to(device)
                y = torch.LongTensor(y).to(device)
                inp = torch.tensor(x.T,dtype = torch.long).to(device)
                try:
                    out = model(inp,h)
                    out,pred = torch.max(out,1)
                    if y == pred.item():
                        num = num+1
                    length = length + 1
                except: pass
            accuracy = (num/length)*100
            print("accuray while evaluating at"+str(j)+" is :",accuracy,"%.")

accuray while evaluating at0 is : 23.052464228934817 %.
accuray while evaluating at1 is : 22.89348171701113 %.
accuray while evaluating at2 is : 23.52941176470588 %.
accuray while evaluating at3 is : 23.449920508744036 %.
accuray while evaluating at4 is : 23.052464228934817 %.
accuray while evaluating at5 is : 23.84737678855326 %.
accuray while evaluating at6 is : 23.052464228934817 %.
accuray while evaluating at7 is : 22.575516693163753 %.
accuray while evaluating at8 is : 22.17806041335453 %.
accuray while evaluating at9 is : 20.906200317965023 %.
accuray while evaluating at10 is : 22.496025437201908 %.
accuray while evaluating at11 is : 21.54213036565978 %.
accuray while evaluating at12 is : 20.5087440381558 %.
accuray while evaluating at13 is : 21.303656597774246 %.
accuray while evaluating at14 is : 22.098569157392685 %.


In [None]:
total data bi 
