## Sentence Level Author identification using RNN

In [None]:
#@markdown 
from IPython.display import HTML

HTML("""<video width="320" height="240" controls>
  <source src="https://cdn.talentsprint.com/talentsprint/archives/sc/aiml/aiml_2018_b7_hyd/preview_videos/sentence_level_author_identification_using_rnn.mp4">
</video>
""")

In [1]:
! wget https://cdn.talentsprint.com/aiml/Experiment_related_data/week12/Exp2/test.csv
! wget https://cdn.talentsprint.com/aiml/Experiment_related_data/week12/Exp2/train.csv
! wget https://cdn.talentsprint.com/aiml/Experiment_related_data/week12/Exp2/val.csv
# ! wget https://cdn.talentsprint.com/aiml/CaseStudies/Sentence_level_rnn_trained_0.66.pt

--2022-06-03 05:05:22--  https://cdn.talentsprint.com/aiml/Experiment_related_data/week12/Exp2/test.csv
Resolving cdn.talentsprint.com (cdn.talentsprint.com)... 172.105.34.236
Connecting to cdn.talentsprint.com (cdn.talentsprint.com)|172.105.34.236|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://cdn.ap-south-1.linodeobjects.com/public_html/aiml/Experiment_related_data/week12/Exp2/test.csv [following]
--2022-06-03 05:05:23--  https://cdn.ap-south-1.linodeobjects.com/public_html/aiml/Experiment_related_data/week12/Exp2/test.csv
Resolving cdn.ap-south-1.linodeobjects.com (cdn.ap-south-1.linodeobjects.com)... 139.162.5.246, 172.104.36.102, 103.3.61.236, ...
Connecting to cdn.ap-south-1.linodeobjects.com (cdn.ap-south-1.linodeobjects.com)|139.162.5.246|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1359634 (1.3M) [text/csv]
Saving to: ‘test.csv’


2022-06-03 05:05:25 (1.02 MB/s) - ‘test.csv’ saved [1359634/1359634

In [2]:
! ls

sample_data  Sentence_level_rnn_trained_0.66.pt  test.csv  train.csv  val.csv


###Importing required packages

In [3]:
import re
import gensim
import numpy as np
import pandas as pd

from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import warnings
warnings.filterwarnings('ignore')

### Load the train data

In [4]:
train_data = pd.read_csv('train.csv', encoding='latin1')

train_data.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12999 entries, 0 to 12998
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      12999 non-null  object
 1   text    12999 non-null  object
 2   author  12999 non-null  object
dtypes: object(3)
memory usage: 304.8+ KB


### Assign numbers to the labels using dictionary data structure


In [None]:
label = {'EAP':0, 'HPL':1, 'MWS':2}

### Collect the sentences from the train data

In [6]:
sentences = train_data[['text','author']].values

### Load the test data

In [7]:
test_data = pd.read_csv('test.csv', encoding='latin1')

test_data.head()

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


### Collect the sentences from the test data

In [8]:
test_sentences = test_data[['text']].values.flatten()

### Define the class for stemming / preprocessing sentences

In [9]:
class MySentences(object): # to stem the sentences - nothing but preprocessed sentences
    def __init__(self, fnamelist):
        self.fnamelist = fnamelist
        self.vocabulary = set([])  # Creating a set of vocabulary

    def __iter__(self):
        for fname in self.fnamelist:
            self.data = pd.read_csv(fname, encoding='latin1')
            self.sentences = self.data.text.values
            for line in self.sentences:
                words = re.findall(r'(\b[A-Za-z][a-z]{2,15}\b)', line)
                
                # Stemming a word.
                words = [ word.lower() for word in words]
                for word in words:
                    self.vocabulary.add(word)
                yield words

In [10]:
sentences = MySentences(['train.csv', 'val.csv','test.csv']) # a memory-friendly iterator

In [13]:
for i in sentences:
    print(i)
    break

['this', 'process', 'however', 'afforded', 'means', 'ascertaining', 'the', 'dimensions', 'dungeon', 'might', 'make', 'its', 'circuit', 'and', 'return', 'the', 'point', 'whence', 'set', 'out', 'without', 'being', 'aware', 'the', 'fact', 'perfectly', 'uniform', 'seemed', 'the', 'wall']


### Use gensims.model.Word2Vec to get vectors for the sentences and save the model as a .bin file

In [14]:
model = gensim.models.Word2Vec(sentences, min_count=1)
model.save("AuthID2Vec.bin")

### Count the corpus

In [15]:
model.corpus_count

27971

### Import required torch packages

In [16]:
import os
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms

### Define custom dataset loader

In [17]:
class CustomDataset(torch.utils.data.Dataset):    ### custom data loader
    
    def __init__(self, data_file_path,  train=True):
        self.data_file_path = data_file_path
        self.train = train
        self.data = pd.read_csv(data_file_path, encoding='latin1')
        self.ids = self.data.id.values
        self.sentences = self.data.text.values
        if self.train:
            self.label_dict = {'EAP':0, 'HPL':1, 'MWS':2}
            self.labels = [self.label_dict[i] for i in self.data.author.values]
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sentence = self.sentences[index]
        if self.train:
            return self.sentences[index], self.labels[index]
        else:
            return self.sentences[index], self.ids[index]

### Set the batch size

In [18]:
batch_size = 16

### Use the custom dataset loader to load the .csv files for train, val and test data into batches

In [19]:
# You can then use the prebuilt data loader. 
train_set = CustomDataset("train.csv", train=True)
val_set = CustomDataset("val.csv", train=True)
test_set = CustomDataset("test.csv", train=False)

In [20]:
trainloader = torch.utils.data.DataLoader(train_set,batch_size=batch_size, shuffle = True)
valloader = torch.utils.data.DataLoader(val_set,batch_size=batch_size, shuffle = True)
testloader = torch.utils.data.DataLoader(test_set,batch_size=batch_size, shuffle = False)

In [21]:
for X,y in trainloader:
    print(X)
    print(y.size())
    break

('The effect produced by the firing of a cannon is that of simple vibration.', 'Chapter "Such was the history of my beloved cottagers.', 'She and my father had been playmates from infancy: Diana, even in her childhood had been a favourite with his mother; this partiality encreased with the years of this beautiful and lively girl and thus during his school college vacations they were perpetually together.', 'She perished on the scaffold as a murderess From the tortures of my own heart, I turned to contemplate the deep and voiceless grief of my Elizabeth.', 'I was astonished, too, at the vast extent of his reading; and, above all, I felt my soul enkindled within me by the wild fervor, and the vivid freshness of his imagination.', '"You are in the wrong," replied the fiend; "and instead of threatening, I am content to reason with you.', 'At length there broke in upon my dreams a cry as of horror and dismay; and thereunto, after a pause, succeeded the sound of troubled voices, intermingled

### Load the saved gensim model which contains vectors for the sentences 

In [22]:
model_load = gensim.models.Word2Vec.load('AuthID2Vec.bin')

### Set the deivce to CUDA

In [23]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

### Creating the recurrent neural network

In [24]:
### Creating recurrent neural network
class RNN(nn.Module):
    
    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.gru = nn.GRU(input_size, hidden_size, n_layers)
        self.decoder = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax()
    
    def forward(self, input):
        outputs = []
        for sentence in input:
            hidden= self.init_hidden()
            word_embeddings = self.get_embedding(sentence)
            for word_embedding in word_embeddings:
                output, hidden = self.gru(word_embedding.unsqueeze(0).unsqueeze(0), hidden)
            try:
                #print(output.size())
                outputs.append(output)
            except:
                outputs.append(torch.rand(1,1,20))
                continue
        outputs = torch.cat(outputs)
        outputs = self.softmax(self.decoder(outputs))
        return outputs

    def init_hidden(self):
        return torch.zeros(self.n_layers, 1, self.hidden_size)
    
    def get_embedding(self, sentence):
        #print(len(sentence))
        #sentence_wt = word_tokenize(sentence.lower())
        words = re.findall(r'(\b[A-Za-z][a-z]{2,15}\b)', sentence)
        words = [ word.lower() for word in words]
        embedding = []
        for word in words:
            #print(word)
            embedding.append(torch.tensor(model_load[word]))
        #print(embedding[0].size())
        return embedding

### Implement the RNN by setting up the required parameters

In [25]:
rnn = RNN(100,20,3,n_layers=1) #Set the denfined RNN model with 100 input layers, 20 hidden layers and 3 output layers

lr = 0.001 # learning rate
optimizer = torch.optim.Adam(rnn.parameters(), lr=lr) # Set the optimizer 

criterion = nn.CrossEntropyLoss() ## Loss function

### Train and test RNN model

### This will take quite a lot of time

In [26]:
losses  = []
val_accuracy = 0
for j in range(5): # 100
    i = 0
    correct_train = 0
    for X,y in trainloader:
        #print(index, end='\r')
        output = rnn(X)
        optimizer.zero_grad()
        loss = criterion(output.squeeze(1),y)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        _,predicted = torch.max(output.squeeze(1).data, 1)
        #print(predicted)
        correct_train += predicted.eq(y.data).cpu().sum().item()
        i=i+1
    print('Epoch: {}, Train Accuracy: {}, Average Loss: {}'.format(j, correct_train/(i*16), sum(losses)/len(losses)))
    correct_val = 0 
    for X,y in valloader:
        #print(index, end='\r')
        output = rnn(X)
        _,predicted = torch.max(output.squeeze(1).data, 1)
        correct_val += predicted.eq(y.data).cpu().sum().item()
    if val_accuracy<correct_val/6580:
        val_accuracy = correct_val/6580
        torch.save(rnn.state_dict(), 'Sentence_level_rnn_trained_{:.2f}.pt'.format(val_accuracy))
    print('Epoch: {}, Validation Accuracy: {}'.format(j, correct_val/6580))

Epoch: 0, Train Accuracy: 0.48547047970479706, Average Loss: 1.0711064479447818
Epoch: 0, Validation Accuracy: 0.536322188449848
Epoch: 1, Train Accuracy: 0.5369003690036901, Average Loss: 1.0603550898706195
Epoch: 1, Validation Accuracy: 0.5419452887537994
Epoch: 2, Train Accuracy: 0.547509225092251, Average Loss: 1.0549809044137652
Epoch: 2, Validation Accuracy: 0.5518237082066869
Epoch: 3, Train Accuracy: 0.5654212792127922, Average Loss: 1.0511090200472288
Epoch: 3, Validation Accuracy: 0.551063829787234
Epoch: 4, Train Accuracy: 0.5621924969249692, Average Loss: 1.0482614173953677
Epoch: 4, Validation Accuracy: 0.5349544072948328
