In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
from torchtext import data

In [None]:
# Seeding
# SEED is reused later in the notebook when the train/validation split is made.
SEED = 1234
# Seed PyTorch's random number generator.
torch.manual_seed(SEED)
# Ask cuDNN to pick deterministic kernels only.
torch.backends.cudnn.deterministic = True  

In [None]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
# English stop-word list from NLTK, held as a set for fast membership checks.
# It is handed to the torchtext TEXT field below so these tokens are dropped.
stop_words = set(stopwords.words('english')) 

In [None]:
# Field for the question text: spaCy tokenisation, batch-first tensors, and
# (tensor, lengths) tuples so the lengths can be used for sequence packing.
# NOTE(review): no `lower=True` is set, so the vocabulary is case-sensitive and
# stop-word filtering only matches tokens exactly as they appear in
# `stop_words` -- confirm this is intended for the uncased glove.6B vectors.
TEXT = data.Field(
    tokenize='spacy',
    stop_words=stop_words,
    include_lengths=True,
    batch_first=True,
)

# Field for the target column, stored as float.
LABEL = data.LabelField(dtype=torch.float, batch_first=True)
# https://pytorch.org/text/_modules/torchtext/data/field.html

# One (name, field) pair per CSV column, in column order; the first column is
# mapped to (None, None) and the next two become `text` and `label`.
fields = [
    (None, None),
    ('text', TEXT),
    ('label', LABEL),
]

training_data = data.TabularDataset(
    path='/kaggle/input/quora-insincere-questions-classification/train.csv',
    format='csv',
    fields=fields,
    skip_header=True,
)

# Sanity check: show the first parsed example.
print(vars(training_data.examples[0]))




In [None]:
import random

# 70/30 train/validation split.
# random.seed() returns None, so passing its result as `random_state` only
# worked because the seeding happened as a side effect. Seed first, then pass
# the actual RNG state so the intent is explicit.
random.seed(SEED)
train_data, valid_data = training_data.split(split_ratio=0.7, random_state=random.getstate())



In [None]:
#initialize glove embeddings
# Only tokens seen at least 3 times in the training split enter the vocabulary.
TEXT.build_vocab(train_data,min_freq=3,vectors = "glove.6B.100d")  
LABEL.build_vocab(train_data)

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

#Word dictionary
# Show only the first few (token, index) pairs: printing the whole mapping
# floods the cell output with one entry per vocabulary token.
print(list(TEXT.vocab.stoi.items())[:10])





In [None]:
# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Number of examples per batch.
BATCH_SIZE = 64

# Bucketed iterators for the training and validation splits. Examples are
# keyed by token count, and each batch is sorted by that key because the model
# packs the padded sequences before feeding them to the RNN.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.text),
    sort_within_batch=True,
    device=device,
)

In [None]:
import torch.nn as nn

class classifier(nn.Module):
    """Embedding -> (stacked) vanilla RNN -> linear -> sigmoid text classifier.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the RNN hidden state.
        output_dim: number of output units of the final linear layer.
        num_layers: number of stacked RNN layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # recurrent layer (inputs are batch-first)
        self.rnn = nn.RNN(embedding_dim,
                          hidden_dim,
                          num_layers=num_layers,
                          batch_first=True)

        # linear layer applied to the last layer's final hidden state
        self.fc = nn.Linear(hidden_dim, output_dim)

        # activation function
        self.act = nn.Sigmoid()

    def forward(self, text, text_lengths):
        """Return sigmoid outputs of shape (batch, output_dim).

        Args:
            text: LongTensor of token indices, shape (batch, seq_len).
            text_lengths: true (unpadded) length of every sequence in the
                batch, as a tensor or plain sequence of ints. Sequences must
                be ordered by decreasing length, as pack_padded_sequence
                enforces by default.
        """
        embedded = self.embedding(text)

        # pack_padded_sequence requires the lengths as a CPU int64 tensor even
        # when the batch itself lives on the GPU.
        lengths = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=True)

        # No explicit h0: nn.RNN starts from zeros on the input's own device,
        # so the module no longer depends on a global `device`.
        _, hidden = self.rnn(packed_embedded)

        # hidden is (num_layers, batch, hidden_dim); keep the top layer only.
        dense_outputs = self.fc(hidden[-1])
        return self.act(dense_outputs)
    


    
   


In [None]:
# Model hyperparameters.
vocab_size = len(TEXT.vocab)   # one embedding row per vocabulary entry
embedding_dim = 100            # matches the 100-d GloVe vectors loaded into TEXT.vocab
hidden_dim = 32
output_dim = 1
num_layers = 1

# Build the network from the hyperparameters above.
model = classifier(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    num_layers=num_layers,
)


In [None]:
#architecture
# Printing an nn.Module shows its registered sub-modules and their settings.
print(model)

#No. of trainable parameters
def count_parameters(model):
    """Return the total element count over all parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
    
print(f'The model has {count_parameters(model):,} trainable parameters')

# Overwrite the randomly initialised embedding table with the pretrained
# vectors attached to the vocabulary. The copy is done outside autograd.
pretrained_embeddings = TEXT.vocab.vectors
with torch.no_grad():
    model.embedding.weight.copy_(pretrained_embeddings)

# Expected to be (vocabulary size, embedding dimension).
print(pretrained_embeddings.shape)

In [None]:
import torch.optim as optim

#define optimizer and loss
# Adam over every model parameter, with the optimizer's default settings.
optimizer = optim.Adam(model.parameters())
# Binary cross-entropy on probabilities: the model's forward already ends in a sigmoid.
criterion = nn.BCELoss()

#define metric
def binary_accuracy(preds, y):
    """Return the fraction of predictions that match the labels, as a 0-d tensor.

    Args:
        preds: tensor of probabilities; values are rounded to 0 or 1.
        y: tensor of 0/1 labels with the same number of elements as `preds`.

    Both tensors are flattened before comparison, so a (batch, 1) prediction
    against a (batch,) label is compared element-wise instead of broadcasting
    to (batch, batch), and a 0-d prediction (single example) is handled too.
    """
    #round predictions to the closest integer
    rounded_preds = torch.round(preds).reshape(-1)
    targets = y.reshape(-1)

    correct = (rounded_preds == targets).float()
    # numel() instead of len(): len() raises on 0-d tensors
    acc = correct.sum() / correct.numel()
    return acc
    
#push to cuda if available
# `device` was chosen earlier from torch.cuda.is_available().
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return (mean batch loss, mean batch accuracy).

    Args:
        model: module called as model(text, text_lengths).
        iterator: iterable of batches exposing `.text` as a (tensor, lengths)
            pair and `.label`; must support len().
        optimizer: optimizer stepping the model's parameters.
        criterion: loss called as criterion(predictions, labels).

    The means are taken over the number of batches, not the number of examples.
    """
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #set the model in training phase
    model.train()

    for batch in iterator:
        #resets the gradients after every batch
        optimizer.zero_grad()

        #retrieve text and no. of words
        text, text_lengths = batch.text

        #convert to 1D tensor
        # squeeze(1) drops only the output dimension; a bare squeeze() would
        # also drop the batch dimension when a batch holds a single example.
        predictions = model(text, text_lengths).squeeze(1)

        #compute the loss
        loss = criterion(predictions, batch.label)

        #compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)

        #backpropagate the loss and compute the gradients
        loss.backward()

        #update the weights
        optimizer.step()

        #loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    """Evaluate without gradient tracking; return (mean batch loss, mean batch accuracy).

    Args:
        model: module called as model(text, text_lengths).
        iterator: iterable of batches exposing `.text` as a (tensor, lengths)
            pair and `.label`; must support len().
        criterion: loss called as criterion(predictions, labels).

    The means are taken over the number of batches, not the number of examples.
    """
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #switch the model to evaluation mode
    model.eval()

    #deactivates autograd
    with torch.no_grad():

        for batch in iterator:

            #retrieve text and no. of words
            text, text_lengths = batch.text

            #convert to 1d tensor
            # squeeze(1) drops only the output dimension; a bare squeeze()
            # would also drop the batch dimension for a single-example batch.
            predictions = model(text, text_lengths).squeeze(1)

            #compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
import time

N_EPOCHS = 5
# Lowest validation loss seen so far; any first epoch beats infinity.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start = time.time()
    print('======= starting epoch ======= ', epoch)

    # One pass over the training split.
    print('======= now tranining ======= ', epoch)
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

    # One pass over the validation split.
    print('======= now evaluating =======', epoch)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # Checkpoint the weights whenever the validation loss improves.
    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    end = time.time()

    # Per-epoch report; the timing covers training, evaluation and checkpointing.
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    print("traninig time = {:.3f} ".format(end - start))



In [None]:
"""
test_data = data.TabularDataset(path = '/kaggle/input/quora-insincere-questions-classification/test.csv', format = 'csv',fields = fields,skip_header = True)

print(vars(test_data.examples[0])) 
"""

In [None]:
#load weights
# NOTE(review): presumably the checkpoint written by the training loop as the
# relative path 'saved_weights.pt' -- this assumes the working directory is
# /kaggle/working; confirm.
path='/kaggle/working/saved_weights.pt' 
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU can still be loaded in a CPU-only session.
model.load_state_dict(torch.load(path, map_location=device));
model.eval();

#inference 
import spacy
# Kept so other cells that reference `nlp` keep working; predict() itself now
# tokenises through the TEXT field.
nlp = spacy.load('en')

def predict(model, sentence):
    """Return the model's output (a float) for a single raw sentence.

    Args:
        model: trained module called as model(text, text_lengths).
        sentence: raw input string.

    The sentence goes through TEXT.preprocess, i.e. the same tokenisation and
    stop-word filtering the field applied to the training data, so inference
    sees the same kind of token sequence the model was trained on. If nothing
    is left after preprocessing, a single unknown token is used because an
    empty sequence cannot be packed for the RNN.
    """
    tokens = TEXT.preprocess(sentence)                        #tokenize + filter like the training data
    indexed = [TEXT.vocab.stoi[t] for t in tokens]            #convert to integer sequence
    if not indexed:
        indexed = [TEXT.vocab.stoi[TEXT.unk_token]]           #fallback for empty input
    model_device = next(model.parameters()).device            #follow the model, not a global
    tensor = torch.LongTensor(indexed).unsqueeze(0)           #shape: (batch=1, no. of words)
    tensor = tensor.to(model_device)
    length_tensor = torch.LongTensor([len(indexed)])          #lengths stay on the CPU
    with torch.no_grad():                                     #no autograd graph needed for inference
        prediction = model(tensor, length_tensor)
    return prediction.item()  

In [None]:
"""
test_iterator = data.Iterator(
    test_data,
    batch_size=1)
"""

In [None]:
"""
def test(model, iterator):
    

    #deactivating dropout layers
    model.eval()
    
    num_correct = 0
     
    
    #deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:

            predicted = predict(model, batch.text)
            
            num_total += 1
            if (predicted == batch.label):
                num_correct += 1
        
    print("Test Accuracy = {:.3f}% ".format(num_correct/num_total))

"""

In [None]:
#test(model,test_iterator)