In [None]:
import pandas as pd


## Reading text files into dataframes to preprocess

In [None]:
dictionary = pd.read_csv(r"/content/dictionary.txt",sep='|')
dictionary.columns = ["phrase","phrase_id"]

In [None]:
datasetsentence = pd.read_csv(r"/content/datasetSentences.txt", sep="\t")

In [None]:
sentimentlabel = pd.read_csv(r"/content/sentiment_labels.txt", sep="|")

In [None]:
sentimentlabel.columns, datasetsentence.columns, dictionary.columns

(Index(['phrase ids', 'sentiment values'], dtype='object'),
 Index(['sentence_index', 'sentence'], dtype='object'),
 Index(['phrase', 'phrase_id'], dtype='object'))

## Joining dataframes appropriately to get sentiment value of each sentence

In [None]:
join_df_1 = pd.merge(datasetsentence, dictionary, how='left', left_on='sentence', right_on='phrase')

In [None]:
sentimentlabel.columns = ['phrase_id', 'sentiment_values']

In [None]:
join_df_2 = join_df_1.join(sentimentlabel,how="left", lsuffix='l')

In [None]:
df = join_df_2[["sentence",'sentiment_values']]

## Function to divide sentiment values into classes

In [None]:
def classify_sentiment(row):
  if 0<row<=0.2:
    return 1
  elif 0.2<row<=0.4:
    return 2
  elif 0.4<row<=0.6:
    return 3
  elif 0.6<row<=0.8:
    return 4
  return 5

In [None]:
df["sentiment_class"] = df["sentiment_values"].apply(func=classify_sentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
df = df[["sentence","sentiment_class"]]

## Function to get split index

In [None]:
import math as m
def get_index(perc,len_data):
  return int(perc*len_data)

In [None]:
len_data = df.shape[0]
split = get_index(0.7,len_data)
train,test = df.loc[:split-1], df.loc[split:]
train.shape, test.shape
#test.reset_index().drop('index',axis=1)

((8298, 2), (3557, 2))

## Writing the divided dataframes into text files

In [None]:
train.to_csv("train.csv",index=False, header=False)
test.to_csv("test.csv", index=False, header=False)

## Reading train, text files to convert them into list of tuples of tweet,sentiment class

In [None]:
train_list = []
with open(r"/content/train.csv","r") as file:
  for line in file:
    train_list.append((line[:-3],int(line[-3:].strip("\n").strip(","))))

In [None]:
test_list = []
with open(r"/content/test.csv","r") as file:
  for line in file:
    test_list.append((line[:-3],int(line[-3:].strip("\n").strip(","))))

In [None]:
import random
import torch, torchtext.legacy
from torchtext.legacy import data

In [None]:
Review = data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
Label = data.LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)

In [None]:
fields = [('reviews', Review),('labels',Label)]

In [None]:
example1 = [data.Example.fromlist([train_list[i][0], train_list[i][1]], fields) for i in range(len(train_list))]
example2= [data.Example.fromlist([test_list[i][0], test_list[i][1] ], fields) for i in range(len(test_list))]

In [None]:
SEED= 1
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f26010edeb0>

In [None]:
train_set = data.Dataset(example1, fields)
test_set = data.Dataset(example2, fields)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
train_iterator, test_iterator = data.BucketIterator.splits((train_set,test_set),
                                                           batch_sizes=100, 
                                                           sort_key= lambda x:len(x.reviews),
                                                           sort_within_batch=True,
                                                           device=device)

TypeError: ignored

## Creating BucketIterators for train, test

In [None]:
train_iterator = data.BucketIterator(train_set, 
                                     batch_size=100, 
                                     sort_key=lambda x:len(x.reviews), 
                                     sort_within_batch=True, 
                                     device=device)
test_iterator =  data.BucketIterator(test_set, 
                                     batch_size=100, 
                                     sort_key=lambda x:len(x.reviews), 
                                     sort_within_batch=True, 
                                     device=device)

In [None]:
Review.build_vocab(train_set)
Label.build_vocab(train_set)
len(Review.vocab), len(Label.vocab)

(16865, 5)

## Creating class for the model

In [None]:
import torch.nn as nn
import torch.nn.functional as F
 
class classifierLSTM(nn.Module):
    
    # Define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        
        super().__init__()          
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           dropout=dropout,
                           batch_first=True,
                           bidirectional = False
                           )
        
        #GRU
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers,dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        
        # Dense layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        embedded = self.dropout(embedded)
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
        
        packed_output, (hidden, cell) = self.encoder(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
    
        # Hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)   
 
        dense_outputs = self.dropout(dense_outputs)
        
        # Final activation function softmax
        output = F.softmax(dense_outputs[0], dim=1)
            
        return output#dense_outputs[0]

## Defining hyperparamters

In [None]:
# Define hyperparameters
size_of_vocab = len(Review.vocab)
embedding_dim = 300
num_hidden_nodes = 100
num_output_nodes = 5
num_layers = 2
dropout = 0.3
lr = 1e-3
# Instantiate the model
model = classifierLSTM(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers, dropout = dropout)

In [None]:
print(model)
def count_para(model):
  return sum(p.numel() for p in model.parameters() if p.requires_grad)
print("mode has {} : trainable parameters".format({count_para}))

classifierLSTM(
  (embedding): Embedding(16865, 300)
  (encoder): LSTM(300, 100, num_layers=2, batch_first=True, dropout=0.3)
  (gru): GRU(300, 100, num_layers=2, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=100, out_features=5, bias=True)
)
mode has {<function count_para at 0x7f25583e23b0>} : trainable parameters


In [None]:
import torch.optim as optim

optmizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
def binary_loss(preds,y):
  _,predictions = torch.max(preds,1)
  correct = (predictions ==y).float()
  acc = correct.sum()/len(correct)
  return acc
model = model.to(device)
criterion = criterion.to(device)

In [None]:
import torch.nn.functional as F
def train(model, iterator, optmizer, criterion):
  epoch_loss = 0
  epoch_acc = 0
  model.train()  
  for batch in iterator:
    optmizer.zero_grad()
    review, review_lengths = batch.reviews
    predictions = model(review,review_lengths).squeeze()
    loss = criterion(predictions, batch.labels)
    acc = binary_loss(predictions,batch.labels)
    loss.backward()
    optmizer.step()
    epoch_loss+=loss.item()
    epoch_acc +=acc.item()
  return epoch_loss/len(iterator), epoch_acc/len(iterator)

In [None]:
def evaluate(model, iterator, criterion, preds_act_tup):
  epoch_loss = 0
  epoch_acc = 0
  model.eval()
  with torch.no_grad():
    for batch in iterator:
      review, review_lengths  = batch.reviews
      predictions = model(review, review_lengths).squeeze()
      preds_act_tup.append((predictions,batch.labels))
      loss = criterion(predictions, batch.labels)
      acc= binary_loss(predictions, batch.labels)
      
      epoch_loss += loss.item()
      epoch_acc += acc.item()
  return epoch_loss/len(iterator), epoch_acc/len(iterator)




In [None]:
EPOCH = 10
best_valid_loss = float('inf')

for epoch in range(EPOCH):
  train_loss, train_acc = train(model,train_iterator, optmizer, criterion)

  preds_acc_tup = []

  valid_loss, valid_acc = evaluate(model, test_iterator, criterion, preds_acc_tup)
  
  if valid_loss<best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict,"saved_weights.pt")
  
  print(f"Train loss {train_loss:0.3f}  | \tTrain accuracy {train_acc*100:0.2f}% ")
  print(f"Test loss {valid_loss:0.3f}  | \tTest accuracy {valid_acc*100:0.2f}%\n ")

Train loss 1.454  | 	Train accuracy 49.47% 
Test loss 1.356  | 	Test accuracy 55.10%
 
Train loss 1.423  | 	Train accuracy 51.49% 
Test loss 1.354  | 	Test accuracy 55.10%
 
Train loss 1.425  | 	Train accuracy 51.49% 
Test loss 1.354  | 	Test accuracy 55.08%
 
Train loss 1.430  | 	Train accuracy 51.51% 
Test loss 1.355  | 	Test accuracy 55.06%
 
Train loss 1.424  | 	Train accuracy 51.53% 
Test loss 1.355  | 	Test accuracy 55.06%
 
Train loss 1.423  | 	Train accuracy 51.58% 
Test loss 1.355  | 	Test accuracy 55.02%
 
Train loss 1.423  | 	Train accuracy 51.70% 
Test loss 1.355  | 	Test accuracy 55.05%
 
Train loss 1.425  | 	Train accuracy 51.81% 
Test loss 1.355  | 	Test accuracy 54.97%
 
Train loss 1.426  | 	Train accuracy 51.83% 
Test loss 1.355  | 	Test accuracy 54.94%
 
Train loss 1.421  | 	Train accuracy 51.93% 
Test loss 1.355  | 	Test accuracy 54.91%
 


In [None]:
sample = random.sample(test_list,10)


In [None]:
import os, pickle
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(Review.vocab.stoi, tokens)

In [None]:
#load weights and tokenizer

path='/content/saved_weights.pt'
#model.load_state_dict(torch.load(path));
model.eval();
tokenizer_file = open('./tokenizer.pkl', 'rb')
tokenizer = pickle.load(tokenizer_file)

#inference 

import spacy
nlp = spacy.load('en')

def classify_review(review):
    
    categories = {0: "Very Negative", 1: "Negative", 2: "Neutral", 3:"Positive", 4:"Very Positive"}
    
    # tokenize the review 
    tokenized = [tok.text for tok in nlp.tokenizer(review)] 
    # convert to integer sequence using predefined tokenizer dictionary
    indexed = [tokenizer[t] for t in tokenized]        
    # compute no. of words        
    length = [len(indexed)]
    # convert to tensor                                    
    tensor = torch.LongTensor(indexed).to(device)   
    # reshape in form of batch, no. of words           
    tensor = tensor.unsqueeze(1).T  
    # convert to tensor                          
    length_tensor = torch.LongTensor(length)
    # Get the model prediction                  
    prediction = model(tensor, length_tensor)

    _, pred = torch.max(prediction, 1) 
    
    return categories[pred.item()], pred.item()

In [None]:
def classify_and_print(test_list):
  for i in test_list:
    x,y = i
    cat  = {'1': "Very Negative", '2': "Negative", '3': "Neutral", '4':"Positive", '5':"Very Positive"}
    actual_label = cat[y]
    predicted_label_str, predicted_label = classify_review(x) 
    print("sentence: ", x)
    print("actual_label: ", actual_label, "\t\tpredicted_label: ", predicted_label_str)
    print("")

In [None]:
classify_and_print(sample)

sentence:  A yawn-provoking little farm melodrama .
actual_label:  Neutral 		predicted_label:  Very Negative

sentence:  It 's likely that whatever you thought of the first production -- pro or con -- you 'll likely think of this one .
actual_label:  Neutral 		predicted_label:  Very Negative

sentence:  The threat implied in the title PokÃ©mon 4ever is terrifying -- like locusts in a horde these things will keep coming .
actual_label:  Negative 		predicted_label:  Very Negative

sentence:  "After the setup , the air leaks out of the movie , flattening its momentum with about an hour to go ."
actual_label:  Neutral 		predicted_label:  Very Negative

sentence:  "Even by dumb action-movie standards , Ballistic : Ecks vs. Sever is a dumb action movie ."
actual_label:  Neutral 		predicted_label:  Very Negative

sentence:  "Had anyone here done anything remotely intelligent , we all could have stopped watching long ago ."
actual_label:  Neutral 		predicted_label:  Very Negative

sentence:  W