# **Data Extraction and Cleaning**

In [None]:
!pip install torch==1.12.1
!pip install torchdata==0.4.1

In [None]:
from torchtext.datasets import IMDB
import pandas as pd

def _encode_label(label):
    """Map a raw IMDB label to 1 (positive) or 0 (negative).

    Older torchtext releases yield the strings "pos"/"neg"; newer ones yield
    the integers 2 (positive) / 1 (negative). Any other value raises instead
    of being silently counted as a negative review.
    """
    if label in ("pos", 2):
        return 1
    if label in ("neg", 1):
        return 0
    raise ValueError(f"Unexpected IMDB label: {label!r}")


train_iter = IMDB(split='train')
test_iter = IMDB(split='test')

sentences = []
labels = []

# Both official splits are merged into one frame; the notebook re-splits it later.
for data_iter in (train_iter, test_iter):
    for label, line in data_iter:
        sentences.append(line)
        labels.append(label)

df = pd.DataFrame({"review": sentences, "sentiment": [_encode_label(each) for each in labels]})

df

In [None]:
# Descriptive statistics
# Summary of the assembled frame (columns: review, sentiment).
df.describe()

In [None]:
# Count the missing entries in each column

df.isna().sum()

In [None]:
# Duplicated reviews vs. fully duplicated rows
# If these two counts are equal, every repeated review also carries the same label

duplicated_reviews = df.review.duplicated().sum()
duplicated_rows = df.duplicated().sum()
print(duplicated_reviews, duplicated_rows)

# Remove the exact duplicate rows from the frame itself
df.drop_duplicates(inplace=True)

In [None]:
import nltk
# Fetch the NLTK resources for this notebook.
# 'stopwords' backs the stop-word list built in the next cell.
nltk.download('stopwords')
# NOTE(review): 'wordnet' and 'omw-1.4' are not referenced by the code visible
# in this notebook — presumably intended for lemmatisation; confirm before removing.
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
from nltk.corpus import stopwords

# A set gives O(1) membership tests; clean_text checks every token against it.
stop = set(stopwords.words('english'))

In [None]:
# Contraction -> expanded-form lookup table, consumed token by token in clean_text.
# Keys are matched against whole whitespace-separated tokens.
mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", 
           "'cause": "because", "could've": "could have", "couldn't": "could not", 
           "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
           "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", 
           "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", 
           "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
           "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", 
           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have",
           "i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", 
           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
           "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", 
           "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
           "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", 
           "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", 
           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", 
           "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", 
           "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
           "she's": "she is", "should've": "should have", "shouldn't": "should not", 
           "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is",
           "that'd": "that would", "that'd've": "that would have", "that's": "that is", 
           "there'd": "there would", "there'd've": "there would have", "there's": "there is", 
           "here's": "here is","they'd": "they would", "they'd've": "they would have", 
           "they'll": "they will", "they'll've": "they will have", "they're": "they are", 
           "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", 
           "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", 
           "we're": "we are", "we've": "we have", "weren't": "were not", 
           "what'll": "what will", "what'll've": "what will have","what're": "what are",  
           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", 
           "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", 
           "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", 
           "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", 
           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", 
           "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have",
           "y'all're": "you all are","y'all've": "you all have","you'd": "you would", 
           "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", 
           "you're": "you are", "you've": "you have" }

In [None]:
from bs4 import BeautifulSoup
import string
import re

# Function to clean data

# Compiled once at definition time instead of on every call.
# NOTE(review): the last range (U+24C2..U+1F251) is very wide and also spans
# CJK ideographs, so non-emoji characters in that range are stripped as well.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)


def clean_text(text, lemmatize=True):
    """Normalise a raw review into lower-case, space-separated alphabetic tokens.

    Steps, in order: strip HTML tags, expand contractions through ``mapping``,
    drop emoji/pictographs, remove URLs, insert a space after full stops that
    are glued to the next word, strip punctuation, lower-case, and finally
    drop stop words and tokens that are not purely alphabetic.

    Args:
        text: Raw review text, possibly containing HTML markup.
        lemmatize: Accepted for backward compatibility; this function performs
            no lemmatisation, so the flag has no effect.

    Returns:
        The cleaned review as a single space-joined string.
    """
    # separator=" " keeps words apart when a tag (e.g. <br />) was the only
    # thing between them; otherwise "good<br />film" would become "goodfilm".
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    # Expand contractions. Fall back to a lower-cased lookup so capitalised
    # forms such as "Don't" are expanded too (the text is lower-cased later).
    text = ' '.join(mapping.get(t, mapping.get(t.lower(), t)) for t in text.split(" "))

    text = _EMOJI_PATTERN.sub(r'', text)

    # URLs must be removed BEFORE spacing out full stops: otherwise
    # "http://a.b.com" is split at the first dot and only its head is removed.
    text = re.sub(r'http\S+', '', text)     # Remove urls
    text = re.sub(r'\.(?=\S)', '. ', text)  # Add space after full stop

    # Remove punctuation and lower-case in one pass
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()

    # Keep alphabetic, non-stop-word tokens only
    return " ".join([word for word in text.split() if word not in stop and word.isalpha()])

In [None]:
# Replace every raw review with its cleaned form
df['review'] = df['review'].map(clean_text)

# **Data analysis** 

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Count plot of the two sentiment classes (1 = positive, 0 = negative).
# order=[1, 0] puts the positive bar first, matching the tick labels below.

sns.set(style="whitegrid", font_scale=1.2)
sns.countplot(x=df.sentiment, palette=['green','red'], order=[1, 0])
plt.xticks(ticks=np.arange(2), labels=['positive','negative'])
plt.title('Sentiment count for IMDB reviews')  # dataset name was misspelled "IMBD"
plt.show()

In [None]:
# value_counts() is indexed by the label value: 1 = positive, 0 = negative.
# (The labels were previously swapped: index 0 was reported as "Positive".)
sentiment_counts = df['sentiment'].value_counts()
n_positive = int(sentiment_counts.get(1, 0))
n_negative = int(sentiment_counts.get(0, 0))

print('Positive reviews are', n_positive, 'i.e.', round(n_positive/len(df)*100, 2), '% of the dataset')
print('Negative reviews are', n_negative, 'i.e.', round(n_negative/len(df)*100, 2), '% of the dataset')

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Word cloud built from all positive (sentiment == 1) reviews

positive_data = df.loc[df.sentiment == 1, 'review']
positive_data_string = ' '.join(positive_data)

cloud_options = dict(max_words=2000, width=1200, height=600, background_color="white")
wc = WordCloud(**cloud_options).generate(positive_data_string)

plt.figure(figsize=(20, 20))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Word cloud built from all negative (sentiment == 0) reviews

negative_data = df.loc[df.sentiment == 0, 'review']
negative_data_string = ' '.join(negative_data)

cloud_options = dict(max_words=2000, width=1200, height=600, background_color="white")
wc = WordCloud(**cloud_options).generate(negative_data_string)

plt.figure(figsize=(20, 20))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Characters per cleaned review, one histogram per class
text_len_pos = positive_data.str.len()
text_len_neg = negative_data.str.len()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
panels = (
    (ax1, text_len_pos, 'green', 'Positive Reviews'),
    (ax2, text_len_neg, 'red', 'Negative Reviews'),
)
for axis, lengths, colour, title in panels:
    axis.hist(lengths, color=colour)
    axis.set_title(title)
    axis.set_xlabel('Number of Characters')
    axis.set_ylabel('Count')
fig.suptitle('Number of characters in texts')
plt.show()

In [None]:
# Words per cleaned review, one histogram per class
text_len_pos = positive_data.str.split().map(len)
text_len_neg = negative_data.str.split().map(len)


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
panels = (
    (ax1, text_len_pos, 'green', 'Positive Reviews'),
    (ax2, text_len_neg, 'red', 'Negative Reviews'),
)
for axis, counts, colour, title in panels:
    axis.hist(counts, color=colour)
    axis.set_title(title)
    axis.set_xlabel('Number of Words')
    axis.set_ylabel('Count')
fig.suptitle('Number of words in texts')
plt.show()

In [None]:
# Density (with KDE overlay) of the number of words per review, per class
word_pos = positive_data.str.split().apply(len)
word_neg = negative_data.str.split().apply(len)


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
panels = (
    (ax1, word_pos, "green", 'Positive Reviews'),
    (ax2, word_neg, "red", 'Negative Reviews'),
)
for axis, counts, colour, title in panels:
    sns.histplot(counts, ax=axis, color=colour, kde=True, stat="density", linewidth=0)
    axis.set_title(title)
    axis.set_xlabel('Number of words per review')
fig.suptitle('Distribution of number of words per reviews')
plt.show()



# **Predictive Modelling using LSTM** 

In [None]:
from torch.utils.data import Dataset
import torch

def prepare_sequence(rev, vocab, maxlen):
    """Encode a review as a fixed-length tensor of vocabulary indices.

    Each whitespace-separated word is looked up in ``vocab``; words that are
    absent map to ``vocab["UNK"]``. The result is cut to ``maxlen`` entries
    or right-padded with ``vocab["PAD"]`` up to ``maxlen``.

    Returns:
        A 1-D ``torch.long`` tensor with ``maxlen`` elements.
    """
    encoded = []
    for token in rev.split():
        if token in vocab:
            encoded.append(vocab[token])
        else:
            encoded.append(vocab["UNK"])

    # Truncate first, then pad whatever is still missing
    encoded = encoded[:max(maxlen, 0)]
    missing = maxlen - len(encoded)
    if missing > 0:
        encoded.extend([vocab["PAD"]] * missing)

    return torch.tensor(encoded, dtype=torch.long)


class FilmReviewDataset(Dataset):
    """Map-style dataset pairing encoded reviews with their sentiment labels."""

    def __init__(self, df, vocab, maxlen):
        """Encode every ``df.review`` with ``prepare_sequence`` and store the labels.

        Args:
            df: Frame exposing ``review`` and ``sentiment`` columns.
            vocab: Word -> index mapping forwarded to ``prepare_sequence``.
            maxlen: Fixed sequence length forwarded to ``prepare_sequence``.
        """
        super().__init__()

        encoded_reviews = df.review.apply(prepare_sequence, args=(vocab, maxlen))
        self.review = encoded_reviews.tolist()
        self.sentiment = df.sentiment.tolist()

    def __getitem__(self, index):
        """Return the ``(encoded review, sentiment)`` pair stored at ``index``."""
        return self.review[index], self.sentiment[index]

    def __len__(self):
        """Number of stored reviews."""
        return len(self.review)

    def get_sentiment(self):
        """Return the list of all sentiment labels."""
        return self.sentiment


In [None]:
from sklearn.model_selection import train_test_split

# Split into train (80%), then halve the remaining 20% into test and validation

SEED = 11
test_size = 0.2

train, held_out = train_test_split(df, test_size=test_size, random_state=SEED)

test, val = train_test_split(held_out, test_size=0.5, random_state=SEED)

In [None]:
# Build the vocabulary and the longest-review length from the training split only

vocab = {"PAD": 0, "UNK": 1}
maxlen = 0
for rev in train["review"]:
    tokens = rev.split()
    maxlen = max(maxlen, len(tokens))

    for word in tokens:
        # First sighting of a word: give it the next free index
        vocab.setdefault(word, len(vocab))

print(maxlen, len(vocab))

trainset = FilmReviewDataset(train, vocab, maxlen)
valset = FilmReviewDataset(val, vocab, maxlen)
testset = FilmReviewDataset(test, vocab, maxlen)

Documentation used to build this model:


1.   [nn.Embedding](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html)
2.   [nn.LSTM](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html)
3.   [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html?highlight=linear#torch.nn.Linear)





In [None]:
from tqdm import tqdm
from torch import nn
import torch
import copy

class FilmModel(nn.Module):
    """Bidirectional-LSTM binary classifier over fixed-length index sequences.

    Pipeline: embedding -> bidirectional LSTM -> flatten every time step ->
    one-logit linear layer. The model returns raw logits, so it is meant to be
    paired with a logit-based loss such as ``nn.BCEWithLogitsLoss``; a sigmoid
    is applied only when hard predictions are needed.
    """

    def __init__(self, maxlen, vocab_size, embedding_dim, hidden_dim):
        """
        Args:
            maxlen: Length L of every input sequence (fixes the linear layer size).
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each word embedding.
            hidden_dim: LSTM hidden size per direction.
        """
        super().__init__()

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Inputs are batches of shape (N, L), so embeddings are (N, L, E).
        # batch_first=True is required: without it nn.LSTM reads dim 0 (the
        # batch) as the time axis and mixes different reviews together.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)

        # LSTM output is (N, L, D * Hout) with D = 2 for a bidirectional LSTM;
        # it is flattened to (N, L * D * Hout) before the classification layer.
        num_directions = 2
        self.cls = nn.Linear(maxlen * num_directions * hidden_dim, 1)

    def forward(self, idxs):
        """Return one logit per sequence, shape (N, 1), for index batch ``idxs`` (N, L)."""
        embeds = self.word_embeddings(idxs)
        lstm_out, _ = self.lstm(embeds)

        # reshape instead of view: works whether or not the output is contiguous
        return self.cls(lstm_out.reshape(len(idxs), -1))

    @staticmethod
    def _count_correct(logits, labels):
        """Count predictions (rounded sigmoid of ``logits``) that equal ``labels``."""
        predictions = torch.round(torch.sigmoid(logits.reshape(-1)))
        return int((predictions == labels.reshape(-1)).sum().item())

    def train_classifier(self, trainloader, valloader, epochs, criterion, optimizer, device):
        """Train for ``epochs`` epochs, checkpointing the best validation loss.

        After each epoch the model is evaluated on ``valloader``; whenever the
        validation loss improves, the weights are written to ``tuned_models.pt``.

        Returns:
            ``(train_losses, val_losses, train_accs, val_accs)`` — one entry
            per epoch; losses are batch means, accuracies are percentages.
        """
        train_losses = []
        train_accs = []
        val_losses = []
        val_accs = []

        best_epoch = 0
        # Plain float infinity: no dependency on numpy (np.Inf is gone in NumPy 2)
        best_loss = float("inf")

        for ep in range(epochs):
            self.train()
            running_loss = 0.0
            correct = 0
            total = 0

            for revs, labels in tqdm(trainloader):
                revs = revs.to(device)
                labels = labels.to(device)

                # Forward pass
                optimizer.zero_grad()
                logits = self(revs)
                loss = criterion(logits.squeeze(-1), labels.float())
                running_loss += loss.item()

                # Backpropagation
                loss.backward()
                optimizer.step()

                # Accuracy
                correct += self._count_correct(logits.detach(), labels)
                total += len(labels)

            train_loss = running_loss/len(trainloader)
            train_losses.append(train_loss)

            epoch_acc = correct*100/total
            train_accs.append(epoch_acc)

            # Validation
            val_loss, val_acc = self.validation(valloader, criterion, device)

            val_losses.append(val_loss)
            val_accs.append(val_acc)

            print(f"\nEpoch {ep+1}")
            if val_loss < best_loss:
                print(f"\tBest validation loss improved from {round(best_loss, 3)} to {round(val_loss, 3)}\n")
                torch.save(self.state_dict(), "tuned_models.pt")

                best_loss = val_loss
                best_epoch = ep + 1

            print(f"\tTrain Loss {round(train_loss, 3)} - Train Accuracy {round(epoch_acc, 2)}%")
            print(f"\tValid Loss {round(val_loss, 3)} - Valid Accuracy {round(val_acc, 2)}%\n")

        print(f"Best model at epoch {best_epoch} saved in tuned_models.pt")
        return train_losses, val_losses, train_accs, val_accs

    def validation(self, dataloader, criterin, device):
        """Evaluate on ``dataloader`` without updating weights.

        Args:
            dataloader: Iterable of ``(reviews, labels)`` batches.
            criterin: Loss callable taking ``(logits, float labels)``. The
                spelling is kept so existing keyword callers keep working.
            device: Device the batches are moved to.

        Returns:
            ``(mean batch loss, accuracy in percent)``.
        """
        self.eval()

        running_loss = 0
        correct = 0
        total = 0

        with torch.no_grad():
            for revs, labels in tqdm(dataloader):
                revs = revs.to(device)
                labels = labels.to(device)

                logits = self(revs)

                # Use the loss that was passed in, not a same-named global
                loss = criterin(logits.squeeze(-1), labels.float())
                running_loss += loss.item()

                correct += self._count_correct(logits, labels)
                total += len(labels)

        val_loss = running_loss/len(dataloader)
        val_acc = correct*100/total

        return val_loss, val_acc

    def predict(self, dataloader, device):
        """Return a flat list of 0/1 predictions for every sample in ``dataloader``.

        Works for any batch size; labels yielded by the loader are ignored.
        """
        self.eval()

        predictions = []
        with torch.no_grad():
            for revs, _ in tqdm(dataloader):
                revs = revs.to(device)

                probs = torch.sigmoid(self(revs).reshape(-1))
                predictions.extend(torch.round(probs).long().cpu().tolist())

        return predictions

In [None]:
# Model hyper-parameters: embedding size and LSTM hidden size per direction
EMB_DIM, LSTM_DIM = 32, 64

# Use the first GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

model = FilmModel(maxlen, len(vocab), EMB_DIM, LSTM_DIM)
model = model.to(device)

In [None]:
from torch import optim

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss().to(device)

batch_size = 64

trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size)
# Validate on the validation split. This loader was previously built from
# testset, which let the test data drive checkpoint selection during training.
valloader = torch.utils.data.DataLoader(valset, batch_size=1)
testloader = torch.utils.data.DataLoader(testset, batch_size=1)

epochs = 10
train_losses, val_losses, train_accs, val_accs = model.train_classifier(trainloader, valloader, epochs, criterion, optimizer, device)

In [None]:
# Per-epoch loss for the training and validation sets
epoch_axis = range(1, epochs+1)
for series, name in ((train_losses, "Training"), (val_losses, "Validation")):
    plt.plot(epoch_axis, series, label=name)
plt.xlabel("No. of Epoch")
plt.ylabel("Loss")
plt.title("Training Loss")
plt.legend(loc="center right")
plt.show()

In [None]:
# Per-epoch accuracy for the training and validation sets
epoch_axis = range(1, epochs+1)
for series, name in ((train_accs, "Training"), (val_accs, "Validation")):
    plt.plot(epoch_axis, series, label=name)
plt.xlabel("No. of Epoch")
plt.ylabel("Accuracy %")
plt.title("Training Accuracy")
plt.legend(loc="upper left")
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Rebuild the architecture and restore the best checkpoint written during training.
model = FilmModel(maxlen, len(vocab), EMB_DIM, LSTM_DIM)
# map_location lets a checkpoint saved on GPU load on a CPU-only machine (and vice versa)
model.load_state_dict(torch.load("tuned_models.pt", map_location=device))
model.to(device)

predictions = model.predict(testloader, device)

print()
print(classification_report(testset.sentiment, predictions))