In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from unidecode import unidecode

from gensim import corpora
from gensim.utils import simple_preprocess

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# Load the zipped IMDB review archive into a DataFrame
# (the info() output in the next cell shows 50,000 rows with `review` and `sentiment` columns).
df = pd.read_csv('imdb-dataset-of-50k-movie-reviews.zip')

In [3]:
# Sanity-check the schema and null counts, then confirm how the labels are balanced.
df.info()
df['sentiment'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


positive    25000
negative    25000
Name: sentiment, dtype: int64

In [4]:
# Summary statistics of review length in characters.
df['review'].str.len().describe()

count    50000.000000
mean      1309.431020
std        989.728014
min         32.000000
25%        699.000000
50%        970.000000
75%       1590.250000
max      13704.000000
Name: review, dtype: float64

In [5]:
def preprocess_df(df):
    """Clean and tokenise the review text and encode the sentiment labels.

    The input frame is left untouched: all work happens on a copy, so the raw
    data stays available and re-running the cell gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``review`` column (text, possibly with HTML markup) and
        a ``sentiment`` column holding ``"positive"`` / ``"negative"``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``review`` stripped of HTML and diacritics, an added
        ``review-tokenized`` column (output of gensim's ``simple_preprocess``),
        and ``sentiment`` with ``"positive"`` -> 1 and ``"negative"`` -> 0.
    """
    # Work on a copy so the caller's DataFrame is not mutated as a side effect.
    df = df.copy()

    print("Removing HTML tags")
    # Join text fragments with a space: without a separator, words on either
    # side of a tag such as <br /> are glued together ("end<br />Start" -> "endStart").
    df['review'] = df['review'].apply(
        lambda t: BeautifulSoup(t, 'html.parser').get_text(separator=' ')
    )

    print("Removing diacritics")
    df['review'] = df['review'].apply(unidecode)

    print('Applying Gensim simple preprocessing')
    df['review-tokenized'] = df['review'].apply(simple_preprocess)

    print('positive -> 1, negative -> 0')
    df['sentiment'] = df['sentiment'].replace({"positive": 1, "negative": 0})
    return df

In [6]:
def train_val_test_split(df, train_end=17500, val_end=25000, test_start=40000,
                         random_state=None):
    """Shuffle ``df`` and slice it into train / validation / test frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; it is not modified.
    train_end : int, default 17500
        Training rows are shuffled positions ``[0, train_end)``.
    val_end : int, default 25000
        Validation rows are shuffled positions ``[train_end, val_end)``.
    test_start : int, default 40000
        Test rows are shuffled positions ``[test_start, len(df))``.
        Rows in ``[val_end, test_start)`` are not returned in any split.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample``; pass an int for a reproducible
        shuffle. ``None`` keeps the shuffle non-deterministic.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validation, test)``.

    Raises
    ------
    ValueError
        If the boundaries are negative or not in non-decreasing order, which
        would make the splits overlap.
    """
    if not 0 <= train_end <= val_end <= test_start:
        raise ValueError(
            "expected 0 <= train_end <= val_end <= test_start, got "
            f"{train_end}, {val_end}, {test_start}"
        )

    # Shuffle every row, then renumber so positional slices are contiguous.
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    train = shuffled.iloc[:train_end]
    validation = shuffled.iloc[train_end:val_end]
    test = shuffled.iloc[test_start:]
    return train, validation, test

In [7]:
def prepare_sequence(seq, to_ix):
    """Translate a token sequence into a long tensor of vocabulary indices.

    Each element of ``seq`` is looked up in the mapping ``to_ix``; a token
    missing from the mapping raises ``KeyError``.
    """
    indices = []
    for token in seq:
        indices.append(to_ix[token])
    return torch.tensor(indices, dtype=torch.long)

In [8]:
# Clean and tokenise the reviews, then shuffle and slice the result into the three splits.
train, validation, test = train_val_test_split(preprocess_df(df))

Removing HTML tags
Removing diacritics
Applying Gensim simple preprocessing
positive -> 1, negative -> 0


In [9]:
# Build the vocabulary from the training split only, so validation/test tokens
# do not influence it.
vocab = corpora.Dictionary(train['review-tokenized']) # like a dictionary that's easy to use both ways

In [10]:
# Number of entries in the training vocabulary; reused below as the model's input dimension.
vocab_size = len(vocab)
vocab_size

65692

In [11]:
class RNN(nn.Module):
    """Embedding -> single-layer RNN -> linear classifier over the final hidden state.

    Parameters
    ----------
    input_dim : int
        Number of rows in the embedding table (vocabulary size).
    embedding_dim : int
        Size of each token embedding.
    hidden_dim : int
        Size of the RNN hidden state.
    output_dim : int
        Number of output classes.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return per-class log-probabilities for ``text``.

        Parameters
        ----------
        text : torch.LongTensor
            Token indices laid out sequence-first, i.e. ``(seq_len, batch)``,
            because ``nn.RNN`` is built with its default ``batch_first=False``.

        Returns
        -------
        torch.Tensor
            Log-probabilities of shape ``(batch, output_dim)``; suitable as
            input to ``nn.NLLLoss``.
        """
        embedded = self.embedding(text)
        _, hidden = self.rnn(embedded)
        # hidden[-1] is the final state of the (only) RNN layer.
        logits = self.linear(hidden[-1])
        # Derive the shape from the logits instead of hard-coding (1, 2), so any
        # batch size and any output_dim work; a single sample still yields (1, output_dim).
        logits = logits.view(-1, logits.size(-1))
        return F.log_softmax(logits, dim=1)

In [12]:
# Model hyperparameters.
input_dim = vocab_size  # size of the embedding table: one row per vocabulary entry
embedding_dim = 25
hidden_dim = 256
output_dim = 2  # two sentiment classes (labels are encoded as 0 / 1)

model = RNN(input_dim, embedding_dim, hidden_dim, output_dim)
optimizer = optim.SGD(model.parameters(), lr=1e-3)
# NLLLoss expects log-probabilities, which is what RNN.forward returns via log_softmax.
loss_function = nn.NLLLoss()

In [13]:
# Number of passes made by the training loop.
epochs = 5

In [14]:
# Summed loss of each epoch, in order; printed after every epoch to track progress.
epoch_losses = []

for epoch in range(epochs):
    print("EPOCH", epoch)
    epoch_loss = 0
    # Draw ONE random sample per epoch and read both columns from it.
    # Sampling each column separately pairs every review with the label of an
    # unrelated row, so the model would be trained on random targets.
    batch = train.sample(50)
    for text, sentiment in zip(batch["review-tokenized"], batch["sentiment"]):
        model.zero_grad()
        text_in = prepare_sequence(text, vocab.token2id)
        target = torch.tensor([int(sentiment)], dtype=torch.long)
        # Call the module itself (not .forward) so nn.Module hooks are honoured;
        # view(-1, 1) lays the tokens out as (seq_len, batch=1).
        prediction = model(text_in.view(-1, 1)).view(1, -1)
        loss = loss_function(prediction, target)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    epoch_losses.append(epoch_loss)
    print(epoch_losses)

EPOCH 0
[35.10572922229767]
EPOCH 1
[35.10572922229767, 35.00071334838867]
EPOCH 2
[35.10572922229767, 35.00071334838867, 35.40304356813431]
EPOCH 3
[35.10572922229767, 35.00071334838867, 35.40304356813431, 35.342559814453125]
EPOCH 4
[35.10572922229767, 35.00071334838867, 35.40304356813431, 35.342559814453125, 34.431629061698914]
