In [1]:
# data manipulation
import pandas as pd
import numpy as np

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# text processing
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
stopwords = set(stopwords.words('english'))

# pytorch
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

# sklearn
from sklearn.metrics import classification_report, confusion_matrix

# utils
import os
from tqdm import tqdm
tqdm.pandas()
from collections import Counter

# WordNet
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/

In [2]:
class IMDBDataset:
    def __init__(self, dataset_path):
        self.raw_df = pd.read_csv(dataset_path)

    def do_preprocessing(self):
        self.raw_df["sentiment"] = (self.raw_df["sentiment"] == "positive").astype(int)
        self.raw_df = self.__clean_text(self.raw_df, "review")
        self.raw_df["review"] = self.raw_df["review"].apply(self.__preprocess_text)

        return self.raw_df

    def __preprocess_text(self, text):
        tokens = word_tokenize(text)
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(t) for t in tokens]
        tokens = [t for t in tokens if t not in stopwords]

        return " ".join(tokens)

    def __clean_text(self, df, col_name):
        df[col_name] = df[col_name].apply(self.__remove_links)
        df[col_name] = df[col_name].apply(self.__remove_tags)
        df[col_name] = df[col_name].apply(self.__remove_extra_whitespace)
        df[col_name] = df[col_name].apply(self.__remove_numbers)
        df[col_name] = df[col_name].apply(self.__remove_punctuations)

        return df

    def __remove_punctuations(self, text):
        return re.sub(r"[^\w\s]", "", text)

    def __remove_links(self, text):
        return re.sub(r"http\S+|www\S+", "", text)

    def __remove_tags(self, text):
        return re.sub(r"<[^>]+>", "", text)

    def __remove_extra_whitespace(self, text):
        return re.sub(r"\s+", " ", text).strip()

    def __remove_numbers(self, text):
        return re.sub(r"\d+", "", text)

In [3]:
imdb_dataset = IMDBDataset(dataset_path="/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
preprocessed_df = imdb_dataset.do_preprocessing()
preprocessed_df.head()

Unnamed: 0,review,sentiment
0,One reviewer ha mentioned watching Oz episode ...,1
1,A wonderful little production The filming tech...,1
2,I thought wa wonderful way spend time hot summ...,1
3,Basically family little boy Jake think zombie ...,0
4,Petter Matteis Love Time Money visually stunni...,1


In [26]:
def make_dictionary(preprocessed_df):
    
    all_words = ' '.join(preprocessed_df.review.values).split()
    counter = Counter(all_words)
    vocab = sorted(counter, key=counter.get, reverse=True)

    idx_to_token = dict(enumerate(vocab, 1))
    idx_to_token[0] = "<SEP>"
    
    token_to_idx = {token: idx for idx, token in idx_to_token.items()}
    
    return idx_to_token, token_to_idx

def pad_sequence(sequence, seq_length):
    if len(sequence) >= seq_length:
        return sequence[:seq_length]
    else:
        padded_sequence = sequence + [0] * (seq_length - len(sequence))
        return padded_sequence

In [27]:
idx_to_token, token_to_idx = make_dictionary(preprocessed_df)
preprocessed_df["tokens"] = preprocessed_df["review"].apply(lambda x: [token_to_idx[token] for token in x.split()])

# display(preprocessed_df.head())
# lens = [len(tokens) for tokens in preprocessed_df["tokens"]]
# print(lens[:200])

In [28]:
seq_length = 256
preprocessed_df["tokens"] = preprocessed_df["tokens"].apply(lambda x: pad_sequence(x, seq_length))

display(preprocessed_df.head())
# lens = [len(tokens) for tokens in preprocessed_df["tokens_padded"]]
# print(lens[:200])

Unnamed: 0,review,sentiment,tokens,tokens_padded
0,One reviewer ha mentioned watching Oz episode ...,1,"[227, 1054, 8, 943, 81, 3772, 184, 439, 2971, ...","[227, 1054, 8, 943, 81, 3772, 184, 439, 2971, ..."
1,A wonderful little production The filming tech...,1,"[60, 318, 55, 246, 5, 1242, 1613, 17848, 89094...","[60, 318, 55, 246, 5, 1242, 1613, 17848, 89094..."
2,I thought wa wonderful way spend time hot summ...,1,"[1, 101, 3, 318, 27, 1009, 9, 896, 1753, 2510,...","[1, 101, 3, 318, 27, 1009, 9, 896, 1753, 2510,..."
3,Basically family little boy Jake think zombie ...,0,"[2724, 141, 55, 249, 3460, 34, 677, 4406, 593,...","[2724, 141, 55, 249, 3460, 34, 677, 4406, 593,..."
4,Petter Matteis Love Time Money visually stunni...,1,"[89098, 35004, 1123, 1977, 7469, 2188, 1279, 4...","[89098, 35004, 1123, 1977, 7469, 2188, 1279, 4..."


In [25]:
y = np.array(preprocessed_df["sentiment"].tolist())
X = np.array(preprocessed_df["tokens"].tolist())

train_frac, val_frac, test_frac = 0.7, 0.1, 0.2
train_size = int(train_frac*len(X))
val_size = int(val_frac*len(X))

X_train, y_train = X[:train_size], y[:train_size]
X_val, y_val = X[train_size:train_size+val_size], y[train_size:train_size+val_size]
X_test, y_test = X[train_size+val_size:], y[train_size+val_size:]

print("train split: ", X_train.shape, y_train.shape)
print("val split: ", X_val.shape, y_val.shape)
print("test split: ", X_test.shape, y_test.shape)

train:  (35000, 256) (35000,)
val:  (5000, 256) (5000,)
test:  (10000, 256) (10000,)
