In [81]:
# data manipulation
import pandas as pd
import numpy as np

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# text processing
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
# Fetch the NLTK resources requested by name: the stop-word lists and WordNet.
# NOTE(review): `word_tokenize` also needs the "punkt" tokenizer data, which is
# not downloaded here -- presumably preinstalled in this environment; confirm.
nltk.download('stopwords')
nltk.download('wordnet')
# NOTE: this rebinds the name `stopwords` from the imported corpus reader to a
# plain set of English stop words. IMDBDataset reads this global when it
# filters tokens, so the name must stay bound to the set from here on.
stopwords = set(stopwords.words('english'))

# pytorch
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

# sklearn
from sklearn.metrics import classification_report, confusion_matrix

# utils
import os
from tqdm import tqdm
# Register tqdm's pandas integration (enables `progress_apply` on pandas objects).
tqdm.pandas()
from collections import Counter

# WordNet: unpack the corpus archive next to the zip so the WordNet data is
# available as plain files under corpora/wordnet/.
#   -n  never overwrite files that already exist, so re-running this cell does
#       not stop at unzip's interactive "replace?" prompt (a notebook cell has
#       no usable stdin to answer it)
#   -q  quiet: do not print one "inflating: ..." line per extracted file
!unzip -n -q /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/

In [None]:
class IMDBDataset:
    """Load the IMDB review CSV and turn it into model-ready text and labels.

    The CSV is expected to provide a ``review`` column (free text) and a
    ``sentiment`` column holding the strings ``"positive"`` / ``"negative"``.

    Attributes:
        raw_df: the DataFrame read from ``dataset_path``. After
            ``do_preprocessing`` has run it holds the preprocessed frame.
    """

    def __init__(self, dataset_path):
        """Read the dataset CSV found at ``dataset_path``."""
        self.raw_df = pd.read_csv(dataset_path)
        # Guards against running the pipeline twice on the same data.
        self._is_preprocessed = False
        # Built once and reused for every review instead of once per row.
        self._lemmatizer = WordNetLemmatizer()

    def do_preprocessing(self):
        """Encode the labels and clean, tokenise and lemmatise the reviews.

        ``sentiment`` becomes 1 for ``"positive"`` and 0 for anything else;
        ``review`` becomes a space-joined string of lemmatised tokens with
        links, HTML tags, digits, punctuation and English stop words removed.

        The call is idempotent: repeated calls return the already processed
        frame. Re-running the pipeline on its own output would compare the
        encoded integers with ``"positive"`` and silently zero every label.

        Returns:
            The preprocessed DataFrame (the same object stored in ``raw_df``).
        """
        if self._is_preprocessed:
            return self.raw_df

        # Work on a copy so a failure half-way through cannot leave `raw_df`
        # in a partially processed state.
        df = self.raw_df.copy()
        df["sentiment"] = (df["sentiment"] == "positive").astype(int)
        df = self.__clean_text(df, "review")
        df["review"] = df["review"].apply(self.__preprocess_text)

        self.raw_df = df
        self._is_preprocessed = True
        return self.raw_df

    def __preprocess_text(self, text):
        """Tokenise ``text``, drop stop words and lemmatise what remains.

        Stop words are matched case-insensitively and *before* lemmatisation:
        the stop-word list is lowercase, and the default (noun) lemmatiser
        rewrites several stop words into non-stop-words (e.g. "was" -> "wa"),
        which would otherwise survive the filter. Lemmas that are themselves
        stop words are dropped as well.

        Returns:
            The surviving lemmas joined by single spaces.
        """
        lemmas = []
        for token in word_tokenize(text):
            if token.lower() in stopwords:
                continue
            lemma = self._lemmatizer.lemmatize(token)
            if lemma.lower() not in stopwords:
                lemmas.append(lemma)

        return " ".join(lemmas)

    def __clean_text(self, df, col_name):
        """Strip links, tags, digits and punctuation from ``df[col_name]``.

        The column is modified in place on ``df``. Whitespace is normalised
        last so the gaps left by the earlier removals are collapsed too.

        Returns:
            The same ``df`` object, for chaining.
        """
        steps = (
            self.__remove_links,
            self.__remove_tags,
            self.__remove_numbers,
            self.__remove_punctuations,
            self.__remove_extra_whitespace,
        )
        column = df[col_name]
        for step in steps:
            column = column.apply(step)
        df[col_name] = column

        return df

    def __remove_punctuations(self, text):
        """Delete every character that is neither a word character nor whitespace."""
        return re.sub(r"[^\w\s]", "", text)

    def __remove_links(self, text):
        """Delete URL-like runs starting with ``http`` or ``www``."""
        return re.sub(r"http\S+|www\S+", "", text)

    def __remove_tags(self, text):
        """Replace HTML tags with a space.

        A space (not the empty string) keeps the words on either side of a
        tag apart: "end.<br /><br />Next" must not collapse into "endNext".
        """
        return re.sub(r"<[^>]+>", " ", text)

    def __remove_extra_whitespace(self, text):
        """Collapse whitespace runs to one space and trim both ends."""
        return re.sub(r"\s+", " ", text).strip()

    def __remove_numbers(self, text):
        """Delete all runs of digits."""
        return re.sub(r"\d+", "", text)

In [82]:
# Read the Kaggle IMDB 50k-review CSV, run the preprocessing pipeline on it,
# and show the first rows of the result as this cell's output.
imdb_dataset = IMDBDataset(
    dataset_path="/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv",
)
preprocessed_df = imdb_dataset.do_preprocessing()
preprocessed_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
