In [None]:
%pip install contractions

import asyncio
import html
import json
import os
import re
from collections import Counter

import contractions
import nest_asyncio
import nltk
import numpy as np
import pandas as pd
from keras.optimizers import Adam
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import label_binarize



In [None]:
# fetch the NLTK resources first (presumably the tokenizer and WordNet data used by the
# text-cleaning helpers further down)
nltk.download('punkt_tab')
nltk.download('wordnet')

# load the raw reviews; the file holds one JSON object per line
reviews = pd.read_json('IMDB_reviews.json', lines=True)

# hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
reviews_train, reviews_test = train_test_split(
    reviews,
    test_size=0.2,
    random_state=3244,
)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# apply nest_asyncio to enable nested event loops
# NOTE(review): no asyncio code is visible in this part of the notebook - confirm this call is still needed
nest_asyncio.apply()

# custom stop-words: removed words with negative connotations, he/she/they/them, modal verbs, intensity/polarity words, explanatory words
# (presumably those word groups were taken OUT of a standard stop-word list so they stay in the text - TODO confirm)
# Every token listed here is dropped by custom_stopwords().
# NOTE(review): the later entries include "good", "bad", "great", "love", "better", "never", "really" and "much",
# which look like polarity/intensity words - confirm they are really meant to be filtered out.
# NOTE(review): entries containing an apostrophe ("i'd", "it's", ...) cannot match when the 'special' step is
# enabled, because preprocess_text replaces apostrophes with spaces before it runs the 'custom' step.
custom_stop_words = ['about', 'above', 'after', 'an', 'and', 'any', 'as', 'be', 'been', 'before', 'being', 'below',
                    'between', 'both', 'by', 'does', 'doing', 'down', 'during', 'each', 'few', 'from', 'further',
                    'had', 'has', 'having', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his',   "i'd", 'if',
                    "i'll", "i'm", 'in', 'into', 'is', 'isn',  'it', "it'd", "it'll", "it's", 'its', 'itself', "i've",
                    'me',  'more', 'most', 'myself',  'nor', 'now', 'of', 'off', 'on', 'once', 'only', 'or', 'other',
                    'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same',  'some', 'such', 'than', 'that',
                    "that'll",  'their', 'theirs', 'then', 'there', 'these',  'this', 'those', 'through', 'under',
                    'until', 'up', 'was', 'we', "we'd", "we'll", "we're", 'were',  "we've", 'which', 'while', 'who',
                    'whom',  "you'd", "you'll", "you're", 'yours', 'yourself', 'yourselves', "you've", "film", "movie",
                    "character", "story", "show", "time", "make", "see", "think", "even", "way", "one", "will", "much",
                    "really", "good", "bad", "well", "people", "great", "work", "watch", "look", "better", "take",
                    "love", "life", "actor", "performance", "scene", "director", "world", "feel", "first", "know",
                    "little", "still", "want", "thing", "going", "part", "end", "made", "lot", "man", "quite", "never",
                    'actually', 'maybe', 'though', 'always', 'find', 'fun']

# given that there are no null values in the dataset, we only check for duplicates
def duplicates(data):
    """Return ``data`` without fully duplicated rows.

    The first occurrence of each row is kept and the original index labels are
    preserved. The input frame is left untouched: the previous in-place drop
    modified the caller's object, which may be a slice of another frame (e.g.
    the output of ``train_test_split``) and can trigger SettingWithCopyWarning.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to de-duplicate.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the first occurrence of every row.
    """
    return data.drop_duplicates()

# NOTE(review): de-duplication happens after the train/test split and only on the training frame,
# so a test row identical to a training row is not removed here - confirm this is intended
reviews_train = duplicates(reviews_train)

# decode HTML character references (markup tags themselves are left untouched)
def r_html(text):
    """Turn HTML entities such as ``&amp;`` or ``&#39;`` back into plain characters."""
    decoded = html.unescape(text)
    return decoded

# replace URLs with the token 'url'
def urls(text):
    """Replace every http(s) URL in ``text`` with the literal token ``'url'``.

    A URL runs from ``http://`` / ``https://`` (any letter case) up to the next
    whitespace, quote or angle bracket, so query strings, fragments, hyphens
    and underscores are consumed as part of the URL instead of being left
    behind as debris. Trailing sentence punctuation (``. , ; : ! ? )``) is not
    treated as part of the URL, which keeps sentence boundaries intact for the
    later sentence-level steps.

    Parameters
    ----------
    text : str
        Raw text that may contain URLs.

    Returns
    -------
    str
        The text with each URL replaced by ``'url'``.
    """
    # body: anything that is not whitespace / quote / angle bracket;
    # last character: same, but additionally not sentence punctuation
    url_pattern = r'https?://[^\s<>"\']*[^\s<>"\'.,;:!?)]'
    return re.sub(url_pattern, 'url', text, flags=re.IGNORECASE)

# drop duplicate sentences in a text
def duplicate_sentences(text):
    """Remove repeated sentences from ``text``, keeping the first occurrence.

    The text is split on ``'.'``. Fragments are compared with surrounding
    whitespace ignored, so ``"Great. Great."`` is recognised as a repetition
    (a plain string comparison would treat ``"Great"`` and ``" Great"`` as
    different sentences). Empty fragments, which come from ``".."``, ``"..."``
    or a trailing period, are always kept so the punctuation is not altered.

    Parameters
    ----------
    text : str
        Text whose sentences are separated by periods.

    Returns
    -------
    str
        The text with later duplicates removed; kept fragments retain their
        original spacing and order.
    """
    seen = set()
    kept = []
    for fragment in text.split('.'):
        key = fragment.strip()
        if key:
            if key in seen:
                continue
            seen.add(key)
        kept.append(fragment)
    return '.'.join(kept)

# convert the text to lower case
def lowercase(text):
    """Return ``text`` with all cased characters lower-cased."""
    lowered = text.lower()
    return lowered

# expand contracted forms
def r_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

# blank out special characters
def special_characters(text):
    """Replace every character that is not an ASCII letter, ``@`` or whitespace with a space.

    Each replaced character yields exactly one space, so the length of the
    text does not change.
    """
    disallowed = re.compile(r'[^a-zA-Z@\s]')
    return disallowed.sub(' ', text)

# shorten runs of 3 or more identical characters to exactly 2
def consecutive_letters(text):
    """Collapse any run of three or more identical characters into two.

    The pattern matches every character except a newline, so runs of spaces or
    punctuation are shortened as well, not only letters.
    """
    runs = re.compile(r'(.)\1{2,}')
    return runs.sub(r'\1\1', text)

# Function to remove custom stopwords and join back the words
def custom_stopwords(text):
    """Drop every token of ``text`` that appears in ``custom_stop_words``.

    The text is tokenized with ``word_tokenize``; the surviving tokens are
    joined with single spaces. Matching is exact and case-sensitive.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        Space-joined tokens that are not stop words (``""`` if none survive).
    """
    # a set gives constant-time lookups; testing against the list would rescan
    # all of its entries for every single token
    stop_words = set(custom_stop_words)
    return " ".join(word for word in word_tokenize(text) if word not in stop_words)

# Function to lemmatize the words
def lemmatize(text):
    """Tokenize ``text``, lemmatize each token with ``WordNetLemmatizer`` and re-join with spaces."""
    tokens = word_tokenize(text)
    wnl = WordNetLemmatizer()
    return " ".join(map(wnl.lemmatize, tokens))

# Run the text-cleaning pipeline; `steps` switches the individual stages on or off
def preprocess_text(text, steps):
    """Apply the enabled cleaning stages to ``text`` and return the result.

    Parameters
    ----------
    text : str
        Text to clean.
    steps : container of str
        Names of the stages to run. Recognised names: 'html', 'url', 'dupes',
        'lower', 'expand', 'special', 'replace3', 'custom', 'lemmatize'.
        Only membership is checked: stages always run in the fixed order
        listed here, whatever the order of ``steps``. Unknown names are ignored.

    Returns
    -------
    str
        The text after all enabled stages have been applied.
    """
    pipeline = (
        ('html', r_html),
        ('url', urls),
        ('dupes', duplicate_sentences),
        ('lower', lowercase),
        ('expand', r_contractions),
        ('special', special_characters),
        ('replace3', consecutive_letters),
        ('custom', custom_stopwords),
        ('lemmatize', lemmatize),
    )
    for step_name, transform in pipeline:
        if step_name in steps:
            text = transform(text)
    return text

# preprocessing stages to enable (preprocess_text runs them in its own fixed order,
# which happens to match the order written here)
steps = ['html', 'url', 'dupes', 'lower', 'expand', 'special', 'replace3', 'custom', 'lemmatize']

# clean the review text; assign() builds a new frame instead of writing into an object that
# may be a slice of `reviews`, which avoids SettingWithCopyWarning
reviews_train = reviews_train.assign(
    review_text=reviews_train['review_text'].apply(lambda x: preprocess_text(x, steps))
)

# drop NA: empty strings / None anywhere in the frame become NaN, then rows whose review_text
# is NaN are removed (a review can end up empty when all of its tokens were stop words)
reviews_train = (
    reviews_train
    .replace({"": np.nan, None: np.nan})
    .dropna(subset=['review_text'])
)

# save train and test separately
# make sure the target folder exists; to_json does not create missing directories
os.makedirs('data/processed', exist_ok=True)

# orient='records' writes one JSON object per row and leaves the index out. The previous
# index=False is not supported with the default orient ('columns'): depending on the pandas
# version it is either rejected with a ValueError or not honoured.
reviews_train.to_json('data/processed/IMDB_reviews_train_cleaned.json', orient='records')
reviews_test.to_json('data/processed/IMDB_reviews_test.json', orient='records')