In [104]:
#%pip install contractions

import pandas as pd
import numpy as np
import nest_asyncio
import asyncio
import re
import html
import contractions
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from textblob import TextBlob

In [99]:
# read data: both inputs are line-delimited JSON (one record per line)
reviews_details = pd.read_json('IMDB_reviews.json', lines=True)
movie_details = pd.read_json('IMDB_movie_details.json', lines=True)

# attach the movie metadata to each review through the shared movie_id key
# (pandas' default join type is used, as no `how` is given)
reviews = reviews_details.merge(movie_details, on='movie_id')

In [100]:
def parse_partial_dates(s, dayfirst=True):
    s = s.astype('string').str.strip()

    # YYYY -> YYYY-01-01
    s = s.where(~s.str.fullmatch(r'\d{4}'), s + '-01-01')

    # YYYY-MM -> YYYY-MM-01
    s = s.where(~s.str.fullmatch(r'\d{4}-(0[1-9]|1[0-2])'), s + '-01')
    
    # clean up spaces and commas
    s = (s.str.replace(r'\s+', ' ', regex=True)
           .str.replace(',', '', regex=False))

    # parse (handles things like "1 July 2000" -> "1957-04-01")
    return pd.to_datetime(s, errors='coerce', dayfirst=dayfirst)

# parse both date columns in place (release first, then review)
for date_col in ('release_date', 'review_date'):
    reviews[date_col] = parse_partial_dates(reviews[date_col])

# extract year, month, day into new <prefix>_year / _month / _day columns
for prefix in ('review', 'release'):
    parsed = reviews[f'{prefix}_date']
    for part in ('year', 'month', 'day'):
        reviews[f'{prefix}_{part}'] = getattr(parsed.dt, part)

# inspect anything still missing (only release_date is checked here)
release_unparsed = reviews['release_date'].isna()
print("Unparsed release_date:", release_unparsed.sum())
print(reviews[release_unparsed][['release_date']].head())


  return pd.to_datetime(s, errors='coerce', dayfirst=dayfirst)


Unparsed release_date: 0
Empty DataFrame
Columns: [release_date]
Index: []


In [101]:
# extract year, month, day into new columns
# (these are the same six columns, from the same sources, that the loop in the
#  date-parsing cell assigns)
for prefix in ('review', 'release'):
    for part in ('year', 'month', 'day'):
        reviews[f'{prefix}_{part}'] = getattr(reviews[f'{prefix}_date'].dt, part)


In [102]:
# normalize duration to minutes
# Handles "1h 57min", "2h" and minute-only values such as "45min". The
# minute-only form has no "h", so a pattern that requires "h" would silently
# turn it into 0 minutes.
# The dtype guard makes the cell safe to re-run: once the column is numeric
# there is nothing left to parse (and `.str` would raise on integers).
if not pd.api.types.is_numeric_dtype(reviews['duration']):
    duration_parts = (
        reviews['duration']
        .str.extract(r'(\d+)h\s*(\d+)?|(\d+)\s*min')
        .fillna(0)
        .astype(int)
    )
    # column 0 = hours, 1 = minutes following the hours, 2 = minutes when no hours are given
    reviews['duration'] = (
        duration_parts[0] * 60 + duration_parts[1] + duration_parts[2]
    )

In [103]:
# number of rows that contain at least one missing value
rows_with_na = reviews.isna().any(axis=1)
print(rows_with_na.sum())


0


In [105]:
# TextBlob's `sentiment` is a (polarity, subjectivity) pair. Analyse every
# review once and split the pair, instead of building a TextBlob (and running
# the analysis) twice per review.
review_sentiment = reviews["review_text"].map(lambda s: TextBlob(s).sentiment)
reviews["polarity"] = review_sentiment.map(lambda sent: sent.polarity)
reviews["subjectivity"] = review_sentiment.map(lambda sent: sent.subjectivity)

In [106]:
# NLTK resource downloads, left disabled
# (presumably needed once for word_tokenize / WordNetLemmatizer — confirm on a fresh environment)
# nltk.download('punkt_tab')
# nltk.download('wordnet')

# splitting the data into training and testing sets:
# 20% held out for testing, fixed seed so the split is reproducible
TEST_FRACTION = 0.2
SPLIT_SEED = 3244
reviews_train, reviews_test = train_test_split(
    reviews,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [107]:
# apply nest_asyncio to enable nested event loops
# NOTE(review): none of the cells visible here use asyncio — confirm this is still needed
nest_asyncio.apply()

# custom stop-words: removed words with negative connotations, he/she/they/them, modal verbs, intensity/polarity words, explanatory words
# This is the list of tokens that custom_stopwords() drops. Matching there is an
# exact string comparison, so an entry only removes tokens spelled exactly like it.
# The entries written with double quotes from "film" onwards are extra words
# appended after the pronoun/preposition-style entries.
# NOTE(review): entries containing an apostrophe (e.g. "i'd") can only match if the
# text still contains apostrophes when custom_stopwords() runs; preprocess_text
# applies the 'expand' and 'special' steps before 'custom' — confirm these entries
# are still reachable.
custom_stop_words = ['about', 'above', 'after', 'an', 'and', 'any', 'as', 'be', 'been', 'before', 'being', 'below',
                    'between', 'both', 'by', 'does', 'doing', 'down', 'during', 'each', 'few', 'from', 'further',
                    'had', 'has', 'having', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his',   "i'd", 'if',
                    "i'll", "i'm", 'in', 'into', 'is', 'isn',  'it', "it'd", "it'll", "it's", 'its', 'itself', "i've",
                    'me',  'more', 'most', 'myself',  'nor', 'now', 'of', 'off', 'on', 'once', 'only', 'or', 'other',
                    'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same',  'some', 'such', 'than', 'that',
                    "that'll",  'their', 'theirs', 'then', 'there', 'these',  'this', 'those', 'through', 'under',
                    'until', 'up', 'was', 'we', "we'd", "we'll", "we're", 'were',  "we've", 'which', 'while', 'who',
                    'whom',  "you'd", "you'll", "you're", 'yours', 'yourself', 'yourselves', "you've", "film", "movie",
                    "character", "story", "show", "time", "make", "see", "think", "even", "way", "one", "will", "much",
                    "really", "good", "bad", "well", "people", "great", "work", "watch", "look", "better", "take",
                    "love", "life", "actor", "performance", "scene", "director", "world", "feel", "first", "know",
                    "little", "still", "want", "thing", "going", "part", "end", "made", "lot", "man", "quite", "never",
                    'actually', 'maybe', 'though', 'always', 'find', 'fun']

# given that there are no null values in the dataset, we only check for duplicates
def duplicates(data):
    """Drop duplicate rows from ``data`` in place and return the same object.

    Two rows count as duplicates when they agree on every column other than
    ``genre``; the first occurrence is kept.
    NOTE(review): ``genre`` is presumably excluded because it holds list
    values, which cannot be hashed — confirm.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to de-duplicate; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, after the duplicate rows were removed.
    """
    compare_cols = []
    for column in data.columns:
        if column != 'genre':
            compare_cols.append(column)
    data.drop_duplicates(subset=compare_cols, inplace=True)
    return data

# de-duplicate the training split; in the cells visible here reviews_test is not passed through duplicates()
reviews_train = duplicates(reviews_train)

# decode HTML character references
def r_html(text):
    """Return ``text`` with HTML entities (e.g. ``&amp;``) converted to the characters they stand for.

    Only ``html.unescape`` is applied; markup tags are not touched.
    """
    decoded = html.unescape(text)
    return decoded

# replace URLs with the placeholder word 'url'
def urls(text):
    """Replace every http(s) URL in ``text`` with the token ``'url'``.

    A URL is taken to run from ``http://`` / ``https://`` up to the next
    whitespace, so hyphens, underscores, query strings and percent-escapes
    are swallowed with it instead of being left behind as stray fragments.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        ``text`` with each URL replaced by ``'url'``.
    """
    return re.sub(r'https?://\S+', 'url', text)

# drop duplicate sentences in a text
def duplicate_sentences(text):
    """Remove repeated sentences from ``text``, keeping the first occurrence.

    The text is split on ``'.'``. Segments are compared after stripping
    surrounding whitespace, because every segment after the first normally
    starts with the space that followed the previous period; comparing the
    raw segments would never recognise "Good. Good." as a repetition.
    The kept segments retain their original spacing and are re-joined with
    ``'.'``.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        ``text`` without the repeated segments.
    """
    first_seen = {}
    for segment in text.split('.'):
        # dicts preserve insertion order, so the first spelling of each sentence wins
        first_seen.setdefault(segment.strip(), segment)
    return '.'.join(first_seen.values())

# lowercase the text
def lowercase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

# expand contractions
def r_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

# remove special characters
def special_characters(text):
    """Replace every character that is not an ASCII letter, ``@`` or whitespace with a single space.

    Each offending character is replaced one-for-one, so the length of the
    text does not change.
    """
    not_allowed = re.compile(r'[^a-zA-Z@\s]')
    return not_allowed.sub(' ', text)

# replace 3 or more consecutive identical characters with 2
def consecutive_letters(text):
    """Shorten every run of three or more identical characters to exactly two.

    The pattern matches any character except a newline, so runs of spaces
    or punctuation are shortened as well, not only letters.
    """
    long_run = re.compile(r'(.)\1{2,}')
    return long_run.sub(r'\1\1', text)

# Function to remove custom stopwords and join back the words
def custom_stopwords(text):
    """Tokenize ``text`` and drop every token listed in ``custom_stop_words``.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        The remaining tokens joined by single spaces (an empty string when
        every token was a stop word).
    """
    # a set gives constant-time membership tests; scanning the list for every
    # token is needlessly slow on long reviews and yields the same result
    stop_set = set(custom_stop_words)
    kept = [word for word in word_tokenize(text) if word not in stop_set]
    return " ".join(kept)

# Function to lemmatize the words
def lemmatize(text):
    """Tokenize ``text``, lemmatize each token with ``WordNetLemmatizer`` and re-join with spaces."""
    wnl = WordNetLemmatizer()
    return " ".join(wnl.lemmatize(token) for token in word_tokenize(text))

# Pipeline of the preprocessing steps; `steps` switches individual steps on and off
def preprocess_text(text, steps):
    """Run the enabled cleaning steps over ``text``.

    The steps always run in the fixed order html, url, dupes, lower, expand,
    special, replace3, custom, lemmatize. ``steps`` only decides which of
    them are applied; the order of its entries has no effect.

    Parameters
    ----------
    text : str
        Text to clean.
    steps : container of str
        Names of the steps to enable (tested with ``in``).

    Returns
    -------
    str
        The text after all enabled steps.
    """
    pipeline = (
        ('html', r_html),
        ('url', urls),
        ('dupes', duplicate_sentences),
        ('lower', lowercase),
        ('expand', r_contractions),
        ('special', special_characters),
        ('replace3', consecutive_letters),
        ('custom', custom_stopwords),
        ('lemmatize', lemmatize),
    )
    for step_name, transform in pipeline:
        if step_name in steps:
            text = transform(text)
    return text

# preprocessing steps to enable; preprocess_text runs the enabled steps in its own
# fixed order, so the order of this list does not change the result
steps = ['html', 'url', 'dupes', 'lower', 'expand', 'special', 'replace3', 'custom', 'lemmatize']

# apply the preprocessing steps to the review text of the training split
# (reviews_test is not modified in this cell)
reviews_train['review_text'] = reviews_train['review_text'].apply(lambda x: preprocess_text(x, steps))

# drop NA
# note: this replace is applied to every column of reviews_train, not only to review_text
reviews_train.replace({"": np.nan, None: np.nan}, inplace=True)
reviews_train.dropna(subset=['review_text'], inplace=True)  # Drop rows whose 'review_text' is NaN, e.g. a review that became empty because every token was removed as a stop word

In [108]:
# save train and test separately (cleaned training split, unprocessed test split)
# NOTE(review): no `orient` is passed, so how `index=False` is handled may depend
# on the installed pandas version — confirm the layout of the written files
for frame, out_path in (
    (reviews_train, 'IMDB_reviews_train_cleaned.json'),
    (reviews_test, 'IMDB_reviews_test.json'),
):
    frame.to_json(out_path, index=False)

In [112]:
# show training rows where any derived date-part column is missing
date_part_cols = ['review_year','review_month','review_day',
                  'release_year','release_month','release_day']
reviews_train[reviews_train[date_part_cols].isna().any(axis=1)]

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating_x,review_summary,plot_summary,duration,genre,...,release_date,plot_synopsis,review_year,review_month,review_day,release_year,release_month,release_day,polarity,subjectivity
