In [1]:
import pandas as pd
import numpy as np
import re
import string

In [2]:
data_path = "../data/IMDB_reviews.json"

# With lines=True, we read line by line (json lines), not a single json object
try:
    df = pd.read_json(data_path,lines=True)
except:
    df = pd.read_json(data_path)

print(f"Total raw data: {len(df)}")

Total raw data: 573913


In [3]:
print(df.head(5))

        review_date   movie_id    user_id  is_spoiler  \
0  10 February 2006  tt0111161  ur1898687        True   
1  6 September 2000  tt0111161  ur0842118        True   
2     3 August 2001  tt0111161  ur1285640        True   
3  1 September 2002  tt0111161  ur1003471        True   
4       20 May 2004  tt0111161  ur0226855        True   

                                         review_text  rating  \
0  In its Oscar year, Shawshank Redemption (writt...      10   
1  The Shawshank Redemption is without a doubt on...      10   
2  I believe that this film is the best story eve...       8   
3  **Yes, there are SPOILERS here**This film has ...      10   
4  At the heart of this extraordinary movie is a ...       8   

                                  review_summary  
0  A classic piece of unforgettable film-making.  
1     Simply amazing. The best film of the 90's.  
2               The best story ever told on film  
3                     Busy dying or busy living?  
4         Great s

In [4]:
print(df['is_spoiler'].value_counts())

is_spoiler
False    422989
True     150924
Name: count, dtype: int64


In [5]:
# The 'review_summary' often contains the most critical spoilers or sentiment.
# We combine it with the main text to give the model more context.

# Fill NaN values with empty strings to avoid errors
df['review_summary'] = df['review_summary'].fillna('')
df['review_text'] = df['review_text'].fillna('')

# Create a new column: "Summary. Review Text"
df['combined_text'] = df['review_summary'] + ". " + df['review_text']

print("Feature Engineering: 'review_summary' and 'review_text' combined successfully!")

Feature Engineering: 'review_summary' and 'review_text' combined successfully!


In [6]:
# Drop unnecessary columns

cols_to_drop=["movie_id","user_id","review_date"]

df = df.drop(columns=cols_to_drop,errors="ignore") # with errors=ignore, it won't raise an error if columns don't exist, it continues

print("Columns Remaining:", df.columns.tolist())

Columns Remaining: ['is_spoiler', 'review_text', 'rating', 'review_summary', 'combined_text']


In [7]:
# Handling data imbalance (Undersampling)

sample_size = 25000

spoiler_count = df[df["is_spoiler"]==True].shape[0]
# If there aren't as many spoilers as the selected sample size, adjust sample size based on max existing spoilers
if spoiler_count < sample_size:
    sample_size = spoiler_count

spoilers = df[df["is_spoiler"]==True].sample(sample_size,random_state=35)
non_spoilers = df[df["is_spoiler"]==False].sample(sample_size,random_state=35)

# Concatenate and Shuffle the dataset
df_balanced = pd.concat([spoilers, non_spoilers])
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Balanced Dataset Size: {len(df_balanced)}")

Balanced Dataset Size: 50000


In [8]:
# Dictionary of common English contractions to expand
# This helps the model understand that "won't" means "will not", preserving the negative context.
contractions = {
    "won't": "will not", "can't": "cannot", "i'm": "i am", "he's": "he is",
    "she's": "she is", "it's": "it is", "that's": "that is", "what's": "what is",
    "n't": " not", "'re": " are", "'s": " is", "'d": " would", "'ll": " will",
    "'ve": " have", "'m": " am", "idk": "i do not know", "tbh": "to be honest"
}

def clean_text(text):
    # 1. Convert to string and lowercase
    text = str(text).lower()
    
    # 2. Expand Contractions (NEW STEP)
    # Iterate through the dictionary and replace contractions with full words
    for contraction, expansion in contractions.items():
        if contraction in text:
            text = text.replace(contraction, expansion)
    
    # 3. Remove HTML tags (<br /> etc.) - ESSENTIAL
    text = re.sub(r"<.*?>", "", text)
    
    # 4. Remove square brackets - ESSENTIAL
    text = re.sub(r"\[.*?\]", "", text)
    
    # 5. Remove URLs (if any) - GOOD PRACTICE
    text = re.sub(r"http\S+", "", text)
    
    # NOTE: We intentionally KEEP punctuation! 
    # DistilBERT needs it to understand context (e.g., '!', '?', ',').
    
    # 6. Remove extra whitespace and newlines
    text = re.sub(r"\s+", " ", text).strip()
    
    return text

In [9]:
print("Cleaning texts...")

# NOTE: Naming the column "cleaned_text" to match the training script expectations
# Apply cleaning to the new 'combined_text' column
df_balanced["cleaned_text"] = df_balanced["combined_text"].apply(clean_text)

# Reviews with fewer than 5 words are usually noise (e.g., "Good movie", "10/10")
# They confuse the model and contribute to underfitting.
df_balanced["word_count"] = df_balanced["cleaned_text"].apply(lambda x: len(str(x).split()))

#---Remove Short Reviews (Noise)---

initial_len = len(df_balanced)
# Filter out short reviews
df_balanced = df_balanced[df_balanced["word_count"] > 2]
print(f"Removed {initial_len - len(df_balanced)} short reviews (noise).")

# Remove duplicates to prevent data leakage
df_balanced = df_balanced.drop_duplicates(subset=['cleaned_text'])

print(f"Final Data Size after cleaning: {len(df_balanced)}")

# Check the first few rows to ensure punctuation is preserved
print(df_balanced[["review_text", "cleaned_text"]].head())

Cleaning texts...
Removed 0 short reviews (noise).
Final Data Size after cleaning: 49998
                                         review_text  \
0  The comments I've read here pretty much nail h...   
1  Mickey Rourke and Marisa Tomei, are absolutely...   
2  Since the Chronicles of Narnia are a series of...   
3  *****CONTAINS SPOILERS******It took me months ...   
4  What a lovely day for Mad Max Fury Road. Not. ...   

                                        cleaned_text  
0  peter lorre is finest hour!. the comments i ha...  
1  wow........wow..........wow. mickey rourke and...  
2  disappointing in the extreme!. since the chron...  
3  very intense. *****contains spoilers******it t...  
4  mad max fury road: best movie ever or a pile o...  


In [10]:
# Label Encoding: Convert True/False to 1/0
df_balanced["label"] = df_balanced["is_spoiler"].astype(int)

# Select only necessary columns to keep the file size optimized
output_df = df_balanced[["cleaned_text", "label"]] 

# Save to CSV
output_path = "../data/cleaned_data.csv"
output_df.to_csv(output_path, index=False)

print(f"Ready for training! High-quality data saved to '{output_path}'.")

Ready for training! High-quality data saved to '../data/cleaned_data.csv'.
