In [1]:
import pandas as pd
import numpy as np
import re
import string

In [2]:
data_path = "data/IMDB_reviews.json"

# Load the reviews. The file is first read as JSON Lines (one JSON object per
# line, lines=True); if that layout does not parse, fall back to reading it as
# a single JSON document.
#
# Only ValueError is caught: that is the error pandas raises when the content
# does not parse in the requested layout. A bare `except:` would also swallow
# KeyboardInterrupt and unrelated failures (e.g. a missing file) before
# retrying, hiding the real cause.
try:
    df = pd.read_json(data_path, lines=True)
except ValueError:
    df = pd.read_json(data_path)

In [3]:
# Preview the first five rows to confirm the expected columns were loaded.
print(df.head(n=5))

        review_date   movie_id    user_id  is_spoiler  \
0  10 February 2006  tt0111161  ur1898687        True   
1  6 September 2000  tt0111161  ur0842118        True   
2     3 August 2001  tt0111161  ur1285640        True   
3  1 September 2002  tt0111161  ur1003471        True   
4       20 May 2004  tt0111161  ur0226855        True   

                                         review_text  rating  \
0  In its Oscar year, Shawshank Redemption (writt...      10   
1  The Shawshank Redemption is without a doubt on...      10   
2  I believe that this film is the best story eve...       8   
3  **Yes, there are SPOILERS here**This film has ...      10   
4  At the heart of this extraordinary movie is a ...       8   

                                  review_summary  
0  A classic piece of unforgettable film-making.  
1     Simply amazing. The best film of the 90's.  
2               The best story ever told on film  
3                     Busy dying or busy living?  
4         Great s

In [4]:
# Class distribution of the target column; used to judge how imbalanced
# spoiler vs. non-spoiler reviews are before any resampling.
print(df["is_spoiler"].value_counts())

is_spoiler
False    422989
True     150924
Name: count, dtype: int64


In [5]:
# Drop unnecessary columns

# Identifier and date fields that are removed before building the text dataset.
cols_to_drop = ["movie_id", "user_id", "review_date"]

# errors="ignore" skips any listed column that is already absent instead of
# raising, so this cell can be re-run after the columns are gone.
df = df.drop(labels=cols_to_drop, axis="columns", errors="ignore")

print("Columns Remaining:", list(df.columns))

Columns Remaining: ['is_spoiler', 'review_text', 'rating', 'review_summary']


In [6]:
# Handling data imbalance (Undersampling)
#
# Draw the same number of rows from each class so that df_balanced contains
# an equal count of spoiler and non-spoiler reviews.

sample_size = 1000

# Split once per class instead of re-filtering the frame for every use.
spoiler_rows = df[df["is_spoiler"] == True]
non_spoiler_rows = df[df["is_spoiler"] == False]

spoiler_count = spoiler_rows.shape[0]
non_spoiler_count = non_spoiler_rows.shape[0]

# Cap the sample size by the smaller class. Checking only the spoiler count
# is not enough: DataFrame.sample raises ValueError when asked for more rows
# than exist (without replacement), so a scarce non-spoiler class would crash.
sample_size = min(sample_size, spoiler_count, non_spoiler_count)

# A fixed random_state keeps the drawn subset reproducible across runs.
spoilers = spoiler_rows.sample(sample_size, random_state=35)
non_spoilers = non_spoiler_rows.sample(sample_size, random_state=35)

# NOTE(review): the concatenated frame keeps all spoilers first, then all
# non-spoilers; shuffle downstream if the consumer does not already do so.
df_balanced = pd.concat([spoilers, non_spoilers])

In [7]:
import re

def clean_text(text):
    """Normalize a raw review string for an uncased BERT-style tokenizer.

    Steps, in order: strip HTML tags, strip square-bracketed spans, strip
    URLs, collapse whitespace, lowercase. Punctuation is intentionally kept.

    Args:
        text: The raw review. Non-string values are converted with ``str()``.
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned, lowercased string; ``""`` for missing input.
    """
    # Missing values would otherwise be stringified into the literal words
    # "none" / "nan" and end up in the dataset as if they were review text.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # 1. Convert to string
    text = str(text)

    # 2. Remove HTML tags (<br /> etc.). Replace with a space rather than
    #    nothing so "end.<br /><br />Next" does not become "end.Next"; the
    #    surplus spaces are collapsed in step 5.
    text = re.sub(r"<.*?>", " ", text)

    # 3. Remove square-bracketed spans
    text = re.sub(r"\[.*?\]", "", text)

    # 4. Remove URLs (anything starting with "http" up to the next whitespace)
    text = re.sub(r"http\S+", "", text)

    # NOTE: Punctuation is deliberately NOT removed.
    # BERT needs punctuation to understand context (e.g., '!', '?', ',').

    # 5. Collapse runs of whitespace/newlines and trim the ends
    text = re.sub(r"\s+", " ", text).strip()

    # 6. Lowercase (fine for an uncased model such as 'distilbert-base-uncased')
    return text.lower()

In [8]:
print("Texts are cleaning (BERT-friendly mode)...")

# Run every raw review through clean_text and store the result in a new
# "cleaned_text" column (the original note says this column name is the one
# the training script expects).
df_balanced["cleaned_text"] = df_balanced["review_text"].map(clean_text)

# Show raw and cleaned text side by side to verify punctuation is preserved.
print(df_balanced[["review_text", "cleaned_text"]].head())

Texts are cleaning (BERT-friendly mode)...
                                              review_text  \
373928  Okay...This is a combination of Phantom of the...   
269898  It's nice endearing to watch the lead actor to...   
414802  I wanted to see Marie Antionette ever since I ...   
371891  This is amazing! I was looking forward to it a...   
461087  The best thing I liked about this movie was it...   

                                             cleaned_text  
373928  okay...this is a combination of phantom of the...  
269898  it's nice endearing to watch the lead actor to...  
414802  i wanted to see marie antionette ever since i ...  
371891  this is amazing! i was looking forward to it a...  
461087  the best thing i liked about this movie was it...  


In [9]:
# Label encoding: turn the boolean target into an integer (True -> 1, False -> 0).
df_balanced["label"] = df_balanced["is_spoiler"].astype(int)

# Keep only the model input and its target so the exported file stays small.
output_df = df_balanced.loc[:, ["cleaned_text", "label"]]

# Write the dataset to CSV without the DataFrame index.
# NOTE(review): the input is read from "data/..." but this writes to
# "../data/...", and the recorded cell output reports 'data/cleaned_data.csv'.
# Confirm which directory is intended.
output_path = "../data/cleaned_data.csv"
output_df.to_csv(output_path, index=False)

print(f"\nBERT-friendly cleaned data saved as '{output_path}'.")


BERT-friendly cleaned data saved as 'data/cleaned_data.csv'.
