In [35]:
import pandas as pd
import numpy as np
import re
import string

In [36]:
file_path = "data/IMDB_reviews.json"

# lines=True ile satır satır veri elde ederiz(json lines), tek bir json objesi değil
try:
    df = pd.read_json(data_path,lines=True)
except:
    df = pd.read_json(data_path)

In [37]:
print(df.head(5))

        review_date   movie_id    user_id  is_spoiler  \
0  10 February 2006  tt0111161  ur1898687        True   
1  6 September 2000  tt0111161  ur0842118        True   
2     3 August 2001  tt0111161  ur1285640        True   
3  1 September 2002  tt0111161  ur1003471        True   
4       20 May 2004  tt0111161  ur0226855        True   

                                         review_text  rating  \
0  In its Oscar year, Shawshank Redemption (writt...      10   
1  The Shawshank Redemption is without a doubt on...      10   
2  I believe that this film is the best story eve...       8   
3  **Yes, there are SPOILERS here**This film has ...      10   
4  At the heart of this extraordinary movie is a ...       8   

                                  review_summary  
0  A classic piece of unforgettable film-making.  
1     Simply amazing. The best film of the 90's.  
2               The best story ever told on film  
3                     Busy dying or busy living?  
4         Great s

In [38]:
print(df['is_spoiler'].value_counts())

is_spoiler
False    422989
True     150924
Name: count, dtype: int64


In [39]:
#Gereksiz columnları at

cols_to_drop=["movie_id","user_id","review_date"]

df = df.drop(columns=cols_to_drop,errors="ignore") # errors=ignore ile columnlar yoksa hata vermez, devam eder

print("Columns Remaining:", df.columns.tolist())

Columns Remaining: ['is_spoiler', 'review_text', 'rating', 'review_summary']


In [40]:
# Veri dengesizliğini çözme (Undersampling)

sample_size = 1000

spoiler_count = df[df["is_spoiler"]==True].shape[0]
# Seçtiğimiz sample kadar spoiler yoksa, var olan maks spoiler sayısına göre sample size ayarla
if spoiler_count < sample_size:
    sample_size = spoiler_count

spoilers = df[df["is_spoiler"]==True].sample(sample_size,random_state=35)
non_spoilers = df[df["is_spoiler"]==False].sample(sample_size,random_state=35)

df_balanced = pd.concat([spoilers,non_spoilers])


In [41]:
# Text cleaning with regex 

# Clean the reviews_text column

def clean_text(text):
    text = str(text).lower() 
    # < > bulunan her şeyi sil, (htmlden gelen <br> ler vs gitsin diye) 
    text = re.sub(r"<.*?>","",text)
    # Köşeli parantezleri sil (alıntılar için)
    text = re.sub(r"\[.*?\]","",text)

    text = text.translate(str.maketrans('', '', string.punctuation)) # Noktalama işaretlerini sil

    text = text.replace("\n", " ")

    text = re.sub(" +", " " , text) # Fazla boşlukları sil

    return text

print("Texts are cleaning...")
df_balanced["cleaned_review_text"] = df_balanced["review_text"].apply(clean_text)

#Label oluşturma (True False u 0 1 e çevir)
df_balanced["label"] = df_balanced["is_spoiler"].astype(int)

# Kaydetme

print(df_balanced.head())

output_path = "data/cleaned_data.csv"
df_balanced.to_csv(output_path, index=False)
print(f"\nCleaned data (including rating and summary) was saved as '{output_path}'.")

    
    



Texts are cleaning...
        is_spoiler                                        review_text  rating  \
373928        True  Okay...This is a combination of Phantom of the...       9   
269898        True  It's nice endearing to watch the lead actor to...       6   
414802        True  I wanted to see Marie Antionette ever since I ...       2   
371891        True  This is amazing! I was looking forward to it a...      10   
461087        True  The best thing I liked about this movie was it...       8   

                             review_summary  \
373928                                 Wow!   
269898             Nice try for a young guy   
414802                 What a waste of time   
371891                           Brilliant!   
461087  I got mixed feelings About this one   

                                      cleaned_review_text  label  
373928  okaythis is a combination of phantom of the op...      1  
269898  its nice endearing to watch the lead actor to ...      1  
414802 