In [1]:
!pip -q install langdetect


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m33.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone


In [2]:
# Mount Google Drive at /content/drive so the dataset path under
# /content/drive/MyDrive used later in this notebook resolves (Colab only).
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import os
import pandas as pd
import numpy as np
from langdetect import detect, DetectorFactory, LangDetectException

# Fix langdetect's seed so repeated runs label the same text the same way
DetectorFactory.seed = 0

# Input CSV on the mounted Drive (the reviews dataset to be filtered)
FILE_PATH = '/content/drive/MyDrive/THESIS 47/Saved Datasets/[Suitable for FS] All_languages_200k_data_save.csv'

# The filtered CSV is written next to the input file, under a new name
base_dir = os.path.dirname(FILE_PATH)
out_path = os.path.join(base_dir, '[Non English reviews deleted] All_Language_Data.csv')
# Bare expression: echoes the resolved output path as the cell result
out_path


'/content/drive/MyDrive/THESIS 47/Saved Datasets/[Non English reviews deleted] All_Language_Data.csv'

In [4]:
# Load the full CSV. low_memory=False tells pandas to process the file in one
# pass instead of in chunks, which avoids mixed-type column inference at the
# cost of higher peak memory.
df = pd.read_csv(FILE_PATH, low_memory=False)
print('Loaded shape:', df.shape)
# Bare expression: show the first rows as the cell result
df.head(3)


Loaded shape: (198059, 10)


Unnamed: 0,title,reviewText,originalScore,audienceScore,tomatoMeter,runtimeMinutes,genre,sentiment_label,language_encoded,director_encoded
0,Nekrotronic,Ghostbusters meets The Matrix in this very ent...,4.0,6.6,3.9,99.0,"comedy, horror, sci-fi",0,10,300
1,Goodnight Mommy,Goodnight Mommy cannot avoid comparisons that ...,5.0,3.5,4.0,91.0,mystery & thriller,0,0,300
2,California Split,...a distressingly erratic endeavor that never...,7.5,8.3,8.7,108.0,"comedy, drama",1,0,40


In [5]:
import re

def clean_text(s: str) -> str:
    """
    Normalise a piece of text before language detection.

    The input is first coerced with ``str()``, so non-string values are
    stringified rather than rejected. Then, in order:
      1. URLs (anything starting with ``http`` or ``www.``) become a space.
      2. Every run of digits becomes a space, including digits inside a word.
      3. Every run of whitespace collapses to a single space.
    Leading and trailing whitespace is stripped from the result.
    """
    cleaned = str(s)
    # Order matters: URLs go first so digits inside them vanish with the URL,
    # and whitespace is collapsed last to absorb the spaces added above.
    for pattern in (r'http\S+|www\.\S+', r'\d+', r'\s+'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned.strip()

def is_english(text: str) -> bool:
    """
    Report whether ``text`` is detected as English.

    Returns True only when the cleaned text contains at least 10 ASCII
    letters and langdetect labels it ``'en'``. Returns False when:
      - ``text`` is not a ``str`` (e.g. NaN or None),
      - the cleaned text has fewer than 10 ASCII letters,
      - the detector raises ``LangDetectException``.

    NOTE(review): the removed-rows sample shown later in this notebook
    includes short English reviews, so short texts can be rejected either
    by the letter threshold or by the detector itself -- confirm this is
    acceptable for the dataset.
    """
    if isinstance(text, str):
        cleaned = clean_text(text)
        # Count ASCII letters only; too few means there is not enough signal.
        letter_count = sum(1 for _ in re.finditer(r'[A-Za-z]', cleaned))
        if letter_count >= 10:
            try:
                detected = detect(cleaned)
            except LangDetectException:
                # Detector could not classify the text: treat as non-English.
                return False
            return detected == 'en'
    return False


In [6]:
from tqdm.notebook import tqdm

# Register `progress_apply` on pandas objects so the row-wise pass shows a progress bar.
tqdm.pandas()

# Fail early, with an actionable message, if the text column is missing.
if 'reviewText' not in df.columns:
    raise KeyError("Column 'reviewText' not found in the CSV. Please confirm the column name.")

before_shape = df.shape
print('Before delete:', before_shape)

# Flag each review as English or not. The helper column is left on `df`
# so the rejected rows can still be selected from it afterwards.
df['__is_en__'] = df['reviewText'].progress_apply(is_english)

# Keep only the flagged rows, drop the helper column, and renumber the index from 0.
english_mask = df['__is_en__']
df_en = df.loc[english_mask].drop(columns=['__is_en__']).reset_index(drop=True)

after_shape = df_en.shape
removed = before_shape[0] - after_shape[0]
print('After delete:', after_shape, '| Removed rows:', removed)


Before delete: (198059, 10)


  0%|          | 0/198059 [00:00<?, ?it/s]

After delete: (194801, 10) | Removed rows: 3258


In [7]:
# Columns that are enough to eyeball whether the language filter behaved sensibly.
preview_cols = ['title', 'reviewText']

# First few rows that survived the filter (English).
display(df_en[preview_cols].head(5))

# First few rows that were filtered out, selected via the `__is_en__` flag on `df`.
non_en_sample = df.loc[~df['__is_en__'], preview_cols].head(5)
display(non_en_sample)


Unnamed: 0,title,reviewText
0,Nekrotronic,Ghostbusters meets The Matrix in this very ent...
1,Goodnight Mommy,Goodnight Mommy cannot avoid comparisons that ...
2,California Split,...a distressingly erratic endeavor that never...
3,Midsommar,"High-art horror that won't suit all tastes, or..."
4,The Leather Boys,something rather different and much more inter...


Unnamed: 0,title,reviewText
84,Mortal Kombat Annihilation,Game Over
98,The Neighbor No. Thirteen,O estreante Inoue demonstra possuir um bom sen...
117,Clockers,Sub-par Spike Lee
247,The I Inside,"Revela-se decepcionante, limitando-se a recicl..."
280,Fat Girl,Disturbing and disjointed.


In [8]:
# --- SHAPE SANITY CHECK BEFORE SAVING ---
# Printed for a manual comparison with the "After delete" shape; nothing is asserted.
print("Final cleaned dataset shape:", df_en.shape)

# --- SAVE TO DRIVE ---
# index=False keeps the DataFrame's row index out of the written CSV.
df_en.to_csv(out_path, index=False)
print("Saved to:", out_path)


Final cleaned dataset shape: (194801, 10)
Saved to: /content/drive/MyDrive/THESIS 47/Saved Datasets/[Non English reviews deleted] All_Language_Data.csv
