In [13]:
from google.colab import drive
import pandas as pd

# Mount Google Drive
drive.mount('/content/drive')


file_path = '/content/drive/MyDrive/THESIS 47/Saved Datasets/Dataset_A_200k_balanced_english_cap5.csv'

# Load dataset
df = pd.read_csv(file_path)

# Preview
df.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,id,title,reviewText,scoreSentiment,originalScore,audienceScore,tomatoMeter,runtimeMinutes,genre,originalLanguage,director,reviewId,movieLangNorm
0,nekrotronic,Nekrotronic,Ghostbusters meets The Matrix in this very ent...,NEGATIVE,2/5,66.0,39.0,99.0,"Comedy, Horror, Sci-fi",English (Australia),Kiah Roache-Turner,2508351,English
1,goodnight_mommy_2022,Goodnight Mommy,Goodnight Mommy cannot avoid comparisons that ...,NEGATIVE,5/10,35.0,40.0,91.0,Mystery & thriller,English,Matt Sobel,102732302,English
2,california_split,California Split,...a distressingly erratic endeavor that never...,POSITIVE,3/4,83.0,87.0,108.0,"Comedy, Drama",English,Robert Altman,2810399,English
3,midsommar,Midsommar,"High-art horror that won't suit all tastes, or...",POSITIVE,3/4,63.0,83.0,145.0,"Horror, Mystery & thriller",English,Ari Aster,2601797,English
4,the_leather_boys_1964,The Leather Boys,something rather different and much more inter...,POSITIVE,3.5/5,79.0,77.0,108.0,Drama,English,Sidney J. Furie,1383255,English


In [14]:
import re
import pandas as pd

# Classify each score into a type
def classify_score_type(score):
    if pd.isna(score):
        return 'missing'
    score = str(score).strip().upper()

    if '/' in score:
        return 'fraction'
    elif re.match(r'^[A-F][+-]?$', score):
        return 'grade'
    elif re.match(r'^\d+(\.\d+)?$', score):
        return 'numeric'
    else:
        return 'other'

# Apply classification
df['score_type'] = df['originalScore'].apply(classify_score_type)

# Show count of each type
type_counts = df['score_type'].value_counts()
print("üîç Count of each score type:\n")
print(type_counts)

# Show all unique values for each type (not truncated)
pd.set_option('display.max_rows', None)  # ensures all values are printed

for score_type in type_counts.index:
    print(f"\nüîé All values under type: {score_type}")
    values = df[df['score_type'] == score_type]['originalScore'].dropna().unique()
    print(sorted(values))  # sorted for easier reading


üîç Count of each score type:

score_type
fraction    167674
grade        31176
numeric        652
other          498
Name: count, dtype: int64

üîé All values under type: fraction
[' 3/5', "'1.5/4'", "'2.5/4'", "'2.5/5'", "'2.75/5'", "'2/4'", "'2/5'", "'3.25/5'", "'3.5/4", "'3.5/4'", "'3.75/5'", "'3/4'", "'3/5'", "'4/4'", '****1/2', '***1/2', '-1/10', '-1/4', '.01/5', '.25/4', '.5/ 5', '.5/10', '.5/4', '.5/4.0', '.5/5', '.66/10', '.7221/10', '.96/10', '0 / 5', '0.0/5', '0.0/5.0', '0.02/5', '0.23/5', '0.25/5', '0.3/5', '0.4/5', '0.5 / 5', '0.5/10', '0.5/4', '0.5/4.0', '0.5/5', '0.5/5.0', '0.52/1', '0.6/1', '0.7/5', '0.75/5', '0.8/5', '0.81/1', '0/10', '0/3', '0/4', '0/4.0', '0/5', '1 / 5', '1.0/4', '1.0/4.0', '1.0/5', '1.0/5.0', '1.1/2', '1.1/5', '1.2/10', '1.2/5', '1.24/5', '1.25/10', '1.25/5', '1.3/5', '1.4/4', '1.4/5', '1.5 / 5', '1.5/10', '1.5/2', '1.5/3', '1.5/4', '1.5/4.0', '1.5/5', '1.5/5.0', '1.6/5', '1.62/5', '1.7/5', '1.75/4', '1.75/5', '1.8/10', '1.8/4', '1.8/5', '1.9/5', 

In [15]:
import pandas as pd
import numpy as np
import re

# -------------------------------
# STEP 1: Filter out unwanted types
# -------------------------------
# Remove 'other' and 'numeric' score types
df = df[~df['score_type'].isin(['other', 'numeric'])].copy()

# -------------------------------
# STEP 2: Clean and Convert Fractions to 10
# -------------------------------
def clean_fraction(score):
    # Keep only valid fractions like 3/5 or 2.5/4 (no stars, letters, etc.)
    if isinstance(score, str) and re.match(r'^\d+(\.\d+)?/\d+(\.\d+)?$', score.strip()):
        try:
            num, denom = map(float, score.strip().split('/'))
            return round((num / denom) * 10, 2)
        except:
            return np.nan
    return np.nan

# Apply only to fraction type
df.loc[df['score_type'] == 'fraction', 'score_10'] = df.loc[df['score_type'] == 'fraction', 'originalScore'].apply(clean_fraction)

# -------------------------------
# STEP 3: Clean and Convert Grades to 10
# -------------------------------
grade_map = {
    'A+': 10, 'A': 9, 'A-': 8.5,
    'B+': 8, 'B': 7, 'B-': 6.5,
    'C+': 6, 'C': 5, 'C-': 4.5,
    'D+': 4, 'D': 3, 'D-': 2,
    'F': 1, 'F+': 1, 'F-': 1
}

def clean_grade(score):
    s = str(score).strip().upper()
    s = s.replace('PLUS', '+').replace('MINUS', '-').replace(' ', '')

    # Normalize aliases (like 'B PLUS', 'A- ', 'a', 'f+')
    return grade_map.get(s, np.nan)

# Apply only to grade type
df.loc[df['score_type'] == 'grade', 'score_10'] = df.loc[df['score_type'] == 'grade', 'originalScore'].apply(clean_grade)

# -------------------------------
# STEP 4: Final Cleaning
# -------------------------------
# Drop rows where score_10 is still NaN (invalid or unhandled cases)
df = df.dropna(subset=['score_10'])

# -------------------------------
# STEP 5: Summary Statistics
# -------------------------------
# Count unique scores and show
print("‚úÖ Final Score Distribution (scale of 10):\n")
print(df['score_10'].value_counts().sort_index())

# Optional: show unique final scores
print("\n‚úÖ Unique score_10 values used:")
print(sorted(df['score_10'].unique()))


‚úÖ Final Score Distribution (scale of 10):

score_10
0.00      1842
0.04         1
0.20         1
0.40         1
0.46         1
0.50         7
0.60         2
0.75         1
0.80         2
0.83         1
0.90         2
1.00      1191
1.20         2
1.25       571
1.40         1
1.50        32
1.60         4
1.67        28
1.70         1
1.80         4
1.90         3
1.94         1
2.00      8164
2.10         2
2.20         4
2.22         1
2.28         1
2.30         2
2.40         7
2.48         1
2.50      5273
2.57         1
2.60        10
2.65         1
2.70         5
2.74         1
2.75         3
2.80        12
2.84         1
2.90         4
3.00      5798
3.10         9
3.11         1
3.20        10
3.21         2
3.22         1
3.24         1
3.25         1
3.26         1
3.30         9
3.33       124
3.40        19
3.45         2
3.50       118
3.55         1
3.56         1
3.60        17
3.65         1
3.70         9
3.72         1
3.75      6814
3.76         1
3.80        15
3

In [16]:
# Step 1: Remove scores greater than 10
before = df.shape[0]
df = df[df['score_10'] <= 10]
after = df.shape[0]

print(f"‚úÖ Removed {before - after} rows where score_10 > 10.")

# Step 2: Check numeric range of score_10
min_score = df['score_10'].min()
max_score = df['score_10'].max()

print(f"\n‚úÖ Remaining score_10 values are in the range: {min_score} to {max_score}")

# Step 3: Confirm only numeric values are present
is_numeric = pd.api.types.is_numeric_dtype(df['score_10'])
print(f"\n‚úÖ Column `score_10` is numeric: {is_numeric}")

# Step 4 (optional): Final value counts
print("\n‚úÖ Final cleaned score distribution:")
print(df['score_10'].value_counts().sort_index())

‚úÖ Removed 28 rows where score_10 > 10.

‚úÖ Remaining score_10 values are in the range: 0.0 to 10.0

‚úÖ Column `score_10` is numeric: True

‚úÖ Final cleaned score distribution:
score_10
0.00      1842
0.04         1
0.20         1
0.40         1
0.46         1
0.50         7
0.60         2
0.75         1
0.80         2
0.83         1
0.90         2
1.00      1191
1.20         2
1.25       571
1.40         1
1.50        32
1.60         4
1.67        28
1.70         1
1.80         4
1.90         3
1.94         1
2.00      8164
2.10         2
2.20         4
2.22         1
2.28         1
2.30         2
2.40         7
2.48         1
2.50      5273
2.57         1
2.60        10
2.65         1
2.70         5
2.74         1
2.75         3
2.80        12
2.84         1
2.90         4
3.00      5798
3.10         9
3.11         1
3.20        10
3.21         2
3.22         1
3.24         1
3.25         1
3.26         1
3.30         9
3.33       124
3.40        19
3.45         2
3.50       118


In [17]:
print(f"‚úÖ Total number of rows in the cleaned dataset: {df.shape[0]}")


‚úÖ Total number of rows in the cleaned dataset: 198059


In [18]:
# Define path and save
output_path = '/content/drive/MyDrive/THESIS 47/Saved Datasets/All_languages_200k_data_save [Cleaned].csv'
df.to_csv(output_path, index=False)

print(f"‚úÖ File saved successfully to: {output_path}")


‚úÖ File saved successfully to: /content/drive/MyDrive/THESIS 47/Saved Datasets/All_languages_200k_data_save [Cleaned].csv


In [19]:
# Load dataset
file_path = '/content/drive/MyDrive/THESIS 47/Saved Datasets/All_languages_200k_data_save [Cleaned].csv'
df = pd.read_csv(file_path)

# Show number of columns and their names
print(f"‚úÖ Total number of columns: {df.shape[1]}")
print("üìå Column names:")
print(df.columns.tolist())


‚úÖ Total number of columns: 15
üìå Column names:
['id', 'title', 'reviewText', 'scoreSentiment', 'originalScore', 'audienceScore', 'tomatoMeter', 'runtimeMinutes', 'genre', 'originalLanguage', 'director', 'reviewId', 'movieLangNorm', 'score_type', 'score_10']


In [20]:
# Step 1: Drop unwanted columns
df.drop(columns=['score_type', 'originalScore'], inplace=True)

# Step 2: Rename 'score_10' to 'originalScore'
df.rename(columns={'score_10': 'originalScore'}, inplace=True)

# Step 3: Reorder columns: insert new 'originalScore' after 'scoreSentiment'
cols = df.columns.tolist()
# Remove 'originalScore' from end if exists
cols.remove('originalScore')
# Find index of 'scoreSentiment' and insert after it
insert_at = cols.index('scoreSentiment') + 1
cols.insert(insert_at, 'originalScore')

# Reorder DataFrame
df = df[cols]

# Confirm changes
print(f"‚úÖ Final column count: {df.shape[1]}")
print("üìå Final column order:")
print(df.columns.tolist())


‚úÖ Final column count: 13
üìå Final column order:
['id', 'title', 'reviewText', 'scoreSentiment', 'originalScore', 'audienceScore', 'tomatoMeter', 'runtimeMinutes', 'genre', 'originalLanguage', 'director', 'reviewId', 'movieLangNorm']


In [21]:

# Define the path and save the file
output_path = '/content/drive/MyDrive/THESIS 47/Saved Datasets/[Original Score Normalized] All_languages_200k_data_save.csv'
df.to_csv(output_path, index=False)

print(f"‚úÖ File saved successfully to: {output_path}")

‚úÖ File saved successfully to: /content/drive/MyDrive/THESIS 47/Saved Datasets/[Original Score Normalized] All_languages_200k_data_save.csv


In [22]:
print(f"‚úÖ Total number of rows in the cleaned dataset: {df.shape[0]}")


‚úÖ Total number of rows in the cleaned dataset: 198059
