# **Dataset Cleaning/Processing Steps**

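Right now we only use the action, romance, and horror CSV files. A movie that appears in more than one of these three files is kept only once, labelled with the first file it was loaded from (load order: action, horror, romance); its other copies are dropped.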

In [2]:
import pandas as pd
import os

In [4]:
# Show the kernel's current working directory; the cells below read and write
# files through relative 'data/...' paths, so this is where they resolve from.
os.getcwd()

'C:\\Users\\rebec\\Duke\\ECE684\\FilmGenreClassification\\data\\imdb_arh'

In [6]:
# Change directory to the project root, but only when we are not already there.
# A bare os.chdir('../../') climbs two more levels every time this cell is
# re-run, after which the relative 'data/imdb/...' paths used by later cells
# no longer resolve. The presence of data/imdb is used as the "already at
# root" marker because that is the directory the loading cells read from.
if not os.path.isdir(os.path.join('data', 'imdb')):
    os.chdir('../../')  #Change directory to project root

In [8]:
genres = ['action', 'romance', 'horror']

# Number of rows in each raw per-genre CSV, keyed by genre name.
genre_counts = {
    name: len(pd.read_csv(f'data/imdb/{name}.csv'))
    for name in genres
}

for name, n_rows in genre_counts.items():
    print(f"{name}: {n_rows} movies")

action: 52452 movies
romance: 52617 movies
horror: 36682 movies


In [9]:
import pandas as pd

genres = ['action', 'romance', 'horror']
true_counts = {}

for genre in genres:
    df = pd.read_csv(f'data/imdb/{genre}.csv')
    needle = genre.lower()
    # A row's 'genre' cell can list several genres in one string, so count the
    # rows whose text contains this file's genre (case-insensitive substring).
    has_label = df['genre'].map(lambda cell: needle in str(cell).lower())
    true_counts[genre] = has_label.sum()

for genre, n_labeled in true_counts.items():
    print(f"{genre}: {n_labeled} movies actually labeled as {genre}")


action: 52404 movies actually labeled as action
romance: 46520 movies actually labeled as romance
horror: 35643 movies actually labeled as horror


In [10]:
# Order matters downstream: frames are appended in this order, so a movie that
# appears in several files shows up first under the earliest genre listed here.
genres = ['action','horror', 'romance']
kept_columns = ['movie_id', 'movie_name', 'description', 'genre']

dfs = []

for genre in genres:
    raw = pd.read_csv(f'data/imdb/{genre}.csv')
    print(f"\n--- {genre}.csv original ---")
    print(raw.head())
    print(f"Length: {len(raw)}")

    # Rows whose 'genre' text mentions the genre this CSV is named after.
    needle = genre.lower()
    mentions_genre = raw['genre'].map(lambda cell: needle in str(cell).lower())

    # Filter rows, narrow to the columns we keep, drop rows without a
    # description, then tag every surviving row with its source CSV's genre.
    df = (
        raw[mentions_genre][kept_columns]
        .dropna(subset=['description'])
        .assign(csv_genre=genre)
    )

    print(f"\n--- {genre}.csv after filtering ---")
    print(df.head())
    print(f"Length: {len(df)}")

    dfs.append(df)


--- action.csv original ---
    movie_id                         movie_name  year certificate  runtime  \
0  tt9114286     Black Panther: Wakanda Forever  2022       PG-13  161 min   
1  tt1630029           Avatar: The Way of Water  2022       PG-13  192 min   
2  tt5884796                              Plane  2023           R  107 min   
3  tt6710474  Everything Everywhere All at Once  2022           R  139 min   
4  tt5433140                             Fast X  2023         NaN      NaN   

                        genre  rating  \
0    Action, Adventure, Drama     6.9   
1  Action, Adventure, Fantasy     7.8   
2            Action, Thriller     6.5   
3   Action, Adventure, Comedy     8.0   
4      Action, Crime, Mystery     NaN   

                                         description  \
0  The people of Wakanda fight to protect their h...   
1  Jake Sully lives with his newfound family form...   
2  A pilot finds himself caught in a war zone aft...   
3  A middle-aged Chinese immigr

In [11]:
# Stack the per-genre frames into one table with a fresh 0..n-1 index.
all_data = pd.concat(dfs, axis=0, ignore_index=True)

print(
    "\nCombined dataframe",
    all_data.head(),
    f"Length: {len(all_data)}",
    sep="\n",
)


Combined dataframe
    movie_id                         movie_name  \
0  tt9114286     Black Panther: Wakanda Forever   
1  tt1630029           Avatar: The Way of Water   
2  tt5884796                              Plane   
3  tt6710474  Everything Everywhere All at Once   
4  tt5433140                             Fast X   

                                         description  \
0  The people of Wakanda fight to protect their h...   
1  Jake Sully lives with his newfound family form...   
2  A pilot finds himself caught in a war zone aft...   
3  A middle-aged Chinese immigrant is swept up in...   
4  Dom Toretto and his family are targeted by the...   

                        genre csv_genre  
0    Action, Adventure, Drama    action  
1  Action, Adventure, Fantasy    action  
2            Action, Thriller    action  
3   Action, Adventure, Comedy    action  
4      Action, Crime, Mystery    action  
Length: 134567


In [12]:
# Flag every row whose movie_id occurs more than once; keep=False marks all
# copies, not just the later ones, so each repeated movie is shown in full.
repeated_id = all_data.duplicated(subset=['movie_id'], keep=False)
duplicates = all_data[repeated_id]

if duplicates.empty:
    print("\nNo duplicates found.")
else:
    print("\n--- Duplicates found (movies in multiple CSVs) ---")
    print(duplicates.sort_values('movie_id'))
    print(f"Number of duplicate rows: {len(duplicates)}")


--- Duplicates found (movies in multiple CSVs) ---
         movie_id            movie_name  \
127024  tt0003747           Cameo Kirby   
29624   tt0003747           Cameo Kirby   
112514  tt0004545    Rose of the Rancho   
22296   tt0004545    Rose of the Rancho   
13094   tt0004635         The Squaw Man   
...           ...                   ...   
2774    tt9894470                   VFW   
79579   tt9899284  Return of the Binman   
129808  tt9899284  Return of the Binman   
74639   tt9904270         Get Rid of It   
30707   tt9904270         Get Rid of It   

                                              description  \
127024  "Cameo" Kirby, so called because of his fondne...   
29624   "Cameo" Kirby, so called because of his fondne...   
112514  Esra Kincaid takes land by force, and having t...   
22296   Esra Kincaid takes land by force, and having t...   
13094   A chivalrous British officer takes the blame f...   
...                                                   ...   
2774

In [14]:
# Keep exactly one row per movie_id: the earliest occurrence in all_data's row
# order. Later copies are dropped, so a repeated movie keeps the csv_genre of
# the row that comes first rather than being removed altogether.
is_first_occurrence = ~all_data.duplicated(subset=['movie_id'], keep='first')
clean_data = all_data[is_first_occurrence].reset_index(drop=True)

n_unique_movies = len(clean_data)
print("Length after dropping duplicates:", n_unique_movies)
print(clean_data.head())
print("\nGenre distribution:")
genre_distribution = clean_data['csv_genre'].value_counts()
print(genre_distribution)

Length after dropping duplicates: 129643
    movie_id                         movie_name  \
0  tt9114286     Black Panther: Wakanda Forever   
1  tt1630029           Avatar: The Way of Water   
2  tt5884796                              Plane   
3  tt6710474  Everything Everywhere All at Once   
4  tt5433140                             Fast X   

                                         description  \
0  The people of Wakanda fight to protect their h...   
1  Jake Sully lives with his newfound family form...   
2  A pilot finds himself caught in a war zone aft...   
3  A middle-aged Chinese immigrant is swept up in...   
4  Dom Toretto and his family are targeted by the...   

                        genre csv_genre  
0    Action, Adventure, Drama    action  
1  Action, Adventure, Fantasy    action  
2            Action, Thriller    action  
3   Action, Adventure, Comedy    action  
4      Action, Crime, Mystery    action  

Genre distribution:
csv_genre
action     52404
romance    4377

## Drop rows whose description is shorter than 100 characters (e.g. the placeholder "Add a Plot")

In [22]:
# Drop entries with very short or placeholder descriptions.
# The threshold is named so the comment and the code cannot drift apart again
# (the previous comment said 20 characters while the filter used 100).
MIN_DESCRIPTION_CHARS = 100

# Normalise to stripped strings first so leading/trailing whitespace does not
# count toward the length.
descriptions = clean_data['description'].astype(str).str.strip()

# Keep only rows where description length >= MIN_DESCRIPTION_CHARS characters.
# assign() builds a new frame instead of overwriting the column in place on
# the frame produced by the previous cell.
clean_data = (
    clean_data.assign(description=descriptions)[descriptions.str.len() >= MIN_DESCRIPTION_CHARS]
    .reset_index(drop=True)
)

print("Remaining rows:", len(clean_data))

Remaining rows: 85617


In [24]:
# Fix the output column order and rename the raw multi-genre string column so
# it is not confused with the single csv_genre label.
output_columns = ['movie_id', 'movie_name', 'description', 'genre', 'csv_genre']
clean_data = clean_data[output_columns].rename(columns={'genre': 'genre_list'})

print(clean_data.head())
print(f"Length: {len(clean_data)}")

     movie_id                         movie_name  \
0   tt9114286     Black Panther: Wakanda Forever   
1   tt1630029           Avatar: The Way of Water   
2   tt5884796                              Plane   
3   tt6710474  Everything Everywhere All at Once   
4  tt10954600  Ant-Man and the Wasp: Quantumania   

                                         description  \
0  The people of Wakanda fight to protect their h...   
1  Jake Sully lives with his newfound family form...   
2  A pilot finds himself caught in a war zone aft...   
3  A middle-aged Chinese immigrant is swept up in...   
4  Scott Lang and Hope Van Dyne, along with Hank ...   

                   genre_list csv_genre  
0    Action, Adventure, Drama    action  
1  Action, Adventure, Fantasy    action  
2            Action, Thriller    action  
3   Action, Adventure, Comedy    action  
4   Action, Adventure, Comedy    action  
Length: 85617


In [26]:
# Make 70/15/15 train/val/test split: hold out 30% first, then halve the
# held-out part into validation and test.

from sklearn.model_selection import train_test_split


def _stratified_holdout(frame, holdout_fraction):
    """Split `frame` into (kept, held_out), stratified on 'csv_genre'.

    `holdout_fraction` is passed through as `test_size`; the random state is
    fixed at 42 so the split is repeatable.
    """
    return train_test_split(
        frame,
        test_size=holdout_fraction,
        random_state=42,
        stratify=frame['csv_genre'],
    )


train_df, temp_df = _stratified_holdout(clean_data, 0.3)
val_df, test_df = _stratified_holdout(temp_df, 0.5)

In [27]:
# Report the row count of every split, then the per-genre label counts.
splits = {"Train": train_df, "Val": val_df, "Test": test_df}

for label, frame in splits.items():
    print(f"{label} size:", len(frame))

for label, frame in splits.items():
    print(f"\n{label} genre distribution:")
    print(frame['csv_genre'].value_counts())

Train size: 59931
Val size: 12843
Test size: 12843

Train genre distribution:
csv_genre
action     23070
romance    20403
horror     16458
Name: count, dtype: int64

Val genre distribution:
csv_genre
action     4943
romance    4373
horror     3527
Name: count, dtype: int64

Test genre distribution:
csv_genre
action     4944
romance    4372
horror     3527
Name: count, dtype: int64


In [30]:
# Save to CSVs
out_dir = "data/imdb_arh"
os.makedirs(out_dir, exist_ok=True)

# One file per split, written without the DataFrame index column.
split_files = {
    "imdb_arh_train.csv": train_df,
    "imdb_arh_val.csv": val_df,
    "imdb_arh_test.csv": test_df,
}
for filename, frame in split_files.items():
    frame.to_csv(os.path.join(out_dir, filename), index=False)

print(f"\nSaved CSVs to {out_dir}/")


Saved CSVs to data/imdb_arh/
