# Movie Genre Predictor

In [1]:
#import libraries 
import kagglehub # data importing 
import pandas as pd # data preprocessing 
import os 
from sklearn.model_selection import train_test_split

import contractions 
import nltk # tokenizing 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# Fetch the NLTK resources used below: the stop-word list, the punkt_tab
# tokenizer data and the WordNet corpus.
for nltk_resource in ("stopwords", "punkt_tab", "wordnet"):
    nltk.download(nltk_resource)

# English stop words held in a set for fast membership checks during tokenizing.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shery\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\shery\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shery\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Download the Kaggle dataset through kagglehub and report where it landed.
dataset_handle = "guru001/movie-genre-prediction"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: C:\Users\shery\.cache\kagglehub\datasets\guru001\movie-genre-prediction\versions\1


In [3]:
# Load the training CSV and discard its "id" column in one chained expression.
# NOTE(review): this reads a local ./data copy rather than the `path` returned
# by the download cell -- confirm the file has been placed there.
df = pd.read_csv('./data/train.csv').drop(columns="id")

In [4]:
# Remove punctuation
def remove_punctuation(text):
    """Return ``text`` with every character from a fixed punctuation set removed.

    The apostrophe is not in the set, so words such as "don't" keep it.

    Args:
        text: String (or iterable of characters) to clean.

    Returns:
        A new string made of the characters that were kept, in order.
    """
    unwanted_chars = ",./;[]-=\\~!@#$%^&*()_+{}|\":?><"
    kept_chars = []
    for char in text:
        if char in unwanted_chars:
            continue  # drop punctuation
        kept_chars.append(char)
    return "".join(kept_chars)

# Strip punctuation from every synopsis, then preview the first rows.
df["synopsis"] = df["synopsis"].apply(remove_punctuation)
df.head(10)

Unnamed: 0,movie_name,synopsis,genre
0,Super Me,A young scriptwriter starts bringing valuable ...,fantasy
1,Entity Project,A director and her friends renting a haunted h...,horror
2,Behavioral Family Therapy for Serious Psychiat...,This is an educational video for families and ...,family
3,Blood Glacier,Scientists working in the Austrian Alps discov...,scifi
4,Apat na anino,Buy Day Four Men Widely Apart in Life By Ni...,action
5,Le démon dans l'île,On an island things start going crazy as the i...,horror
6,Candid,A video voyeur stalks women in the city with a...,horror
7,Hired,Twin brothers separated at birth and worlds ap...,crime
8,Miruthan,A traffic police officer teams up with his fri...,adventure
9,Until You See Me,A legendary tale unravels,mystery


In [5]:
#tokenize text & remove contractions
def tokenize(text):
    """Expand contractions, split into word tokens, lowercase, drop stop words.

    Args:
        text: Synopsis string to tokenize.

    Returns:
        List of lowercase tokens that are not in ``stop_words``.
    """
    expanded_text = contractions.fix(text)
    kept_tokens = []
    for raw_token in word_tokenize(expanded_text):
        lowered = raw_token.lower()
        if lowered not in stop_words:
            kept_tokens.append(lowered)
    return kept_tokens

# Replace each synopsis string with its list of tokens, then preview.
tokenized_synopses = df["synopsis"].apply(tokenize)
df["synopsis"] = tokenized_synopses

df["synopsis"].head(10)


0    [young, scriptwriter, starts, bringing, valuab...
1    [director, friends, renting, haunted, house, c...
2    [educational, video, families, family, therapi...
3    [scientists, working, austrian, alps, discover...
4    [buy, day, four, men, widely, apart, life, nig...
5    [island, things, start, going, crazy, islands,...
6    [video, voyeur, stalks, women, city, digital, ...
7    [twin, brothers, separated, birth, worlds, apa...
8    [traffic, police, officer, teams, friend, doct...
9                          [legendary, tale, unravels]
Name: synopsis, dtype: object

In [6]:
#lemmatize: finds true base word
def lemmatize(text, pos="n"):
    """Lemmatize every token in ``text`` with NLTK's WordNet lemmatizer.

    Args:
        text: Iterable of token strings (e.g. the list produced by tokenizing
            a synopsis).
        pos: WordNet part-of-speech tag handed to the lemmatizer. The default
            "n" (noun) keeps the previous behaviour, since that is the
            lemmatizer's own default; pass "v", "a", "r" or "s" to reduce
            verbs, adjectives, adverbs or satellite adjectives instead.

    Returns:
        List of lemmatized tokens, in the same order as the input.
    """
    # Build the lemmatizer once per call rather than once per token.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, pos) for token in text]

# Lemmatize each token list, then preview the first rows.
df["synopsis"] = df["synopsis"].apply(lemmatize)
df.head(10)

Unnamed: 0,movie_name,synopsis,genre
0,Super Me,"[young, scriptwriter, start, bringing, valuabl...",fantasy
1,Entity Project,"[director, friend, renting, haunted, house, ca...",horror
2,Behavioral Family Therapy for Serious Psychiat...,"[educational, video, family, family, therapist...",family
3,Blood Glacier,"[scientist, working, austrian, alp, discover, ...",scifi
4,Apat na anino,"[buy, day, four, men, widely, apart, life, nig...",action
5,Le démon dans l'île,"[island, thing, start, going, crazy, island, h...",horror
6,Candid,"[video, voyeur, stalk, woman, city, digital, c...",horror
7,Hired,"[twin, brother, separated, birth, world, apart...",crime
8,Miruthan,"[traffic, police, officer, team, friend, docto...",adventure
9,Until You See Me,"[legendary, tale, unravels]",mystery


In [None]:
#Create new csv of cleaned dataset
# Join each token list back into one space-separated string.
df["cleaned_synopsis"] = df["synopsis"].apply(" ".join)

#Create new clean_data folder if missing
folder = "../clean_data"
os.makedirs(folder, exist_ok=True)

#Save clean data to new folder
# Write into the directory created above. The previous hard-coded
# "clean_data/..." path pointed at a different directory (relative to the
# current working directory), which is not the one that was just created.
cleaned_movies_path = os.path.join(folder, "cleaned_movies.csv")
df[["movie_name", "cleaned_synopsis", "genre"]].to_csv(cleaned_movies_path, index=False)


In [None]:
#Split training data into train & validation
# 80/20 split with a fixed seed so the partition is reproducible.
X, y = df["cleaned_synopsis"], df["genre"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Rebuild one two-column frame per split.
train_df = pd.DataFrame({"cleaned_synopsis": X_train, "genre": y_train})
val_df = pd.DataFrame({"cleaned_synopsis": X_val, "genre": y_val})

# Save train/val separately (the ../clean_data directory must already exist).
for split_frame, csv_path in (
    (train_df, "../clean_data/train.csv"),
    (val_df, "../clean_data/val.csv"),
):
    split_frame.to_csv(csv_path, index=False)