# Movie Genre Predictor

In [1]:
#import libraries 
import pandas as pd # data preprocessing 
import numpy as np # linear algebra 
import kagglehub # data importing 

import nltk # tokenizing 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

import contractions 

nltk.download('stopwords') 
nltk.download('punkt_tab')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))

from sklearn.feature_extraction.text import TfidfVectorizer #Vectorizer
from sklearn.model_selection import train_test_split #to get validation dataset

from sklearn.linear_model import LogisticRegression #training
from sklearn.metrics import classification_report, confusion_matrix

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shery\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\shery\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shery\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Download latest version
path = kagglehub.dataset_download("guru001/movie-genre-prediction")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\shery\.cache\kagglehub\datasets\guru001\movie-genre-prediction\versions\1


In [3]:
# Read the dataset, drop the id column 
df = pd.read_csv('./data/train.csv')
df.drop(columns="id")

Unnamed: 0,movie_name,synopsis,genre
0,Super Me,A young scriptwriter starts bringing valuable ...,fantasy
1,Entity Project,A director and her friends renting a haunted h...,horror
2,Behavioral Family Therapy for Serious Psychiat...,This is an educational video for families and ...,family
3,Blood Glacier,Scientists working in the Austrian Alps discov...,scifi
4,Apat na anino,Buy Day - Four Men Widely - Apart in Life - By...,action
...,...,...,...
53995,Untitled Monkeyshine/Ugly Brother Project,Two loser pest control buddies find themselves...,scifi
53996,The Great Swindle,A seductive woman gets involved in relationshi...,thriller
53997,When Men Get Pregnant,"Duyen, a wedding dress staff, who decides to c...",romance
53998,Suttu Pidikka Utharavu,The people of a crowded colony in Coimbatore c...,action


In [4]:
# Remove punctuation 
def remove_punctuation(text): 
    punctuation =  ",./;[]-=\\~!@#$%^&*()_+{}|\":?><"
    cleaned = "".join([c for c in text if c not in punctuation])
    return cleaned 

df["synopsis"] = df["synopsis"].apply(lambda x: remove_punctuation(x))
df.head(10)

Unnamed: 0,id,movie_name,synopsis,genre
0,44978,Super Me,A young scriptwriter starts bringing valuable ...,fantasy
1,50185,Entity Project,A director and her friends renting a haunted h...,horror
2,34131,Behavioral Family Therapy for Serious Psychiat...,This is an educational video for families and ...,family
3,78522,Blood Glacier,Scientists working in the Austrian Alps discov...,scifi
4,2206,Apat na anino,Buy Day Four Men Widely Apart in Life By Ni...,action
5,46636,Le démon dans l'île,On an island things start going crazy as the i...,horror
6,53777,Candid,A video voyeur stalks women in the city with a...,horror
7,24872,Hired,Twin brothers separated at birth and worlds ap...,crime
8,17224,Miruthan,A traffic police officer teams up with his fri...,adventure
9,54900,Until You See Me,A legendary tale unravels,mystery


In [5]:
#tokenize text & remove contractions 
def tokenize(text):
    text = contractions.fix(text)
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.lower() not in stop_words]
    return tokens

df["synopsis"] = df["synopsis"].apply(tokenize)

df["synopsis"].head(10)


0    [young, scriptwriter, starts, bringing, valuab...
1    [director, friends, renting, haunted, house, c...
2    [educational, video, families, family, therapi...
3    [scientists, working, austrian, alps, discover...
4    [buy, day, four, men, widely, apart, life, nig...
5    [island, things, start, going, crazy, islands,...
6    [video, voyeur, stalks, women, city, digital, ...
7    [twin, brothers, separated, birth, worlds, apa...
8    [traffic, police, officer, teams, friend, doct...
9                          [legendary, tale, unravels]
Name: synopsis, dtype: object

In [6]:
#lemmatize -> finds true base word 
def lemmatize(text):
    lem_text = [WordNetLemmatizer().lemmatize(i) for i in text]
    return lem_text 

df["synopsis"] = df["synopsis"].apply(lambda x : lemmatize(x))
df.head(10)

Unnamed: 0,id,movie_name,synopsis,genre
0,44978,Super Me,"[young, scriptwriter, start, bringing, valuabl...",fantasy
1,50185,Entity Project,"[director, friend, renting, haunted, house, ca...",horror
2,34131,Behavioral Family Therapy for Serious Psychiat...,"[educational, video, family, family, therapist...",family
3,78522,Blood Glacier,"[scientist, working, austrian, alp, discover, ...",scifi
4,2206,Apat na anino,"[buy, day, four, men, widely, apart, life, nig...",action
5,46636,Le démon dans l'île,"[island, thing, start, going, crazy, island, h...",horror
6,53777,Candid,"[video, voyeur, stalk, woman, city, digital, c...",horror
7,24872,Hired,"[twin, brother, separated, birth, world, apart...",crime
8,17224,Miruthan,"[traffic, police, officer, team, friend, docto...",adventure
9,54900,Until You See Me,"[legendary, tale, unravels]",mystery


In [7]:
# Vectorize 

vectorizer = TfidfVectorizer(max_features=5000)  # limit vocab size if big
X_tfidf = vectorizer.fit_transform(df["synopsis"].apply(lambda x: " ".join(x)))

print(X_tfidf.shape)  # (num_samples, vocab_size)

(54000, 5000)


In [8]:
y = df["genre"]
X_train, X_val, y_train, y_val = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


In [9]:
# Train
clf = LogisticRegression(max_iter=200, class_weight="balanced")
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_val)

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

      action       0.28      0.25      0.26      1094
   adventure       0.27      0.24      0.25      1067
       crime       0.37      0.39      0.38      1134
      family       0.39      0.46      0.42      1049
     fantasy       0.30      0.28      0.29      1057
      horror       0.40      0.43      0.41      1116
     mystery       0.30      0.28      0.29      1074
     romance       0.48      0.58      0.52      1075
       scifi       0.40      0.44      0.42      1077
    thriller       0.22      0.16      0.19      1057

    accuracy                           0.35     10800
   macro avg       0.34      0.35      0.34     10800
weighted avg       0.34      0.35      0.34     10800

