In [1]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [10]:
#loding dataset
df = pd.read_csv("F:/tmdb_5000_movies.csv") 
print(df.columns)


Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')


In [28]:
df.head()

Unnamed: 0,plot,genre
0,"In the 22nd century, a paraplegic Marine is di...",Action
1,"Captain Barbossa, long believed to be dead, ha...",Adventure
2,A cryptic message from Bond’s past sends him o...,Action
3,Following the death of District Attorney Harve...,Action
4,"John Carter is a war-weary, former military ca...",Action


In [11]:
#Extract plot and genres
df = df[['overview', 'genres']].dropna()


In [14]:
#Parse JSON-like genre strings to extract the first genre
import json
def extract_primary_genre(genre_str):
    genres = json.loads(genre_str.replace("'", '"'))
    if len(genres) > 0:
        return genres[0]['name']
    return None

In [15]:
df['genre'] = df['genres'].apply(extract_primary_genre)
df = df[['overview', 'genre']].dropna()
df.columns = ['plot', 'genre']


In [16]:
# Feature & Label
X = df['plot']
y = df['genre']

In [17]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)
X_vec = tfidf.fit_transform(X)


In [18]:
#Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)


In [19]:
#Model Training
model = LogisticRegression(max_iter=300)
model.fit(X_train, y_train)


In [20]:
#Evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.40209424083769635

Classification Report:
                  precision    recall  f1-score   support

         Action       0.46      0.54      0.50       153
      Adventure       0.40      0.03      0.05        76
      Animation       0.00      0.00      0.00        20
         Comedy       0.42      0.66      0.51       203
          Crime       0.00      0.00      0.00        40
    Documentary       0.00      0.00      0.00        12
          Drama       0.36      0.69      0.48       232
         Family       0.00      0.00      0.00        13
        Fantasy       0.00      0.00      0.00        21
        History       0.00      0.00      0.00         5
         Horror       0.62      0.07      0.12        75
          Music       0.00      0.00      0.00         9
        Mystery       0.00      0.00      0.00        12
        Romance       0.00      0.00      0.00        21
Science Fiction       0.00      0.00      0.00        20
       Thriller       0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
#Save Model and Vectorizer
import pickle
with open("movie_genre_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)