In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import joblib

In [2]:
# Load the training data
df = pd.read_csv("data/train_data.csv")

In [3]:
# Clean missing values (i.e., rows with missing overviews or genres)
df.dropna(subset=["overview", "genre"], inplace=True)

In [4]:
# Encode genre labels (convert genre text to numeric labels)
label_encoder = LabelEncoder()
df["genre_encoded"] = label_encoder.fit_transform(df["genre"])

In [5]:
# Vectorize the "overview" text using TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X = vectorizer.fit_transform(df["overview"])

In [6]:
# Target variable: genre labels (numeric)
y = df["genre_encoded"]

In [7]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Train a Naive Bayes model on the training set
model = MultinomialNB()
model.fit(X_train, y_train)

In [10]:
# Predict genres on the test set and evaluate the model
y_pred = model.predict(X_test)
print("✅ Model trained successfully!")
print("Accuracy:", accuracy_score(y_test, y_pred))

# Modify the classification report to handle zero divisions
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))


✅ Model trained successfully!
Accuracy: 0.52310246241815

Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.08      0.14       263
           1       0.88      0.06      0.12       112
           2       0.29      0.03      0.05       139
           3       0.00      0.00      0.00       104
           4       0.00      0.00      0.00        61
           5       0.51      0.44      0.47      1443
           6       0.00      0.00      0.00       107
           7       0.58      0.88      0.70      2659
           8       0.46      0.83      0.59      2697
           9       1.00      0.01      0.01       150
          10       0.00      0.00      0.00        74
          11       1.00      0.15      0.26        40
          12       0.00      0.00      0.00        45
          13       0.73      0.38      0.50       431
          14       0.79      0.10      0.18       144
          15       0.00      0.00      0.00        50

In [11]:
# Save the model, vectorizer, and label encoder for later use
joblib.dump(model, "models/naive_bayes_model.pkl")
joblib.dump(vectorizer, "models/tfidf_vectorizer.pkl")
joblib.dump(label_encoder, "models/label_encoder.pkl")
print("✅ Model, vectorizer, and encoder saved!")

✅ Model, vectorizer, and encoder saved!


In [12]:
# Load test data (a separate dataset for predictions)
test_df = pd.read_csv("data/test_data.txt", sep=" ::: ", engine="python", header=None, names=["id", "title", "overview"])

In [13]:
# Clean missing overviews in the test dataset
test_df.dropna(subset=["overview"], inplace=True)

In [14]:
# Vectorize the test data using the same vectorizer
X_test_final = vectorizer.transform(test_df["overview"])

In [15]:
# Predict genres for the test data
test_preds = model.predict(X_test_final)

In [16]:
# Decode the predicted genres back to their original names
test_df["predicted_genre"] = label_encoder.inverse_transform(test_preds)

In [17]:
# Display predictions for the test data
print("\n🎬 Predicted genres for test data:\n")
print(test_df[["id", "title", "predicted_genre"]].head())


🎬 Predicted genres for test data:

   id                        title predicted_genre
0   1         Edgar's Lunch (1998)           drama
1   2     La guerra de papá (1977)           drama
2   3  Off the Beaten Track (2010)     documentary
3   4       Meu Amigo Hindu (2015)           drama
4   5            Er nu zhai (1955)           drama


In [18]:
# Save the predictions to a CSV file
test_df.to_csv("data/test_data_with_predictions.csv", index=False)

In [19]:
# Function to load the saved model, vectorizer, and label encoder for prediction
def load_model():
    model = joblib.load("models/naive_bayes_model.pkl")
    vectorizer = joblib.load("models/tfidf_vectorizer.pkl")
    label_encoder = joblib.load("models/label_encoder.pkl")
    return model, vectorizer, label_encoder

In [20]:
# Function to predict the genre of a movie description
def predict_genre(description):
    model, vectorizer, label_encoder = load_model()
    description_vectorized = vectorizer.transform([description])
    genre_encoded = model.predict(description_vectorized)[0]
    genre = label_encoder.inverse_transform([genre_encoded])[0]
    return genre

In [None]:
# Test the prediction function with user input
user_input = input("Enter a movie description: ")