Importing modules

In [20]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import re
import nltk

# Ensure nltk data is available
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Function to preprocess text
def preprocess_text(text):
    text = text.lower()  # Lowercase text
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\W', ' ', text)  # Remove non-alphanumeric characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    tokens = word_tokenize(text)  # Tokenize text
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


loading dataset & preprocessing

In [21]:
# Load the cleaned dataset
df = pd.read_csv('cleaned_movies_metadata.csv')
# Handle NaN values in 'overview' column
df['overview'] = df['overview'].fillna('')

# Preprocess the 'plot_summary' column
df["Processed_Summary"] = df["overview"].apply(preprocess_text)

# Ensure there are no missing values in 'Processed_Summary' and 'genres'
df = df.dropna(subset=['Processed_Summary', 'genres'])

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['Processed_Summary']).toarray()

Setting target variable and training the model

In [22]:
# Target variable
y = df['genres']

# Encode the target variable
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

# Check for any NaN values in X and y (although it should be clean now)
assert not pd.DataFrame(X).isnull().any().any(), "X contains NaN values"
assert not pd.Series(y_encoded).isnull().any(), "y contains NaN values"

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Model training using Multinomial Naive Bayes
model = MultinomialNB()
model.fit(X_train, y_train)

Evaluating model

In [23]:
# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Identify the unique classes in the test set
unique_test_classes = sorted(pd.Series(y_test).unique())

# Use only the unique classes in the test set for the target names
target_names = encoder.inverse_transform(unique_test_classes)
report = classification_report(y_test, y_pred, target_names=target_names)

print(f'Accuracy: {accuracy}')
print(report)

# Debugging: Check unique classes in encoder and in y_test
print(f"Classes in encoder: {list(encoder.classes_)}")
print(f"Classes in y_test: {unique_test_classes}")

Accuracy: 0.19907030796048808
                                                                                      precision    recall  f1-score   support

                                                                              Action       0.33      0.02      0.04        53
                                                                   Action, Adventure       0.00      0.00      0.00        19
                                        Action, Adventure, Animation, Comedy, Family       0.00      0.00      0.00         1
                               Action, Adventure, Animation, Comedy, Science Fiction       0.00      0.00      0.00         1
                                                Action, Adventure, Animation, Family       0.00      0.00      0.00         3
                                       Action, Adventure, Animation, Family, Fantasy       0.00      0.00      0.00         1
                                       Action, Adventure, Animation, Family, Mystery   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Usage

In [24]:
# Example usage: predicting genre for a new movie plot
new_plot = input("Enter plot:")
processed_plot = preprocess_text(new_plot)
vectorized_plot = vectorizer.transform([processed_plot]).toarray()
predicted_genre_encoded = model.predict(vectorized_plot)

# Ensure the predicted genre is among the classes known to the encoder
if predicted_genre_encoded[0] in unique_test_classes:
    predicted_genre = encoder.inverse_transform(predicted_genre_encoded)
    print(f"Predicted Genre: {predicted_genre[0]}")
else:
    print(f"Predicted Genre (unseen): {predicted_genre_encoded[0]}")

Enter plot:A mischievous young boy, Tom Sawyer, witnesses a murder by the deadly Injun Joe. Tom becomes friends with Huckleberry Finn, a boy with no future and no family. Tom has to choose between honoring a friendship or honoring an oath because the town alcoholic is accused of the murder. Tom and Huck go through several adventures trying to retrieve evidence.
Predicted Genre: Drama
