In [1]:
import numpy as np
import pandas as pd
import tqdm as tlsd
import sklearn.naive_bayes as nb
import sklearn.feature_extraction.text as fe_text
import sklearn.metrics as metrics
import sklearn.preprocessing as preproc
import sklearn.multioutput as multiout

In [2]:
genre_options = [
    'action', 'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime',
    'documentary', 'family', 'fantasy', 'game-show', 'history', 'horror', 'music', 
    'musical', 'mystery', 'news', 'reality-tv', 'romance', 'sci-fi', 'short', 
    'sport', 'talk-show', 'thriller', 'war', 'western'
]


unknown_genre = 'Unknown'

In [5]:
try:
    print("Loading and preprocessing training data...")
    train_progress = tlsd.tqdm(total=50, desc="Loading Training Data")
    train_data = pd.read_csv('train_data.txt', sep=':::', header=None, 
                             names=['ID', 'TITLE', 'GENRE', 'SUMMARY'], engine='python')
    train_progress.update(50)
    train_progress.close()
except Exception as error:
    print(f"Error loading training data: {error}")
    raise

Loading and preprocessing training data...


Loading Training Data: 100%|███████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 72.15it/s]


In [6]:
train_summaries = train_data['SUMMARY'].astype(str).str.lower()
train_genres = [labels.split(', ') for labels in train_data['GENRE']]
mlb = preproc.MultiLabelBinarizer()
y_train_data = mlb.fit_transform(train_genres)

In [7]:
print("Vectorizing training data...")
tfidf_vectorizer = fe_text.TfidfVectorizer(max_features=5000)
tfidf_progress = tlsd.tqdm(total=50, desc="Vectorizing Training Data")
X_train_vectors = tfidf_vectorizer.fit_transform(train_summaries)
tfidf_progress.update(50)
tfidf_progress.close()

Vectorizing training data...


Vectorizing Training Data: 100%|███████████████████████████████████████████████████████| 50/50 [00:10<00:00,  4.72it/s]


In [8]:
# Train the multi-label classification model using Naive Bayes
print("Training model...")
classifier = multiout.MultiOutputClassifier(nb.MultinomialNB())
model_progress = tlsd.tqdm(total=50, desc="Training Model")
classifier.fit(X_train_vectors, y_train_data)
model_progress.update(50)
model_progress.close()

Training model...


Training Model: 100%|██████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 36.68it/s]


In [9]:
# Load and preprocess test data
try:
    print("Loading and preprocessing test data...")
    test_progress = tlsd.tqdm(total=50, desc="Loading Test Data")
    test_data = pd.read_csv('test_data.txt', sep=':::', header=None, 
                            names=['ID', 'TITLE', 'SUMMARY'], engine='python')
    test_progress.update(50)
    test_progress.close()
except Exception as error:
    print(f"Error loading test data: {error}")
    raise

Loading and preprocessing test data...


Loading Test Data: 100%|███████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 67.58it/s]


In [10]:
# Convert test summaries to lowercase and transform using TF-IDF
test_summaries = test_data['SUMMARY'].astype(str).str.lower()
print("Vectorizing test data...")
X_test_vectors = tfidf_vectorizer.transform(test_summaries)


Vectorizing test data...


In [11]:
# Predict genres for the test data
print("Predicting genres for test data...")
prediction_progress = tlsd.tqdm(total=50, desc="Making Predictions")
y_predicted = classifier.predict(X_test_vectors)
prediction_progress.update(50)
prediction_progress.close()

Predicting genres for test data...


Making Predictions: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 53.20it/s]


In [14]:
# Save predicted genres to an output file instead of printing
predicted_genres_list = mlb.inverse_transform(y_predicted)
with open("predicted_genres_output.txt", "w", encoding="utf-8") as output_file:
    for idx, title in enumerate(test_data['TITLE']):
        genres = predicted_genres_list[idx] if predicted_genres_list[idx] else [unknown_genre]
        genre_output = ', '.join(genres)
        output_file.write(f"{title} ::: {genre_output}\n")

print("Predicted genres have been saved to 'predicted_genres_output.txt'.")


Predicted genres have been saved to 'predicted_genres_output.txt'.


In [19]:
# Evaluate model performance on the training set
print("\nCalculating evaluation metrics...")
y_train_pred = classifier.predict(X_train_vectors)
accuracy = metrics.accuracy_score(y_train_data, y_train_pred)
precision = metrics.precision_score(y_train_data, y_train_pred, average='micro')
recall = metrics.recall_score(y_train_data, y_train_pred, average='micro')
f1 = metrics.f1_score(y_train_data, y_train_pred, average='micro')

# Print the model evaluation metrics


print("Genre predictions and model metrics displayed successfully.")


Calculating evaluation metrics...
Genre predictions and model metrics displayed successfully.


In [20]:
# Directly define a movie plot for genre prediction
movie_plot_input = "The film's title refers not only to the un-recovered bodies at ground zero, but also to the state of the nation at large. Set in the hallucinatory period of time between September 11 and Halloween of 2001, The Unrecovered examines the effect of terror on the average mind, the way a state of heightened anxiety and/or alertness can cause the average person to make the sort of imaginative connections that are normally made only by artists and conspiracy theorists-both of whom figure prominently in this film. The Unrecovered explores the way in which irony, empathy, and paranoia relate to one another in the wake of 9/11." 

# Preprocess and vectorize the input plot
movie_plot_processed = [movie_plot_input.lower()]  # Convert to lowercase for consistency
movie_plot_vector = tfidf_vectorizer.transform(movie_plot_processed)

# Predict genres for the input plot
predicted_genres = classifier.predict(movie_plot_vector)
predicted_genres_list = mlb.inverse_transform(predicted_genres)

# Display the predicted genres
if predicted_genres_list[0]:
    genres_output = ', '.join(predicted_genres_list[0])
else:
    genres_output = unknown_genre

print(f"Predicted Genres for the given plot: {genres_output}")


Predicted Genres for the given plot:  documentary 
