In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report



In [23]:
# Load the training data.
# Each usable record has four " ::: "-separated fields; field 2 is read as the
# genre label and field 3 as the plot text.
train_plots = []
train_genres = []

with open("train_data.txt", "r", encoding="utf-8") as file:
    for line in file:
        # maxsplit=3 keeps the plot in one piece even if the description itself
        # contains " ::: ". An unbounded split would produce more than four
        # fields for such a record and it would be dropped without notice.
        parts = line.split(" ::: ", 3)
        # Records with fewer than four fields (blank or malformed lines) are
        # still skipped.
        if len(parts) == 4:
            genre = parts[2].strip()
            plot = parts[3].strip()
            train_genres.append(genre)
            train_plots.append(plot)


In [24]:
# Load the unlabeled test data: keep the ID (field 0) and the plot (field 2)
# of every record that splits into exactly three " ::: "-separated fields.
test_ids, test_plots = [], []

with open("test_data.txt", "r", encoding="utf-8") as test_file:
    for raw_line in test_file:
        fields = raw_line.split(" ::: ")
        if len(fields) != 3:
            # Not a well-formed record; ignore it.
            continue
        test_ids.append(fields[0].strip())
        test_plots.append(fields[2].strip())


In [25]:
# Build an ID -> genre lookup from the labeled test solutions.
# Each usable record has four " ::: "-separated fields; field 0 is read as the
# movie ID and field 2 as the genre.
solution_genres = {}

with open("test_data_solution.txt", "r", encoding="utf-8") as file:
    for line in file:
        # maxsplit=3 stops splitting after the genre, so a plot that contains
        # " ::: " no longer inflates the field count and causes the record
        # (and its label) to be dropped.
        parts = line.split(" ::: ", 3)
        if len(parts) == 4:
            movie_id = parts[0].strip()
            genre = parts[2].strip()
            solution_genres[movie_id] = genre



In [26]:
# TF-IDF settings, collected in one place so they are easy to tune.
tfidf_params = {
    "stop_words": "english",   # drop scikit-learn's built-in English stop words
    "max_features": 10000,     # cap the vocabulary size
    "ngram_range": (1, 2),     # unigrams and bigrams
    "min_df": 2,               # ignore extremely rare terms
    "max_df": 0.9,             # ignore terms present in over 90% of documents
}
vectorizer = TfidfVectorizer(**tfidf_params)

# Learn the vocabulary/IDF weights from the training plots only, then reuse
# them unchanged for the test plots.
X_train_tfidf = vectorizer.fit_transform(train_plots)
X_test_tfidf = vectorizer.transform(test_plots)


In [27]:
# Ground-truth labels in the same order as test_ids / test_plots.
# The two files are parsed independently, so verify the join first: a missing
# ID is reported together with examples instead of a bare KeyError on the
# first offending key.
missing_ids = [mid for mid in test_ids if mid not in solution_genres]
if missing_ids:
    raise KeyError(
        f"{len(missing_ids)} test ID(s) have no genre in test_data_solution.txt, "
        f"e.g. {missing_ids[:5]}"
    )
true_genres = [solution_genres[mid] for mid in test_ids]


In [28]:
# Linear SVM on the TF-IDF features, with class weights balanced to offset the
# skewed genre distribution.
# random_state is pinned because LinearSVC's dual coordinate-descent solver
# shuffles the data with a pseudo-random generator; without a seed, repeated
# runs can give slightly different models and metrics.
svm_model = LinearSVC(
    C=1.0,
    class_weight='balanced',
    max_iter=5000,
    random_state=42,
)
svm_model.fit(X_train_tfidf, train_genres)

# Evaluate on the held-out test set.
svm_pred = svm_model.predict(X_test_tfidf)
print("SVM Accuracy:", accuracy_score(true_genres, svm_pred))
print(classification_report(true_genres, svm_pred))




SVM Accuracy: 0.5111992619926199
              precision    recall  f1-score   support

      action       0.30      0.43      0.36      1314
       adult       0.36      0.57      0.44       590
   adventure       0.21      0.31      0.25       775
   animation       0.16      0.24      0.20       498
   biography       0.03      0.06      0.04       264
      comedy       0.59      0.49      0.54      7446
       crime       0.13      0.23      0.16       505
 documentary       0.76      0.70      0.73     13096
       drama       0.68      0.47      0.56     13612
      family       0.15      0.27      0.19       783
     fantasy       0.10      0.16      0.13       322
   game-show       0.62      0.69      0.66       193
     history       0.07      0.10      0.08       243
      horror       0.51      0.65      0.57      2204
       music       0.42      0.63      0.51       731
     musical       0.12      0.20      0.15       276
     mystery       0.08      0.13      0.10     

In [29]:
# Class-weighted logistic regression on the same TF-IDF features.
# fit() returns the estimator itself, so construction and training are chained.
lr_model = LogisticRegression(
    C=2.0,
    max_iter=1000,
    class_weight='balanced',
).fit(X_train_tfidf, train_genres)

# Predict the test set and report overall accuracy plus per-genre metrics.
lr_pred = lr_model.predict(X_test_tfidf)
print("Logistic Regression Accuracy:", accuracy_score(true_genres, lr_pred))
print(classification_report(true_genres, lr_pred))



Logistic Regression Accuracy: 0.4997970479704797
              precision    recall  f1-score   support

      action       0.34      0.49      0.40      1314
       adult       0.36      0.63      0.46       590
   adventure       0.22      0.34      0.26       775
   animation       0.18      0.28      0.22       498
   biography       0.05      0.13      0.07       264
      comedy       0.61      0.49      0.54      7446
       crime       0.15      0.35      0.21       505
 documentary       0.81      0.61      0.70     13096
       drama       0.71      0.43      0.54     13612
      family       0.15      0.33      0.21       783
     fantasy       0.13      0.23      0.16       322
   game-show       0.63      0.72      0.67       193
     history       0.08      0.21      0.12       243
      horror       0.56      0.65      0.60      2204
       music       0.40      0.71      0.51       731
     musical       0.16      0.31      0.21       276
     mystery       0.10      0.1

In [30]:
# Multinomial Naive Bayes with default settings, trained on the TF-IDF matrix.
nb_model = MultinomialNB().fit(X_train_tfidf, train_genres)
nb_pred = nb_model.predict(X_test_tfidf)

print("Naive Bayes Accuracy:", accuracy_score(true_genres, nb_pred))
# zero_division=0 reports 0.0 (without a warning) for genres that are never
# predicted.
print(classification_report(true_genres, nb_pred, zero_division=0))


Naive Bayes Accuracy: 0.5263837638376384
              precision    recall  f1-score   support

      action       0.61      0.08      0.15      1314
       adult       0.56      0.06      0.11       590
   adventure       0.80      0.09      0.16       775
   animation       0.00      0.00      0.00       498
   biography       0.00      0.00      0.00       264
      comedy       0.52      0.44      0.48      7446
       crime       0.00      0.00      0.00       505
 documentary       0.57      0.89      0.70     13096
       drama       0.46      0.83      0.59     13612
      family       0.00      0.00      0.00       783
     fantasy       0.00      0.00      0.00       322
   game-show       0.98      0.21      0.34       193
     history       0.00      0.00      0.00       243
      horror       0.72      0.33      0.45      2204
       music       0.91      0.04      0.08       731
     musical       0.00      0.00      0.00       276
     mystery       0.00      0.00      0

In [31]:
# Side-by-side test accuracy of the three models trained above.
print("MODEL COMPARISON SUMMARY")
print("------------------------")
for label, predictions in (
    ("SVM Accuracy:", svm_pred),
    ("Logistic Regression Accuracy:", lr_pred),
    ("Naive Bayes Accuracy:", nb_pred),
):
    print(label, accuracy_score(true_genres, predictions))


MODEL COMPARISON SUMMARY
------------------------
SVM Accuracy: 0.5111992619926199
Logistic Regression Accuracy: 0.4997970479704797
Naive Bayes Accuracy: 0.5263837638376384


In [34]:
import joblib

# Persist the trained SVM and the fitted TF-IDF vectorizer to disk, one file
# per object.
for artifact, target_path in (
    (svm_model, "genre_model.pkl"),
    (vectorizer, "tfidf_vectorizer.pkl"),
):
    joblib.dump(artifact, target_path)

print("Model and Vectorizer saved successfully!")


Model and Vectorizer saved successfully!
