In [14]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder


In [16]:
# File paths
train_path = "C:\\Users\\sachi\\Desktop\\New folder (2)\\Genre Classification Dataset\\train_data.txt"
test_path = "C:\\Users\\sachi\\Desktop\\New folder (2)\\Genre Classification Dataset\\test_data.txt"
test_solution_path ="C:\\Users\\sachi\\Desktop\\New folder (2)\\Genre Classification Dataset\\test_data_solution.txt"

# Load data
def load_data(path, is_train=True):
    with open(path, encoding='utf-8') as f:
        lines = f.read().splitlines()
    data = []
    for line in lines:
        parts = line.split(" ::: ")
        if is_train and len(parts) == 4:
            data.append((parts[0], parts[1], parts[2], parts[3]))
        elif not is_train and len(parts) == 3:
            data.append((parts[0], parts[1], parts[2]))
    if is_train:
        return pd.DataFrame(data, columns=["ID", "Title", "Genre", "Description"])
    else:
        return pd.DataFrame(data, columns=["ID", "Title", "Description"])

# Load datasets
train_df = load_data(train_path)
test_df = load_data(test_path, is_train=False)
test_solution_df = load_data(test_solution_path)

# Merge test data with ground truth genres
test_df["Genre"] = test_solution_df["Genre"]


In [17]:
# Features and labels
X_train = train_df["Description"]
y_train = train_df["Genre"]
X_test = test_df["Description"]
y_test = test_df["Genre"]

# Classifiers
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": LinearSVC()
}

# Train and evaluate
for name, model in models.items():
    print(f"\n=== {name} ===")
    pipe = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000)),
        ('clf', model)
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))



=== Naive Bayes ===
Accuracy: 0.5238560885608856


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
               precision    recall  f1-score   support

      action       0.55      0.11      0.18      1314
       adult       0.51      0.06      0.11       590
   adventure       0.81      0.07      0.13       775
   animation       0.00      0.00      0.00       498
   biography       0.00      0.00      0.00       264
      comedy       0.51      0.42      0.46      7446
       crime       0.00      0.00      0.00       505
 documentary       0.57      0.87      0.69     13096
       drama       0.46      0.82      0.59     13612
      family       0.50      0.00      0.00       783
     fantasy       0.00      0.00      0.00       322
   game-show       0.98      0.32      0.48       193
     history       0.00      0.00      0.00       243
      horror       0.69      0.36      0.47      2204
       music       0.74      0.15      0.25       731
     musical       0.00      0.00      0.00       276
     mystery       0.00      0.00      0.00       318
   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
               precision    recall  f1-score   support

      action       0.48      0.29      0.36      1314
       adult       0.60      0.24      0.34       590
   adventure       0.59      0.17      0.26       775
   animation       0.53      0.07      0.12       498
   biography       0.00      0.00      0.00       264
      comedy       0.53      0.58      0.55      7446
       crime       0.36      0.04      0.07       505
 documentary       0.67      0.85      0.75     13096
       drama       0.54      0.77      0.64     13612
      family       0.50      0.09      0.15       783
     fantasy       0.56      0.06      0.10       322
   game-show       0.91      0.51      0.65       193
     history       0.00      0.00      0.00       243
      horror       0.64      0.57      0.61      2204
       music       0.67      0.45      0.54       731
     musical       0.33      0.02      0.04       276
     mystery       0.36      0.02      0.03       318
   



Accuracy: 0.5712361623616237
Classification Report:
               precision    recall  f1-score   support

      action       0.38      0.31      0.34      1314
       adult       0.51      0.36      0.43       590
   adventure       0.38      0.20      0.26       775
   animation       0.30      0.14      0.19       498
   biography       0.00      0.00      0.00       264
      comedy       0.53      0.55      0.54      7446
       crime       0.18      0.06      0.09       505
 documentary       0.69      0.82      0.75     13096
       drama       0.56      0.71      0.63     13612
      family       0.31      0.13      0.18       783
     fantasy       0.27      0.11      0.15       322
   game-show       0.74      0.60      0.66       193
     history       0.17      0.02      0.04       243
      horror       0.58      0.60      0.59      2204
       music       0.58      0.49      0.53       731
     musical       0.28      0.08      0.12       276
     mystery       0.19     

In [19]:
import gensim.downloader as api
print("Loading GloVe vectors...")
w2v = api.load("glove-wiki-gigaword-100")  # 100-dimensional vectors


Loading GloVe vectors...


In [20]:
# Preprocessing
def preprocess(text):
    text = re.sub(r"[^a-zA-Z]", " ", text.lower())
    return [word for word in text.split() if word in w2v]

# Average Word2Vec
def get_vector(text):
    words = preprocess(text)
    if not words:
        return np.zeros(w2v.vector_size)
    return np.mean([w2v[word] for word in words], axis=0)

# Encode genres
label_encoder = LabelEncoder()
y_train_vec = label_encoder.fit_transform(train_df["Genre"])
y_test_vec = label_encoder.transform(test_df["Genre"])

# Vectorize descriptions
X_train_vec = np.vstack([get_vector(desc) for desc in tqdm(train_df["Description"])])
X_test_vec = np.vstack([get_vector(desc) for desc in tqdm(test_df["Description"])])


100%|██████████| 54214/54214 [00:09<00:00, 5698.93it/s]
100%|██████████| 54200/54200 [00:09<00:00, 5931.63it/s]


In [21]:
# Logistic Regression on Word2Vec
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_train_vec)
y_pred_vec = clf.predict(X_test_vec)

# Results
print("Accuracy:", accuracy_score(y_test_vec, y_pred_vec))
print("Classification Report:\n", classification_report(y_test_vec, y_pred_vec, target_names=label_encoder.classes_))


Accuracy: 0.5361623616236162
Classification Report:
               precision    recall  f1-score   support

      action       0.40      0.27      0.32      1314
       adult       0.44      0.21      0.28       590
   adventure       0.31      0.08      0.13       775
   animation       0.30      0.07      0.11       498
   biography       0.00      0.00      0.00       264
      comedy       0.47      0.49      0.48      7446
       crime       0.26      0.05      0.08       505
 documentary       0.64      0.80      0.71     13096
       drama       0.51      0.74      0.60     13612
      family       0.35      0.08      0.12       783
     fantasy       0.39      0.06      0.10       322
   game-show       0.76      0.50      0.60       193
     history       0.33      0.02      0.03       243
      horror       0.50      0.46      0.48      2204
       music       0.62      0.51      0.56       731
     musical       0.13      0.01      0.02       276
     mystery       0.14     

In [31]:
def predict_genre(text, model, encoder):
    vec = get_vector(text).reshape(1, -1)
    pred = model.predict(vec)[0]
    return encoder.inverse_transform([pred])[0]

# Try it
new_plot = "Two strangers meet by chance in Paris and fall in love, but fate keeps pulling them apart."
print("Predicted Genre:", predict_genre(new_plot, clf, label_encoder))


Predicted Genre: drama
