## ==============================================
## Movie Genre Classification Using TF-IDF + SVM
## ==============================================

## -----------------------
## 1. Import Libraries
## -----------------------

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample
import numpy as np


## -----------------------
## 2. Load Dataset
## -----------------------
## Training data has: ID, TITLE, GENRE, DESCRIPTION

In [4]:
# Fields are separated by the multi-character token " ::: ". pandas treats
# separators longer than one character as regular expressions, which is why
# the Python parsing engine is requested explicitly.
train_df = pd.read_csv(
    "train_data.txt",
    sep=" ::: ",
    engine="python",
    names=["ID", "TITLE", "GENRE", "DESCRIPTION"],
)

# Test data has: ID, TITLE, DESCRIPTION (no GENRE column)
test_df = pd.read_csv(
    "test_data.txt",
    sep=" ::: ",
    engine="python",
    names=["ID", "TITLE", "DESCRIPTION"],
)

## -----------------------
## 3. Handle Imbalanced Classes (Optional but recommended)
## -----------------------
## Find the maximum class size

In [6]:
# Row count of the most frequent genre; every other genre is grown to match it.
max_size = train_df['GENRE'].value_counts().max()

# Genres already at max_size are kept as-is; smaller ones are resampled with
# replacement (fixed seed) until they contain max_size rows.
oversampled_list = [
    group
    if len(group) >= max_size
    else resample(group, replace=True, n_samples=max_size, random_state=42)
    for _, group in train_df.groupby('GENRE')
]

# Stack the per-genre frames, shuffle the rows with a fixed seed, and rebuild
# a clean 0..n-1 index.
train_df_balanced = (
    pd.concat(oversampled_list)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

## -----------------------
## 4. Combine Text Features
## -----------------------
## Combine TITLE + DESCRIPTION for richer textual information

In [8]:
# Build one document per movie as "<TITLE> <DESCRIPTION>".
# Missing fields are replaced with "" first: concatenating a string with NaN
# yields NaN, and a NaN document makes the TF-IDF vectorizer fail instead of
# simply contributing no tokens.
train_texts = (
    train_df_balanced["TITLE"].fillna("")
    + " "
    + train_df_balanced["DESCRIPTION"].fillna("")
).values
# Labels stay aligned with train_texts because both come from the same frame.
y_train = train_df_balanced["GENRE"]
test_texts = (
    test_df["TITLE"].fillna("")
    + " "
    + test_df["DESCRIPTION"].fillna("")
).values

## -----------------------
## 5. Vectorization: TF-IDF
## -----------------------
## Convert text into numerical features

In [10]:
# Word 1- to 3-grams with English stop words removed, limited to 30000
# features via max_features.
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 3),
    max_features=30000,
)
# The vocabulary and IDF weights are fitted on the training texts only; the
# test texts are transformed with that already-fitted vectorizer.
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)


## -----------------------
## 6. Compute Class Weights
## -----------------------
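## Intended to give rare genres more importance during training.
## NOTE: step 3 already oversamples every genre to the same size, so the
## "balanced" weights computed here all equal 1.0 and act only as a safeguard.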

In [12]:
# Distinct genre labels present in the training targets (sorted by np.unique).
classes = np.unique(y_train)
# One "balanced" weight per entry of `classes`, in the same order.
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=y_train,
)
# label -> weight mapping in the form accepted by the classifier's
# class_weight argument.
class_weight_dict = dict(
    zip(classes, class_weights)
)

## -----------------------
## 7. Train Linear SVM Classifier
## -----------------------

In [None]:
# Linear SVM over the TF-IDF features, using the per-genre weights computed
# above and a raised iteration cap (max_iter=5000).
# random_state is pinned to the same seed (42) used for resampling and
# shuffling earlier in this script, so repeated runs train the same model
# when the solver shuffles the data (dual coordinate descent).
svm = LinearSVC(
    class_weight=class_weight_dict,
    max_iter=5000,
    random_state=42,
)
svm.fit(X_train, y_train)



## -----------------------
## 8. Predict on Test Data
## -----------------------

In [None]:
# Predicted genre label for each row of X_test, in X_test's row order.
preds = svm.predict(X_test)


# -----------------------
# 9. Save Predictions
# -----------------------

In [None]:
# Write one "ID ::: TITLE ::: PREDICTED_GENRE" line per test movie.
output_file = "predictions.txt"
with open(output_file, "w", encoding="utf-8") as f:
    # Pair each row with its prediction by position rather than by DataFrame
    # index label (preds is a positional array), and iterate the two columns
    # directly instead of using iterrows(), which builds a Series per row.
    f.writelines(
        f"{movie_id} ::: {title} ::: {predicted_genre}\n"
        for movie_id, title, predicted_genre in zip(
            test_df["ID"], test_df["TITLE"], preds
        )
    )

print(f"Predictions saved to {output_file}")

In [None]:
# -----------------------
# 10. Evaluate Model (if ground truth available)
# -----------------------
# Ground-truth file uses the same " ::: " layout as the training data.
test_solution = pd.read_csv(
    "test_data_solution.txt",
    sep=" ::: ",
    engine="python",
    names=["ID", "TITLE", "GENRE", "DESCRIPTION"],
)
y_true = test_solution["GENRE"]
# NOTE(review): the comparison is positional, so this assumes the solution
# file lists movies in the same order as test_data.txt — confirm.
print("Accuracy:", accuracy_score(y_true, preds))
# Per-genre precision / recall / F1 breakdown.
print(classification_report(y_true, preds))