MOVIE GENRE CLASSIFICATION

1. LOADING DATASET AND SOLUTION PATH

In [None]:
import pandas as pd

def load_train(path):
    records = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # skip headers or blank lines
            if not line or line.startswith("ID :::"):
                continue
            parts = [p.strip() for p in line.split(" ::: ")]
            if len(parts) == 4:
                _, title, genre, descr = parts
                records.append({
                    "title": title,
                    "genre": genre,
                    "description": descr
                })
    return pd.DataFrame(records)

train_path = r"D:\Growthlink\Task 1\train_data.txt"
test_path  = r"D:\Growthlink\Task 1\test_data.txt"
solution_path = r"D:\Growthlink\Task 1\test_data_solution.txt"


df = load_train(train_path)
print("Training samples:", df.shape)
print(df.genre.value_counts())


Training samples: (54214, 3)
genre
drama          13613
documentary    13096
comedy          7447
short           5073
horror          2204
thriller        1591
action          1315
western         1032
reality-tv       884
family           784
adventure        775
music            731
romance          672
sci-fi           647
adult            590
crime            505
animation        498
sport            432
talk-show        391
fantasy          323
mystery          319
musical          277
biography        265
history          243
game-show        194
news             181
war              132
Name: count, dtype: int64


2.SPLITTING THE DATASET

In [2]:
from sklearn.model_selection import train_test_split

X = df["description"]
y = df["genre"]

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print("Train:", X_train.shape, "Validation:", X_val.shape)


Train: (43371,) Validation: (10843,)


3. DATA PREPARATION - Converting raw data into vector formats using TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=10_000,
    ngram_range=(1, 2),
    stop_words="english",
    lowercase=True
)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf   = vectorizer.transform(X_val)

print("TF–IDF shape:", X_train_tfidf.shape)


TF–IDF shape: (43371, 10000)


4. MODEL SELECTION AND DEFINING

    i)   Support Vector Machine  
    ii)  Random Forest  
    iii) XGBoost  
    iv)  Multilayer Perceptron  

**Note:** Ensembling the models using voting.

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

clf_svm = SVC(
    kernel="linear",
    probability=True,
    random_state=42
)

clf_rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)

# GPU Enabled for faster training
clf_xgb = XGBClassifier(
    tree_method="gpu_hist",        
    predictor="gpu_predictor",     
    use_label_encoder=False,
    eval_metric="mlogloss",
    random_state=42
)

clf_mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=300,
    random_state=42
)


#ensembling all the models trained above
ensemble = VotingClassifier(
    estimators=[
        ("svm", clf_svm),
        ("rf",  clf_rf),
        ("xgb", clf_xgb),
        ("mlp", clf_mlp)
    ],
    voting="soft"
)



5, TRAINING THE ENSEMBLE MODEL

In [None]:
# Train the ensemble model
ensemble.fit(X_train_tfidf, y_train)
print("Ensemble trained on", X_train_tfidf.shape[0], "samples")


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Ensemble trained on 43371 samples


6. EVALUATION OF THE MODEL

In [16]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = ensemble.predict(X_val_tfidf)
print("Classification Report:\n", classification_report(y_val, y_pred))


Classification Report:
               precision    recall  f1-score   support

      action       0.45      0.33      0.38       263
       adult       0.71      0.47      0.56       118
   adventure       0.44      0.23      0.31       155
   animation       0.35      0.20      0.25       100
   biography       0.00      0.00      0.00        53
      comedy       0.52      0.54      0.53      1490
       crime       0.29      0.08      0.12       101
 documentary       0.69      0.83      0.75      2619
       drama       0.55      0.72      0.63      2723
      family       0.38      0.12      0.18       157
     fantasy       0.20      0.03      0.05        65
   game-show       0.89      0.64      0.75        39
     history       0.43      0.06      0.11        49
      horror       0.64      0.57      0.60       441
       music       0.62      0.47      0.53       146
     musical       0.12      0.04      0.06        55
     mystery       0.20      0.03      0.05        64
   

TESTING USING DATASET AND STORING THE OUTPUT IN A CSV

In [None]:
import pandas as pd

def load_test(path):
    """
    Reads lines like:
      54199 ::: Movie Title ::: Plot description...
    Returns lists of IDs and descriptions.
    """
    ids, texts = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("ID :::"):
                continue
            parts = [p.strip() for p in line.split(" ::: ")]
            
            if len(parts) == 3:
                idx, _title, descr = parts
                ids.append(idx)
                texts.append(descr)
            
            elif len(parts) >= 4:
                idx, _title, _genre, descr = parts[:4]
                ids.append(idx)
                texts.append(descr)
            
            else:
                print("Skipping malformed line:", line)
    
    print(f"[INFO] Loaded {len(texts)} samples from test set.")
    return ids, texts

# Paths (reuse from your earlier code)
test_path = r"D:\Growthlink\Task 1\test_data.txt"

# Load & parse
test_ids, test_texts = load_test(test_path)

# Only proceed if we have test samples
if test_texts:
    # Vectorize
    test_tfidf = vectorizer.transform(test_texts)

    # Predict
    test_preds = ensemble.predict(test_tfidf)

    # Build output DataFrame
    out_df = pd.DataFrame({
        "ID": test_ids,
        "PredictedGenre": test_preds
    })

    # Save to CSV
    out_df.to_csv("movie_genre_predictions.csv", index=False)
    print("[SUCCESS] Saved predictions to movie_genre_predictions.csv")

    # (Optional) Quick peek
    print(out_df.head())
else:
    print("[ERROR] No valid data found in test set – check your file format.")


[INFO] Loaded 54200 samples from test set.
[SUCCESS] Saved predictions to movie_genre_predictions.csv
  ID PredictedGenre
0  1          short
1  2          drama
2  3    documentary
3  4          drama
4  5          drama


SAVING THE MODEL FILE

In [15]:
import joblib

# After training, call these lines to persist your objects:
joblib.dump(ensemble, 'ensemble_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
print("[INFO] Saved ensemble model to 'ensemble_model.pkl' and vectorizer to 'tfidf_vectorizer.pkl'")


[INFO] Saved ensemble model to 'ensemble_model.pkl' and vectorizer to 'tfidf_vectorizer.pkl'
