In [7]:
# Full working code for genre classification
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

# --- Step 1: Load dataset safely ---
train_path = r"C:\Users\Niteen\Downloads\archive\Genre Classification Dataset\train_data.txt"

# Read the file as raw text lines instead of going through a CSV parser:
# the fields are separated by the multi-character token " ::: ".
with open(train_path, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Split each non-blank line into at most 4 fields (id, title, genre, plot).
# maxsplit=3 keeps a plot that itself contains " ::: " in one piece; without
# it such a row would have 5+ fields and the DataFrame constructor below
# would raise because only 4 column names are supplied.
# Blank lines are skipped so they never turn into junk rows.
data = [line.strip().split(" ::: ", 3) for line in lines if line.strip()]

# Convert to DataFrame with one column per field
train_df = pd.DataFrame(data, columns=['id', 'title', 'genre', 'plot'])

# --- Step 2: Basic preprocessing ---
# Discard any row that lacks a genre label or a plot description
train_df = train_df.dropna(subset=['genre', 'plot'])

# Restrict the data to the 10 most frequent genres so that very rare
# classes do not cause trouble later on
genre_counts = train_df['genre'].value_counts()
top_genres = genre_counts.nlargest(10).index
in_top_genres = train_df['genre'].isin(top_genres)
train_df = train_df[in_top_genres]

# --- Step 3: Split into train and validation sets ---
# Features are the plot texts, targets are the genre labels
X, y = train_df['plot'], train_df['genre']

# Hold out 20% for validation; stratify on the labels and fix the seed
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# --- Step 4: Vectorize text ---
# TF-IDF features capped at 5000 terms, with English stop words removed
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)

# Fit on the training plots only, then apply the fitted vectorizer
# to the validation plots
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

# --- Step 5: Train classifier ---
# One-vs-rest logistic regression: one binary classifier per genre.
# The `multi_class='ovr'` argument of LogisticRegression is deprecated since
# scikit-learn 1.5; wrapping the estimator in OneVsRestClassifier is the
# documented replacement for the same one-vs-rest scheme.
clf = OneVsRestClassifier(LogisticRegression(max_iter=1000))
clf.fit(X_train_vec, y_train)

# --- Step 6: Evaluate model ---
# Predict genres for the validation set
y_pred = clf.predict(X_val_vec)

# Overall accuracy, printed to four decimal places
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}\n")

# Per-class precision / recall / F1; zero_division=0 is passed so that
# undefined metrics are reported as 0
print("Classification Report:")
report_text = classification_report(y_val, y_pred, zero_division=0)
print(report_text)




Validation Accuracy: 0.6484

Classification Report:
              precision    recall  f1-score   support

      action       0.64      0.26      0.37       263
      comedy       0.59      0.58      0.59      1489
 documentary       0.71      0.88      0.79      2619
       drama       0.60      0.77      0.68      2723
      family       0.92      0.08      0.14       157
      horror       0.75      0.58      0.65       441
  reality-tv       0.55      0.10      0.16       177
       short       0.56      0.29      0.38      1015
    thriller       0.63      0.11      0.19       318
     western       0.96      0.71      0.82       206

    accuracy                           0.65      9408
   macro avg       0.69      0.44      0.48      9408
weighted avg       0.65      0.65      0.62      9408

