<a href="https://colab.research.google.com/github/raghunathmonda1234/CODSOFT/blob/main/Predict_Genre_of_a_movie.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from itertools import islice  # stdlib; lets the preview stop after the first few lines

print("Examining the data files...")


def read_head(path, n_lines=5):
    """Return the first `n_lines` lines of a UTF-8 text file as a list.

    Only the requested lines are read, so previewing a large data file does
    not load the whole file into memory. Line endings are kept, exactly as
    `readlines()` would return them.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return list(islice(f, n_lines))


# The description file is small, so it is read and printed in full.
with open('description.txt', 'r', encoding='utf-8') as f:
    description_content = f.read()
print("Description file content:")
print(description_content)
print("\n" + "="*50 + "\n")


train_lines = read_head('train_data.txt')
print("First 5 lines of train_data.txt:")
for i, line in enumerate(train_lines):
    print(f"{i+1}: {line.strip()}")
print("\n" + "="*50 + "\n")


test_lines = read_head('test_data.txt')
print("First 5 lines of test_data.txt:")
for i, line in enumerate(test_lines):
    print(f"{i+1}: {line.strip()}")

Examining the data files...
Description file content:
Train data:
ID ::: TITLE ::: GENRE ::: DESCRIPTION
ID ::: TITLE ::: GENRE ::: DESCRIPTION
ID ::: TITLE ::: GENRE ::: DESCRIPTION
ID ::: TITLE ::: GENRE ::: DESCRIPTION

Test data:
ID ::: TITLE ::: DESCRIPTION
ID ::: TITLE ::: DESCRIPTION
ID ::: TITLE ::: DESCRIPTION
ID ::: TITLE ::: DESCRIPTION

Source:
ftp://ftp.fu-berlin.de/pub/misc/movies/database/


First 5 lines of train_data.txt:
1: 1 ::: Oscar et la dame rose (2009) ::: drama ::: Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone except straight-talking Rose, the lady in pink he meets on the hospital stairs. As Christmas approaches, Rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow Oscar to live life and love to the full, in the company of his friends Pop Corn, Einstein, Ba

In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import warnings
# NOTE(review): with no category or module filter, this suppresses every warning
# raised through the `warnings` module for the rest of the session.
warnings.filterwarnings('ignore')

def parse_data_file(filename):
    """Parse a ' ::: '-delimited data file into a list of record dicts.

    Each non-blank line is expected to look like either
    ``ID ::: TITLE ::: GENRE ::: DESCRIPTION`` (labelled) or
    ``ID ::: TITLE ::: DESCRIPTION`` (unlabelled).

    A line is split at most three times, so a labelled line whose description
    itself contains ' ::: ' is kept, with the delimiter left inside the
    description, instead of being silently dropped. Lines with fewer than
    three fields are skipped.

    Args:
        filename: path of the UTF-8 text file to read.

    Returns:
        A list of dicts with keys 'id' (int), 'title' and 'description', plus
        'genre' for four-field lines, in file order.

    Raises:
        ValueError: if the ID field of a parsed line is not an integer.
    """
    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            # maxsplit=3: everything after the third delimiter is description text.
            parts = line.split(' ::: ', 3)
            if len(parts) == 4:
                record = {
                    'id': int(parts[0]),
                    'title': parts[1],
                    'genre': parts[2],
                    'description': parts[3],
                }
            elif len(parts) == 3:
                record = {
                    'id': int(parts[0]),
                    'title': parts[1],
                    'description': parts[2],
                }
            else:
                # Too few fields to be a record; skipped as in the original parser.
                continue
            data.append(record)
    return data


# Read the three raw files into lists of records, then wrap each in a DataFrame.
print("Loading data...")
train_data = parse_data_file('train_data.txt')
test_data = parse_data_file('test_data.txt')
test_solution = parse_data_file('test_data_solution.txt')

train_df, test_df, test_solution_df = (
    pd.DataFrame(records) for records in (train_data, test_data, test_solution)
)

# Report the size of every frame before looking at the contents.
print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"Test solution shape: {test_solution_df.shape}")

print("\nTraining data info:")
print(train_df.head())

# Per-genre sample counts, most frequent genre first.
print("\nGenre distribution:")
genre_counts = train_df['genre'].value_counts()
print(genre_counts)

Loading data...
Training data shape: (54214, 4)
Test data shape: (54200, 3)
Test solution shape: (54200, 4)

Training data info:
   id                             title     genre  \
0   1      Oscar et la dame rose (2009)     drama   
1   2                      Cupid (1997)  thriller   
2   3  Young, Wild and Wonderful (1980)     adult   
3   4             The Secret Sin (1915)     drama   
4   5            The Unrecovered (2007)     drama   

                                         description  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  

Genre distribution:
genre
drama          13613
documentary    13096
comedy          7447
short           5073
horror          2204
thriller        1591
action          1315
western         1032
reality-tv       884
family

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


# Download each NLTK resource only when it cannot be found locally.
for resource_path, package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

def preprocess_text(text):
    """Normalize raw text into lowercase, space-separated ASCII words.

    Steps:
      1. Missing values (None / NaN) become the empty string.
      2. Text is lowercased and accents are folded ("Amélie" -> "amelie").
      3. Apostrophes are removed so possessives and contractions stay one
         token ("film's" -> "films").
      4. Every other non-letter is replaced by a space, so words joined by
         punctuation or digits stay separate ("10-year-old" -> "year old")
         instead of being fused into one token.
      5. Runs of whitespace are collapsed to single spaces.

    Args:
        text: a string, or a missing value.

    Returns:
        The cleaned string; "" for missing input.
    """
    import unicodedata  # stdlib; imported here so the function is self-contained

    if pd.isna(text):
        return ""

    # Decompose accented characters and drop the combining marks, keeping the base letter.
    decomposed = unicodedata.normalize('NFKD', text.lower())
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    # Apostrophes (straight and typographic) are deleted rather than spaced out.
    text = re.sub(r"['\u2019]", '', text)

    # Anything that is still not a letter or whitespace separates words.
    text = re.sub(r'[^a-z\s]', ' ', text)

    return ' '.join(text.split())

print("Preprocessing text data...")

# Apply the same cleaning to both frames: first the description on its own,
# then the cleaned title and cleaned description joined by a single space.
for frame in (train_df, test_df):
    frame['processed_description'] = frame['description'].apply(preprocess_text)
    cleaned_title = frame['title'].apply(preprocess_text)
    frame['combined_text'] = cleaned_title + ' ' + frame['processed_description']

print("Sample processed text:")
sample_columns = ['title', 'genre', 'combined_text']
print(train_df[sample_columns].head(3))

# Character length of the combined text, for a quick sanity check.
train_df['text_length'] = train_df['combined_text'].str.len()
print(f"\nText length statistics:")
print(train_df['text_length'].describe())

Preprocessing text data...
Sample processed text:
                              title     genre  \
0      Oscar et la dame rose (2009)     drama   
1                      Cupid (1997)  thriller   
2  Young, Wild and Wonderful (1980)     adult   

                                       combined_text  
0  oscar et la dame rose listening in to a conver...  
1  cupid a brother and sister with a past incestu...  
2  young wild and wonderful as the bus empties th...  

Text length statistics:
count    54214.000000
mean       599.954569
std        432.453142
min         55.000000
25%        333.000000
50%        467.000000
75%        708.750000
max      10098.000000
Name: text_length, dtype: float64


In [None]:
print("Creating TF-IDF features...")

# Inputs are the cleaned title+description text; targets are the raw genre labels.
X_train_full, y_train_full = train_df['combined_text'], train_df['genre']

# Hold out 20% for validation, stratified on the genre label.
split_options = dict(test_size=0.2, random_state=42, stratify=y_train_full)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, **split_options
)

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")

# Both vectorizers are configured identically.
vectorizer_options = dict(
    max_features=5000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8,
)

# Fit on the training split only; the validation split reuses that vocabulary.
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

print(f"TF-IDF feature matrix shape: {X_train_tfidf.shape}")

count_vectorizer = CountVectorizer(**vectorizer_options)
X_train_count = count_vectorizer.fit_transform(X_train)
X_val_count = count_vectorizer.transform(X_val)

print(f"Count Vectorizer feature matrix shape: {X_train_count.shape}")

unique_genres = sorted(y_train_full.unique())
print(f"\nUnique genres ({len(unique_genres)}): {unique_genres}")

Creating TF-IDF features...
Training set size: 43371
Validation set size: 10843
TF-IDF feature matrix shape: (43371, 5000)
Count Vectorizer feature matrix shape: (43371, 5000)

Unique genres (27): ['action', 'adult', 'adventure', 'animation', 'biography', 'comedy', 'crime', 'documentary', 'drama', 'family', 'fantasy', 'game-show', 'history', 'horror', 'music', 'musical', 'mystery', 'news', 'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show', 'thriller', 'war', 'western']


In [None]:
# Recount the training samples per genre. The module-level `genre_counts` is the
# table group_rare_genres consults when deciding which labels are rare.
print("Analyzing genre distribution...")
genre_counts = train_df['genre'].value_counts()
print(genre_counts)

def group_rare_genres(genre, threshold=3, counts=None):
    """Map a genre to 'other' when it has fewer than `threshold` samples.

    Args:
        genre: the label to look up.
        threshold: minimum sample count for a genre to keep its own label.
        counts: any mapping offering ``.get(label, default)`` (a dict or a
            pandas Series of counts). When omitted, the notebook-level
            `genre_counts` is used, looked up at call time.

    Returns:
        'other' if the genre's count (0 when the genre is unknown to
        `counts`) is below `threshold`, otherwise `genre` unchanged.
    """
    if counts is None:
        counts = genre_counts
    if counts.get(genre, 0) < threshold:
        return 'other'
    return genre

# Attach the grouped label to every training row.
train_df['genre_grouped'] = train_df['genre'].apply(group_rare_genres)
y_train_grouped = train_df['genre_grouped']

print("\nGenre distribution after grouping:")
grouped_counts = y_train_grouped.value_counts()
print(grouped_counts)

# From here on the grouped labels are the prediction target.
X_train_full, y_train_full = train_df['combined_text'], train_df['genre_grouped']

# 80/20 split with a fixed seed (no stratification in this cell).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    random_state=42,
)

print(f"\nTraining set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")

# Rebuild the TF-IDF features on the new split.
grouped_tfidf_options = dict(
    max_features=3000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=1,
    max_df=0.95,
)
tfidf_vectorizer = TfidfVectorizer(**grouped_tfidf_options)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

print(f"TF-IDF feature matrix shape: {X_train_tfidf.shape}")
print(f"Number of unique grouped genres: {len(y_train_full.unique())}")

Analyzing genre distribution...
genre
drama          13613
documentary    13096
comedy          7447
short           5073
horror          2204
thriller        1591
action          1315
western         1032
reality-tv       884
family           784
adventure        775
music            731
romance          672
sci-fi           647
adult            590
crime            505
animation        498
sport            432
talk-show        391
fantasy          323
mystery          319
musical          277
biography        265
history          243
game-show        194
news             181
war              132
Name: count, dtype: int64

Genre distribution after grouping:
genre_grouped
drama          13613
documentary    13096
comedy          7447
short           5073
horror          2204
thriller        1591
action          1315
western         1032
reality-tv       884
family           784
adventure        775
music            731
romance          672
sci-fi           647
adult            590
crim

In [None]:
from sklearn.metrics import classification_report, accuracy_score

print("Training and evaluating multiple classifiers...")

# Candidate models; each one is fitted on the same TF-IDF training matrix.
classifiers = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
}

# Validation accuracy per model name.
results = {}

for name, classifier in classifiers.items():
    print(f"\n{'='*50}")
    print(f"Training {name}...")

    # Fit on the training split, then score on the held-out validation split.
    classifier.fit(X_train_tfidf, y_train)
    y_pred = classifier.predict(X_val_tfidf)
    accuracy = results[name] = accuracy_score(y_val, y_pred)

    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"\nClassification Report:")
    print(classification_report(y_val, y_pred))

print(f"\n{'='*50}")
print("CLASSIFIER PERFORMANCE SUMMARY")
print(f"{'='*50}")

# Highest accuracy first.
ranked_results = sorted(results.items(), key=lambda entry: entry[1], reverse=True)
for name, accuracy in ranked_results:
    print(f"{name:20}: {accuracy:.4f}")

best_classifier_name = max(results, key=results.get)
best_classifier = classifiers[best_classifier_name]
print(f"\nBest performing classifier: {best_classifier_name} ({results[best_classifier_name]:.4f})")

Training and evaluating multiple classifiers...

Training Logistic Regression...
Validation Accuracy: 0.5737

Classification Report:
              precision    recall  f1-score   support

      action       0.46      0.22      0.30       263
       adult       0.67      0.23      0.34       112
   adventure       0.43      0.14      0.22       139
   animation       0.67      0.12      0.20       104
   biography       0.00      0.00      0.00        61
      comedy       0.50      0.57      0.53      1443
       crime       0.20      0.02      0.03       107
 documentary       0.67      0.84      0.74      2659
       drama       0.54      0.76      0.63      2697
      family       0.35      0.09      0.14       150
     fantasy       0.00      0.00      0.00        74
   game-show       0.95      0.47      0.63        40
     history       0.00      0.00      0.00        45
      horror       0.62      0.57      0.60       431
       music       0.63      0.53      0.58       144
  

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline

print("Improving model with full training data and cross-validation...")

# Cross-validation runs on the complete training frame, not the earlier 80/20 split.
X_full, y_full = train_df['combined_text'], train_df['genre_grouped']

def create_pipeline(classifier, vectorizer_type='tfidf'):
    """Build a two-step Pipeline: a text vectorizer followed by `classifier`.

    `vectorizer_type='tfidf'` selects a TfidfVectorizer over 1- to 3-grams
    with sublinear term-frequency scaling; any other value selects a
    CountVectorizer over 1- and 2-grams. Both are limited to 2000 features,
    use the English stop-word list, and set min_df=1 and max_df=0.95.

    The pipeline steps are named 'vectorizer' and 'classifier'.
    """
    # Settings common to both vectorizer flavours.
    common = dict(max_features=2000, stop_words='english', min_df=1, max_df=0.95)

    if vectorizer_type != 'tfidf':
        vectorizer = CountVectorizer(ngram_range=(1, 2), **common)
    else:
        vectorizer = TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True, **common)

    steps = [('vectorizer', vectorizer), ('classifier', classifier)]
    return Pipeline(steps)

# (display name, unfitted classifier, vectorizer flavour) for every setup to compare.
configurations = [
    ('Logistic Regression + TF-IDF', LogisticRegression(random_state=42, max_iter=1000, C=10), 'tfidf'),
    ('Naive Bayes + TF-IDF', MultinomialNB(alpha=0.1), 'tfidf'),
    ('Random Forest + TF-IDF', RandomForestClassifier(n_estimators=200, random_state=42, max_depth=10), 'tfidf'),
    ('Logistic Regression + Count', LogisticRegression(random_state=42, max_iter=1000), 'count'),
]

cv_folds = 5
cv_results = {}

for name, classifier, vectorizer_type in configurations:
    print(f"\nEvaluating {name}...")

    # The vectorizer sits inside the pipeline, so it is refitted within every fold.
    pipeline = create_pipeline(classifier, vectorizer_type)
    cv_scores = cross_val_score(pipeline, X_full, y_full, cv=cv_folds, scoring='accuracy')

    cv_results[name] = dict(
        mean=cv_scores.mean(),
        std=cv_scores.std(),
        scores=cv_scores,
    )

    print(f"CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print(f"Individual fold scores: {cv_scores}")

print(f"\n{'='*60}")
print("CROSS-VALIDATION RESULTS SUMMARY")
print(f"{'='*60}")

# Rank configurations by mean CV accuracy, best first.
sorted_results = sorted(
    cv_results.items(),
    key=lambda entry: entry[1]['mean'],
    reverse=True,
)

for name, result in sorted_results:
    print(f"{name:30}: {result['mean']:.4f} (+/- {result['std']*2:.4f})")

best_config_name = sorted_results[0][0]
print(f"\nBest configuration: {best_config_name}")

# Rebuild a fresh pipeline for the winning configuration.
for name, classifier, vectorizer_type in configurations:
    if name != best_config_name:
        continue
    final_pipeline = create_pipeline(classifier, vectorizer_type)
    break

print("Training final model on full training data...")
final_pipeline.fit(X_full, y_full)

Improving model with full training data and cross-validation...

Evaluating Logistic Regression + TF-IDF...
CV Accuracy: 0.5574 (+/- 0.0089)
Individual fold scores: [0.55796366 0.55731809 0.55962372 0.54929448 0.56271906]

Evaluating Naive Bayes + TF-IDF...
CV Accuracy: 0.5234 (+/- 0.0061)
Individual fold scores: [0.52485474 0.52393249 0.52808263 0.52005902 0.52001476]

Evaluating Random Forest + TF-IDF...
CV Accuracy: 0.4342 (+/- 0.0048)
Individual fold scores: [0.43521166 0.43521166 0.43558056 0.43576501 0.42944106]

Evaluating Logistic Regression + Count...
CV Accuracy: 0.5346 (+/- 0.0037)
Individual fold scores: [0.53444619 0.53629069 0.53619847 0.5312183  0.53504888]

CROSS-VALIDATION RESULTS SUMMARY
Logistic Regression + TF-IDF  : 0.5574 (+/- 0.0089)
Logistic Regression + Count   : 0.5346 (+/- 0.0037)
Naive Bayes + TF-IDF          : 0.5234 (+/- 0.0061)
Random Forest + TF-IDF        : 0.4342 (+/- 0.0048)

Best configuration: Logistic Regression + TF-IDF
Training final model on ful

In [None]:
print("Cross-validation results:")
for name, result in cv_results.items():
    print(f"{name}: {result['mean']:.4f} (+/- {result['std']*2:.4f})")

best_config_name = max(cv_results.keys(), key=lambda k: cv_results[k]['mean'])
print(f"\nBest performing configuration: {best_config_name}")
print(f"Best CV score: {cv_results[best_config_name]['mean']:.4f}")

# Predictions are scored against test_solution_df purely by row position, so
# both frames must list the same IDs in the same order. Check this before the
# expensive fit, because a mismatch would otherwise yield a silently wrong score.
if not np.array_equal(test_df['id'].to_numpy(), test_solution_df['id'].to_numpy()):
    raise ValueError(
        "test_df and test_solution_df do not contain the same IDs in the same "
        "order; predictions cannot be scored row by row."
    )

# Rebuild a fresh pipeline for the best configuration.
for name, classifier, vectorizer_type in configurations:
    if name == best_config_name:
        final_pipeline = create_pipeline(classifier, vectorizer_type)
        break

print("Training final model on full training data...")
final_pipeline.fit(X_full, y_full)

print("\nMaking predictions on test data...")
X_test = test_df['combined_text']
test_predictions = final_pipeline.predict(X_test)

test_df['predicted_genre'] = test_predictions

# Apply the same rare-genre grouping to the true labels. group_rare_genres uses
# the training-set counts, so train and test share one label space.
test_solution_df['genre_grouped'] = test_solution_df['genre'].apply(group_rare_genres)
y_test_true = test_solution_df['genre_grouped']

test_accuracy = accuracy_score(y_test_true, test_predictions)
print(f"Test Accuracy: {test_accuracy:.4f}")

print("\nTest Set Classification Report:")
print(classification_report(y_test_true, test_predictions))

print("\nSample predictions:")
sample_results = test_df[['id', 'title', 'predicted_genre']].head(10)
for idx, row in sample_results.iterrows():
    print(f"ID {row['id']}: {row['title'][:50]}... -> {row['predicted_genre']}")

Cross-validation results:
Logistic Regression + TF-IDF: 0.5574 (+/- 0.0089)
Naive Bayes + TF-IDF: 0.5234 (+/- 0.0061)
Random Forest + TF-IDF: 0.4342 (+/- 0.0048)
Logistic Regression + Count: 0.5346 (+/- 0.0037)

Best performing configuration: Logistic Regression + TF-IDF
Best CV score: 0.5574
Training final model on full training data...

Making predictions on test data...
Test Accuracy: 0.5603

Test Set Classification Report:
              precision    recall  f1-score   support

      action       0.35      0.30      0.32      1314
       adult       0.48      0.35      0.40       590
   adventure       0.32      0.19      0.23       775
   animation       0.23      0.13      0.17       498
   biography       0.11      0.02      0.03       264
      comedy       0.52      0.53      0.53      7446
       crime       0.19      0.10      0.13       505
 documentary       0.69      0.80      0.74     13096
       drama       0.56      0.70      0.62     13612
      family       0.28     

In [None]:
import plotly.graph_objects as go
import plotly.io as pio

# Plot the cross-validation results computed earlier in the notebook instead of
# hand-typed numbers. Reading from `cv_results` keeps labels and bar heights in
# step with each other, and leaves `configurations` and `cv_scores` from the
# cross-validation cell untouched so that cell's data stays usable.
config_names = list(cv_results)
cv_means = [cv_results[config_name]['mean'] for config_name in config_names]
cv_std = [cv_results[config_name]['std'] for config_name in config_names]

# Shorten the classifier names so the axis labels stay readable.
abbreviations = {
    "Logistic Regression": "LR",
    "Naive Bayes": "NB",
    "Random Forest": "RF",
}
abbreviated_configs = []
for config_name in config_names:
    short_name = config_name
    for full_name, short_form in abbreviations.items():
        short_name = short_name.replace(full_name, short_form)
    abbreviated_configs.append(short_name)

fig = go.Figure()

# Error bars show one standard deviation of the fold scores.
fig.add_trace(go.Bar(
    x=abbreviated_configs,
    y=cv_means,
    error_y=dict(
        type='data',
        array=cv_std,
        visible=True
    ),
    marker_color='#1FB8CD',
    cliponaxis=False
))

fig.update_layout(
    title="Cross-Validation Performance Comparison",
    xaxis_title="Classifier",
    yaxis_title="CV Accuracy"
)



In [None]:
from sklearn.metrics import confusion_matrix
import pandas as pd

# Fix one label order up front and hand it to confusion_matrix. Its rows and
# columns then always match the DataFrame labels, even if the model predicts a
# genre that never occurs in the true labels, or the reverse.
labels = sorted(set(y_test_true) | set(test_predictions))

cm = confusion_matrix(y_test_true, test_predictions, labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)

print("Confusion Matrix:")
print(cm_df)

cm_df.to_csv('confusion_matrix.csv')

vectorizer = final_pipeline.named_steps['vectorizer']
classifier = final_pipeline.named_steps['classifier']

feature_names = vectorizer.get_feature_names_out()
classes = classifier.classes_
# The final classifier is chosen dynamically, and not every model exposes `coef_`.
coef = getattr(classifier, 'coef_', None)

# genre -> list of (feature, coefficient) pairs, strongest first. Stays empty
# when the classifier has no one-row-per-class coefficient matrix.
feature_importance = {}
if coef is not None and coef.shape[0] == len(classes):
    print("\nTop features for each genre (based on Logistic Regression coefficients):")
    print("="*70)

    for i, genre in enumerate(classes):
        # Indices of the 10 largest coefficients for this class, largest first.
        top_features_idx = coef[i].argsort()[-10:][::-1]
        top_features = [(feature_names[idx], coef[i][idx]) for idx in top_features_idx]
        feature_importance[genre] = top_features

        print(f"\n{genre.upper()}:")
        for feature, importance in top_features:
            print(f"  {feature:20}: {importance:8.4f}")
else:
    print("\nSkipping top-feature listing: the selected classifier has no per-class coefficient matrix.")

# Row-by-row comparison of predictions with the ground truth.
comparison_df = pd.DataFrame({
    'id': test_df['id'],
    'title': test_df['title'],
    'actual_genre': test_solution_df['genre'],
    'actual_grouped': y_test_true,
    'predicted_genre': test_predictions,
    'correct': y_test_true == test_predictions
})

comparison_df.to_csv('prediction_results.csv', index=False)

# Share of correctly predicted rows among the rows whose true label is `genre`.
print(f"\nAccuracy by genre:")
print("="*30)
for genre in sorted(y_test_true.unique()):
    genre_mask = y_test_true == genre
    genre_accuracy = (y_test_true[genre_mask] == test_predictions[genre_mask]).mean()
    genre_count = genre_mask.sum()
    print(f"{genre:15}: {genre_accuracy:.3f} ({genre_count} samples)")

print(f"\nOverall test accuracy: {test_accuracy:.3f}")

# Long-format copy of the confusion matrix (one row per actual/predicted pair).
confusion_data = []
for i, actual_genre in enumerate(labels):
    for j, predicted_genre in enumerate(labels):
        confusion_data.append({
            'actual': actual_genre,
            'predicted': predicted_genre,
            'count': int(cm[i][j])
        })

confusion_df_viz = pd.DataFrame(confusion_data)
confusion_df_viz.to_csv('confusion_matrix_data.csv', index=False)
print("\nConfusion matrix data saved for visualization")

Confusion Matrix:
             action  adult  adventure  animation  biography  comedy  crime  \
action          395      6         24         12          1     110     34   
adult             7    205         26          2          0     137      2   
adventure        39     46        145         26          1      90      2   
animation        26      0         26         65          0      90      0   
biography         0      0          1          0          5      11      0   
comedy           84     49         48         28          3    3960     18   
crime            40      1          2          0          0      51     49   
documentary      42     22         35         17         14     322     13   
drama           153     50         52         24          9    1290     52   
family            4      1         10         35          0     144      0   
fantasy          19      0         15         19          1      28      2   
game-show         2      1          1         

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np

# Plot the confusion counts produced by the evaluation cell (`confusion_df_viz`,
# one row per actual/predicted pair) rather than a hand-typed snapshot, so the
# heatmap always reflects the model evaluated in this notebook.
df = confusion_df_viz.copy()

# Use every genre seen on either axis so the index lookups below cannot fail.
genres = sorted(set(df['actual']) | set(df['predicted']))
genre_position = {genre: position for position, genre in enumerate(genres)}

# Named `confusion_counts`, not `confusion_matrix`: rebinding that name would
# shadow sklearn's confusion_matrix function and break re-running earlier cells.
confusion_counts = np.zeros((len(genres), len(genres)))
for actual_genre, predicted_genre, n_samples in zip(df['actual'], df['predicted'], df['count']):
    confusion_counts[genre_position[actual_genre], genre_position[predicted_genre]] = n_samples

fig = go.Figure(data=go.Heatmap(
    z=confusion_counts,
    x=genres,
    y=genres,
    colorscale='Blues',
    showscale=True,
    text=confusion_counts.astype(int),
    texttemplate="%{text}",
    textfont={"size": 12},
    hoverongaps=False,
    hovertemplate='Actual: %{y}<br>Predicted: %{x}<br>Count: %{z}<extra></extra>'
))

fig.update_layout(
    title='Confusion Matrix - Movie Genre',
    xaxis_title='Predicted',
    yaxis_title='Actual'
)

fig.update_xaxes(side='bottom')
# Put the first genre at the top so the diagonal runs top-left to bottom-right.
fig.update_yaxes(autorange='reversed')



In [None]:
print("MOVIE GENRE CLASSIFICATION MODEL - COMPREHENSIVE ANALYSIS")
print("="*70)

# Every figure in this report is read from objects computed by earlier cells
# (cv_results, best_config_name, final_pipeline, test_predictions, ...), so the
# text cannot drift away from the model that was actually evaluated.
best_cv = cv_results[best_config_name]
final_vectorizer = final_pipeline.named_steps['vectorizer']

print("\n1. DATASET OVERVIEW")
print("-" * 30)
print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"Original genres: {len(train_df['genre'].unique())}")
print(f"Grouped genres: {len(train_df['genre_grouped'].unique())}")

print("\n2. DATA PREPROCESSING")
print("-" * 30)
print("- Text cleaning: lowercase, remove special characters")
print("- Combined title and description for better feature representation")
print("- Grouped rare genres (≤2 samples) into 'other' category")
print(f"- Used {type(final_vectorizer).__name__} with n-gram range {final_vectorizer.ngram_range}")

print("\n3. MODEL PERFORMANCE")
print("-" * 30)
print(f"Best model: {best_config_name}")
print(f"Cross-validation accuracy: {best_cv['mean']:.3f} ± {best_cv['std']*2:.3f}")
print(f"Test set accuracy: {test_accuracy:.3f}")

print("\n4. PERFORMANCE BY GENRE")
print("-" * 30)
# genre -> share of its test rows predicted correctly, plus its test row count.
genre_performance = {}
for genre in sorted(y_test_true.unique()):
    genre_mask = y_test_true == genre
    if genre_mask.sum() > 0:
        genre_accuracy = (y_test_true[genre_mask] == test_predictions[genre_mask]).mean()
        genre_count = genre_mask.sum()
        genre_performance[genre] = {'accuracy': genre_accuracy, 'count': genre_count}
        print(f"{genre:15}: {genre_accuracy:.3f} accuracy ({genre_count:2d} samples)")

print("\n5. KEY INSIGHTS")
print("-" * 30)
# Genres ranked by per-genre accuracy, best first.
ranked_genres = sorted(
    genre_performance,
    key=lambda g: genre_performance[g]['accuracy'],
    reverse=True,
)
best_genre_label = worst_genre_label = "n/a"
if ranked_genres:
    best_genre, worst_genre = ranked_genres[0], ranked_genres[-1]
    best_genre_label = f"{best_genre} ({genre_performance[best_genre]['accuracy']:.3f})"
    worst_genre_label = f"{worst_genre} ({genre_performance[worst_genre]['accuracy']:.3f})"

    print(f"- Best-performing genre: {best_genre_label}, {genre_performance[best_genre]['count']} test samples")
    lowest_genres = ranked_genres[-3:][::-1]
    print("- Lowest-accuracy genres: " + ", ".join(
        f"{g} ({genre_performance[g]['accuracy']:.3f}, {genre_performance[g]['count']} samples)"
        for g in lowest_genres
    ))

    # Compare how often the top prediction is made with how often it is true.
    predicted_share = pd.Series(test_predictions).value_counts(normalize=True)
    true_share = y_test_true.value_counts(normalize=True)
    top_predicted = predicted_share.index[0]
    print(
        f"- Most frequently predicted genre: {top_predicted} "
        f"({predicted_share.iloc[0]:.1%} of predictions vs "
        f"{true_share.get(top_predicted, 0.0):.1%} of true labels)"
    )

print("\n6. FEATURE ANALYSIS")
print("-" * 30)
print("Top predictive features by genre:")
for genre in ['drama', 'documentary', 'comedy']:
    if genre in feature_importance:
        top_3 = feature_importance[genre][:3]
        features = [f[0] for f in top_3]
        print(f"- {genre}: {', '.join(features)}")

print("\n7. RECOMMENDATIONS FOR IMPROVEMENT")
print("-" * 30)
print("- Collect more training data, especially for underrepresented genres")
print("- Try advanced techniques like word embeddings (Word2Vec, GloVe)")
print("- Implement ensemble methods combining multiple classifiers")
print("- Use hierarchical classification (group similar genres)")
print("- Apply techniques to handle class imbalance (SMOTE, class weights)")
print("- Include additional features like movie metadata (year, runtime, etc.)")

summary_data = {
    'Metric': [
        'Training Samples',
        'Test Samples',
        'Number of Genres',
        'Best Model',
        'CV Accuracy',
        'Test Accuracy',
        'Best Performing Genre',
        'Worst Performing Genre'
    ],
    'Value': [
        len(train_df),
        len(test_df),
        len(y_full.unique()),
        best_config_name,
        f"{best_cv['mean']:.3f}",
        f"{test_accuracy:.3f}",
        best_genre_label,
        worst_genre_label
    ]
}

summary_df = pd.DataFrame(summary_data)
print(f"\n8. MODEL SUMMARY")
print("-" * 30)
print(summary_df.to_string(index=False))

summary_df.to_csv('model_summary.csv', index=False)

print(f"\n9. EXAMPLE PREDICTIONS")
print("-" * 30)
# Show at most 10 examples, and never more rows than the test set holds.
n_examples = min(10, len(test_df))
probabilities = final_pipeline.predict_proba(test_df['combined_text'][:n_examples])
classes = final_pipeline.classes_

for i in range(n_examples):
    title = test_df.iloc[i]['title'][:40]
    pred = test_predictions[i]
    actual = y_test_true.iloc[i]
    # Confidence is the highest class probability for this row.
    confidence = max(probabilities[i])
    correct = "✓" if pred == actual else "✗"

    print(f"{i+1:2d}. {title:40} | Pred: {pred:12} | Actual: {actual:12} | Conf: {confidence:.3f} | {correct}")

print(f"\nAnalysis complete! Files generated:")
print("- confusion_matrix.csv")
print("- confusion_matrix_data.csv")
print("- prediction_results.csv")
print("- model_summary.csv")

MOVIE GENRE CLASSIFICATION MODEL - COMPREHENSIVE ANALYSIS

1. DATASET OVERVIEW
------------------------------
Training samples: 54214
Test samples: 54200
Original genres: 27
Grouped genres: 27

2. DATA PREPROCESSING
------------------------------
- Text cleaning: lowercase, remove special characters
- Combined title and description for better feature representation
- Grouped rare genres (≤2 samples) into 'other' category
- Used Count Vectorizer with unigrams and bigrams

3. MODEL PERFORMANCE
------------------------------
Best model: Logistic Regression with Count Vectorizer
Cross-validation accuracy: 0.535 ± 0.004
Test set accuracy: 0.560

4. PERFORMANCE BY GENRE
------------------------------
action         : 0.301 accuracy (1314 samples)
adult          : 0.347 accuracy (590 samples)
adventure      : 0.187 accuracy (775 samples)
animation      : 0.131 accuracy (498 samples)
biography      : 0.019 accuracy (264 samples)
comedy         : 0.532 accuracy (7446 samples)
crime          : 0

In [None]:
import plotly.express as px
import pandas as pd

# Build the chart data from the frames used for training and evaluation, so the
# bars show this notebook's actual genre distribution instead of typed-in counts.
train_counts = train_df['genre_grouped'].value_counts()
test_counts = test_solution_df['genre_grouped'].value_counts()

# Order genres by training frequency; genres seen only in the test set go last.
genre_order = list(train_counts.index) + [
    genre for genre in test_counts.index if genre not in train_counts.index
]

# Long format: one row per (genre, dataset) pair.
data = {"genre": [], "dataset": [], "count": []}
for genre in genre_order:
    for dataset_name, dataset_counts in (("Training", train_counts), ("Test", test_counts)):
        data["genre"].append(genre)
        data["dataset"].append(dataset_name)
        data["count"].append(int(dataset_counts.get(genre, 0)))

df = pd.DataFrame(data)

fig = px.bar(df,
             x='genre',
             y='count',
             color='dataset',
             barmode='group',
             title='Genre Distribution: Training vs Test Data',
             color_discrete_sequence=['#1FB8CD', '#DB4545'])

fig.update_layout(
    xaxis_title='Genre',
    yaxis_title='Count',
    legend=dict(orientation='h', yanchor='bottom', y=1.05, xanchor='center', x=0.5)
)
