In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTETomek
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns




In [None]:
# Location of the movie dataset CSV inside the Kaggle input directory.
data_path = '/kaggle/input/the-movie-repo-database/data4.csv'

# Read the CSV into the working DataFrame used by the following cells.
data = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Keep only fully populated rows: a row is dropped if any of its columns is missing.
data = data.dropna(axis=0, how='any')


In [None]:
# Count how many rows carry each verdict label (the prediction target)
# so the class balance can be inspected before modelling.
verdict_counts = data.loc[:, 'Verdict'].value_counts()

# Show the per-label counts
print(verdict_counts)

In [None]:
# Cast both numeric columns to integers in a single pass.
data = data.astype({'Year': int, 'Duration': int})

In [None]:
# Input columns handed to the preprocessing step, grouped by kind.
features = [
    'Title',
    'Year', 'Rate', 'Duration',
    'Genre1', 'Genre2', 'Genre3',
    'Director', 'Writer',
    'Actor1', 'Actor2',
    'Language',
    'Description',
]

# Feature frame and the target column to predict.
X = data[features]
y = data['Verdict']

In [None]:
# Integer class id assigned to each verdict label.
# NOTE(review): the ids do not look like a strict success ranking ('Successful'
# is numbered below 'Average'). This is harmless for plain classification, but
# confirm if an ordinal meaning is intended.
label_mapping = {
    'Disaster': 0,
    'Flop': 1,
    'Successful': 2,
    'Average': 3,
    'Hit': 4,
    'Outstanding': 5,
    'Superhit': 6,
    'Blockbuster': 7
}

# Fail fast on labels the mapping does not know. Otherwise they would end up
# as stray values in the target and only blow up (or be silently mis-handled)
# much later, inside resampling or model fitting.
unmapped_labels = set(y.dropna().unique()) - set(label_mapping)
if unmapped_labels:
    raise ValueError(
        f"Verdict labels missing from label_mapping: {sorted(map(str, unmapped_labels))}"
    )

# Replace the categorical labels with their numeric equivalents.
# Series.map builds the integer Series directly; Series.replace would leave
# unmapped labels as strings and depends on an object->int downcast that
# pandas has deprecated.
y_encoded = y.map(label_mapping)

# Check the result
print(y_encoded)

In [None]:
# (transformer name, source column) for every free-text column; each one gets
# its own independent TF-IDF vectorizer.
_text_transformers = [
    ('title', 'Title'),
    ('description', 'Description'),
    ('director', 'Director'),
    ('writer', 'Writer'),
    ('actors', 'Actor1'),
    ('actors2', 'Actor2'),
    ('language', 'Language'),
]

# Column-wise feature pipeline:
#   * 'num'  - Year and Duration are passed through untouched
#   * 'cat'  - Rate and the three genre columns are one-hot encoded
#              (categories unseen at fit time are ignored at transform time)
#   * the text columns listed above are TF-IDF vectorized one by one
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', ['Year', 'Duration']),
        (
            'cat',
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            ['Rate', 'Genre1', 'Genre2', 'Genre3'],
        ),
        *[
            (transformer_name, TfidfVectorizer(), column_name)
            for transformer_name, column_name in _text_transformers
        ],
    ]
)


In [None]:
def test_multiple_models(X, y, preprocessor):
    models = [
        ('Logistic Regression', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')),
        ('Support Vector Machine', SVC(random_state=42, class_weight='balanced')),
        ('Decision Tree', DecisionTreeClassifier(random_state=42, class_weight='balanced')),
        ('Random Forest', RandomForestClassifier(random_state=42, class_weight='balanced')),
        ('XGBoost', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')),
        ('LightGBM', LGBMClassifier(random_state=42, class_weight='balanced'))
    ]

    # Train-test split with encoded labels
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Preprocess the data
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_test_preprocessed = preprocessor.transform(X_test)

    # Combine SMOTE and Tomek links (for oversampling and undersampling)
    smote_tomek = SMOTETomek(random_state=42)

    # Apply resampling to the preprocessed training data
    X_train_resampled, y_train_resampled = smote_tomek.fit_resample(X_train_preprocessed, y_train)

    for model_name, model in models:
        print(f"Training {model_name}...")

        # Train the model
        model.fit(X_train_resampled, y_train_resampled)

        # Predict and evaluate
        y_pred = model.predict(X_test_preprocessed)
        print(f"Classification report for {model_name}:\n")
        print(classification_report(y_test, y_pred, zero_division=0))
        
        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        print(f"Confusion Matrix for {model_name}:\n{cm}\n")
        
        # Plot confusion matrix
        plt.figure(figsize=(10, 7))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f'Confusion Matrix for {model_name}')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()
        
        print("-" * 50)

# Run the function with encoded target labels
test_multiple_models(X, y_encoded, preprocessor)

In [None]:
def test_multiple_models(X, y, preprocessor):
    models = [
        ('Logistic Regression', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')),
        ('Support Vector Machine', SVC(random_state=42, class_weight='balanced')),
        ('Decision Tree', DecisionTreeClassifier(random_state=42, class_weight='balanced')),
        ('Random Forest', RandomForestClassifier(random_state=42, class_weight='balanced')),
        ('XGBoost', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')),
        ('LightGBM', LGBMClassifier(random_state=42, class_weight='balanced'))
    ]

    # Train-test split with encoded labels
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Preprocess the data
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_test_preprocessed = preprocessor.transform(X_test)

    # Combine SMOTE and Tomek links (for oversampling and undersampling)
    smote_tomek = SMOTETomek(random_state=42)

    # Apply resampling to the preprocessed training data
    X_train_resampled, y_train_resampled = smote_tomek.fit_resample(X_train_preprocessed, y_train)

    for model_name, model in models:
        print(f"Training {model_name}...")

        # Train the model
        model.fit(X_train_resampled, y_train_resampled)

        # Predict and evaluate
        y_pred = model.predict(X_test_preprocessed)
        print(f"Classification report for {model_name}:\n")
        print(classification_report(y_test, y_pred, zero_division=0))
        
        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        print(f"Confusion Matrix for {model_name}:\n{cm}\n")
        
        # Plot confusion matrix
        plt.figure(figsize=(10, 7))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f'Confusion Matrix for {model_name}')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()
        
        print("-" * 50)

# Run the function with encoded target labels
test_multiple_models(X, y_encoded, preprocessor)

In [None]:
def test_multiple_models(X, y, preprocessor):
    models = [
        ('Logistic Regression', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')),
        ('Support Vector Machine', SVC(random_state=42, class_weight='balanced')),
        ('Decision Tree', DecisionTreeClassifier(random_state=42, class_weight='balanced')),
        ('Random Forest', RandomForestClassifier(random_state=42, class_weight='balanced')),
        ('XGBoost', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')),
        ('LightGBM', LGBMClassifier(random_state=42, class_weight='balanced'))
    ]

    # Train-test split with encoded labels
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    # Preprocess the data
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_test_preprocessed = preprocessor.transform(X_test)

    # Combine SMOTE and Tomek links (for oversampling and undersampling)
    smote_tomek = SMOTETomek(random_state=42)

    # Apply resampling to the preprocessed training data
    X_train_resampled, y_train_resampled = smote_tomek.fit_resample(X_train_preprocessed, y_train)

    for model_name, model in models:
        print(f"Training {model_name}...")

        # Train the model
        model.fit(X_train_resampled, y_train_resampled)

        # Predict and evaluate
        y_pred = model.predict(X_test_preprocessed)
        print(f"Classification report for {model_name}:\n")
        print(classification_report(y_test, y_pred, zero_division=0))
        
        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        print(f"Confusion Matrix for {model_name}:\n{cm}\n")
        
        # Plot confusion matrix
        plt.figure(figsize=(10, 7))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f'Confusion Matrix for {model_name}')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()
        
        print("-" * 50)

# Run the function with encoded target labels
test_multiple_models(X, y_encoded, preprocessor)