In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import os

# Path to the dataset
file_path = '/kaggle/input/imdb-india-movies/IMDb Movies India.csv'

# Load the dataset with appropriate encoding
df = pd.read_csv(file_path, encoding='ISO-8859-1')

# Data Cleaning
df = df.dropna(subset=['Rating'])
df['Votes'] = df['Votes'].str.replace(',', '').astype(float)
df['Year'] = df['Year'].str.extract(r'(\d{4})').astype(float)
df['Duration'] = df['Duration'].str.replace(' min', '').astype(float)

# Feature Engineering
df['21st_Century'] = (df['Year'] >= 2000).astype(int)

# Fill missing values for numerical columns
imputer = SimpleImputer(strategy='median')
df[['Year', 'Duration', 'Votes']] = imputer.fit_transform(df[['Year', 'Duration', 'Votes']])

# Drop rows with missing values in key categorical columns
df = df.dropna(subset=['Genre', 'Director', 'Actor 1'])

# One-hot encode 'Genre'
df = pd.concat([df, pd.get_dummies(df['Genre'], prefix='Genre')], axis=1)

# Label encode categorical columns
for col in ['Director', 'Actor 1', 'Actor 2', 'Actor 3']:
    df[col] = LabelEncoder().fit_transform(df[col])

# Prepare features and target
X = df.drop(['Rating', 'Name', 'Genre'], axis=1)
y = df['Rating']

# Handle any remaining NaN values
X = X.fillna(X.median())

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define models
rf = RandomForestRegressor(random_state=42)
gbr = GradientBoostingRegressor(random_state=42)

# Hyperparameter tuning with GridSearchCV
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

param_grid_gbr = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 10]
}

# Grid search for RandomForest
grid_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
grid_rf.fit(X_train, y_train)

# Grid search for GradientBoosting
grid_gbr = GridSearchCV(estimator=gbr, param_grid=param_grid_gbr, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
grid_gbr.fit(X_train, y_train)

# Evaluate both models
best_rf = grid_rf.best_estimator_
best_gbr = grid_gbr.best_estimator_

# Predictions
y_pred_rf = best_rf.predict(X_test)
y_pred_gbr = best_gbr.predict(X_test)

# Evaluation Metrics
rmse_rf = mean_squared_error(y_test, y_pred_rf, squared=False)
r2_rf = r2_score(y_test, y_pred_rf)

rmse_gbr = mean_squared_error(y_test, y_pred_gbr, squared=False)
r2_gbr = r2_score(y_test, y_pred_gbr)

print("Random Forest RMSE:", rmse_rf)
print("Random Forest R^2:", r2_rf)
print("Gradient Boosting RMSE:", rmse_gbr)
print("Gradient Boosting R^2:", r2_gbr)

# Feature importance for RandomForest
top_features_rf = X.columns[np.argsort(best_rf.feature_importances_)[::-1]]
print("Top Features (Random Forest):", top_features_rf[:10])

# Cross-validation scores
cv_scores_rf = cross_val_score(best_rf, X, y, cv=5, scoring='neg_mean_squared_error')
cv_scores_gbr = cross_val_score(best_gbr, X, y, cv=5, scoring='neg_mean_squared_error')

print("Random Forest CV RMSE:", np.sqrt(-cv_scores_rf).mean())
print("Gradient Boosting CV RMSE:", np.sqrt(-cv_scores_gbr).mean())


Random Forest RMSE: 1.1373271888601988
Random Forest R^2: 0.3297408446931084
Gradient Boosting RMSE: 1.1334905762235683
Gradient Boosting R^2: 0.3342552666333356
Top Features (Random Forest): Index(['Votes', 'Year', 'Duration', 'Actor 1', 'Actor 3', 'Actor 2',
       'Director', 'Genre_Drama', 'Genre_Documentary', 'Genre_Action'],
      dtype='object')



KeyboardInterrupt

