In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from statsmodels.stats.outliers_influence import variance_inflation_factor
#pd.set_option('display.max_columns',None)

movies = pd.read_csv('IMDb Movies India.csv',encoding='ISO-8859-1')

#remove entries with 'Rating' as null.
#Because target variable cannot be null.
movies = movies.dropna(subset=['Rating'])
movies.drop_duplicates(inplace=True)

movies.fillna({
    'Year': 'Unknown',
    'Duration': '0 min',
    'Genre': 'Unknown',
    'Votes': 0,
    'Director': 'Unknown',
    'Actor 1': 'Unknown',
    'Actor 2': 'Unknown',
    'Actor 3': 'Unknown'
}, inplace=True)
movies = movies.reset_index(drop=True)
#remove () from Year and Duration and make it as int.
movies['Duration'] = movies['Duration'].str.extract('(\d+)').astype(int)
movies['Year'] = movies['Year'].str.extract('(\d{4})').astype(int)
movies['Votes'] = movies['Votes'].str.replace(',', '').astype(int)

#Data Analysis
DAmovies  = movies.copy()
DAmovies['Genre'] = DAmovies['Genre'].str.split(', ')
DAmovies = DAmovies.explode('Genre').reset_index(drop=True)

In [2]:
from sklearn import preprocessing
le=preprocessing.LabelEncoder()
movies['Genre']=le.fit_transform(movies['Genre'])
movies['Director']=le.fit_transform(movies['Director'])
movies['Actor 1']=le.fit_transform(movies['Actor 1'])
movies['Actor 2']=le.fit_transform(movies['Actor 2'])
movies['Actor 3']=le.fit_transform(movies['Actor 3'])

scaler = StandardScaler()
movies[['Votes', 'Duration']] = scaler.fit_transform(movies[['Votes', 'Duration']])

# Train-test split
X = movies.drop(columns=['Rating', 'Name'])
y = movies['Rating']

# PolynomialFeatures is used to generate new features by adding polynomial combinations of the original features up to a specified degree.
# This can be useful for polynomial regression or when you want to capture non-linear relationships between variables in your dataset.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.3, random_state=42)

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Initialize the models using user defined Function
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Gradient Boosting Regressor": GradientBoostingRegressor(),
    "Support Vector Regressor": SVR(),
    "XGBoost Regressor": XGBRegressor()
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = {
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mean_squared_error(y_test, y_pred),
        "R²": r2_score(y_test, y_pred)
    }

for name, metrics in results.items():
    print(f"Model: {name}")
    print(f"  MAE: {metrics['MAE']}")
    print(f"  MSE: {metrics['MSE']}")
    print(f"  R²: {metrics['R²']}\n")    

Model: Linear Regression
  MAE: 0.9910047410989525
  MSE: 1.5810085647789218
  R²: 0.1695072828225086

Model: Random Forest Regressor
  MAE: 0.8561127946127945
  MSE: 1.286848148989899
  R²: 0.3240276873522888

Model: Gradient Boosting Regressor
  MAE: 0.8442401861605973
  MSE: 1.2277778163060056
  R²: 0.35505691906431613

Model: Support Vector Regressor
  MAE: 1.0721099225045099
  MSE: 1.8280665213941718
  R²: 0.039729469937346384

Model: XGBoost Regressor
  MAE: 0.8807749802215331
  MSE: 1.3592095793180983
  R²: 0.2860167352101546

