In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from statsmodels.stats.outliers_influence import variance_inflation_factor
#pd.set_option('display.max_columns',None)

movies = pd.read_csv('IMDb Movies India.csv',encoding='ISO-8859-1')

#remove entries with 'Rating' as null.
#Because target variable cannot be null.
movies = movies.dropna(subset=['Rating'])
movies.drop_duplicates(inplace=True)

movies.fillna({
    'Year': 'Unknown',
    'Duration': '0 min',
    'Genre': 'Unknown',
    'Votes': 0,
    'Director': 'Unknown',
    'Actor 1': 'Unknown',
    'Actor 2': 'Unknown',
    'Actor 3': 'Unknown'
}, inplace=True)
movies = movies.reset_index(drop=True)
#remove () from Year and Duration and make it as int.
movies['Duration'] = movies['Duration'].str.extract('(\d+)').astype(int)
movies['Year'] = movies['Year'].str.extract('(\d{4})').astype(int)
movies['Votes'] = movies['Votes'].str.replace(',', '').astype(int)

#Data Analysis
DAmovies  = movies.copy()
DAmovies['Genre'] = DAmovies['Genre'].str.split(', ')
DAmovies = DAmovies.explode('Genre').reset_index(drop=True)
#DAmovies.to_csv("/home/mis/Sukhesh/movie_rating_da.csv",index=False)
#creating separate column for each genre
movies['Genre'] = movies['Genre'].str.split(', ')
df_genres = movies['Genre'].str.join('|').str.get_dummies()
movies = pd.concat([movies, df_genres], axis=1)
movies = movies.drop(columns=['Genre'])


In [2]:
scaler = StandardScaler()
movies[['Votes', 'Duration']] = scaler.fit_transform(movies[['Votes', 'Duration']])

#one-hot Encoding
movies_updated = pd.get_dummies(movies, columns=['Director', 'Actor 1', 'Actor 2', 'Actor 3'], drop_first=True)
movies_updated.head()

# Train-test split
X = movies_updated.drop(columns=['Rating', 'Name'])
y = movies_updated['Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Initialize the models using user defined Function
models = {
    "Decision Tree Regressor": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Gradient Boosting Regressor": GradientBoostingRegressor(),
    "Support Vector Regressor": SVR(),
    "XGBoost Regressor": XGBRegressor()
}

results = {}
for name, model in models.items():
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        results[name] = {
            "MAE": mean_absolute_error(y_test, y_pred),
            "MSE": mean_squared_error(y_test, y_pred),
            "R²": r2_score(y_test, y_pred)
        }
    except:
        pass
for name, metrics in results.items():
    print(f"Model: {name}")
    print(f"  MAE: {metrics['MAE']}")
    print(f"  MSE: {metrics['MSE']}")
    print(f"  R²: {metrics['R²']}\n")    

Model: Decision Tree Regressor
  MAE: 1.0160101010101008
  MSE: 1.870530303030303
  R²: 0.024868551516088355

Model: Random Forest Regressor
  MAE: 0.8026782828282828
  MSE: 1.1749431166666666
  R²: 0.3874870771217829

Model: Gradient Boosting Regressor
  MAE: 0.8539842692085623
  MSE: 1.2478530213845314
  R²: 0.34947820825653275

Model: Support Vector Regressor
  MAE: 1.0952599074873703
  MSE: 1.8763528938204406
  R²: 0.02183316022522419

Model: XGBoost Regressor
  MAE: 0.7962368691328799
  MSE: 1.1250841295471112
  R²: 0.41347920686759276

