# IMDb Rating Prediction from a data set of Movies

## The GitHub repository for this project can be found at
### https://github.com/diptaraj23/IMDb-Rating-Prediction-

In [None]:
#Import the Libraries
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import missingno as msno 
import seaborn as sns
# matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so prefer the new name and fall back
# to the legacy spelling on older installations.
_whitegrid_style = 'seaborn-v0_8-whitegrid'
if _whitegrid_style not in plt.style.available:
    _whitegrid_style = 'seaborn-whitegrid'
plt.style.use(_whitegrid_style)

from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the movie catalogue CSV, using its "ID" column as the row index,
# then preview the first rows.
csv_path = "../input/movies-on-netflix-prime-video-hulu-and-disney/MoviesOnStreamingPlatforms_updated.csv"
df_movies = pd.read_csv(csv_path, index_col="ID")
df_movies.head()

In [None]:
# Discard the "Unnamed: 0" column (presumably a leftover row counter from the
# CSV export -- it is not used anywhere below), then preview the result.
df_movies = df_movies.drop(columns=["Unnamed: 0"])
df_movies.head()

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes
df_movies.info()

In [None]:
# Count the missing (NaN) values in each column
df_movies.isna().sum()

In [None]:
# Keep only the rows that actually have an IMDb rating: rows with a missing
# target cannot be used for training or evaluation.
df_movies = df_movies.dropna(subset=['IMDb'])

In [None]:
# Plot a missingno bar chart of the frame to visualise how much data is
# missing in each column
msno.bar(df_movies ,color='red', figsize=(10, 4))

In [None]:
# Columns removed before modelling:
#   - "Rotten Tomatoes": too few non-missing values to be useful
#   - "Title" and "Type": irrelevant for the prediction
unused_columns = ['Rotten Tomatoes', 'Title', 'Type']
df_movies = df_movies.drop(columns=unused_columns)

# Re-check the remaining missing values per column
df_movies.isna().sum()

In [None]:
# List the distinct values found in the "Age" column (missing values included)
df_movies['Age'].unique()

In [None]:
# Replace missing "Age" entries with the constant label "all"
imputer = SimpleImputer(strategy='constant', fill_value="all", missing_values=np.nan)
# The imputer works on 2-D input and returns a 2-D array, so select the column
# as a one-column frame and flatten the result before writing it back.
age_filled = imputer.fit_transform(df_movies[["Age"]])
df_movies["Age"] = age_filled.ravel()

# Re-check the remaining missing values per column
df_movies.isna().sum()

In [None]:
# Drop, in place, every row that still has at least one empty cell
# (dropna's defaults: row-wise, any missing value)
df_movies.dropna(inplace=True)

# Confirm that no missing values remain
df_movies.isna().sum()

In [None]:
# Plot the missingno bar chart again to check whether any data is still missing
msno.bar(df_movies ,color='red', figsize=(10, 4))

In [None]:
# Summarise the cleaned frame: column names, non-null counts and dtypes
df_movies.info()

In [None]:
# Feature columns used by the model, grouped by type
numerical_cols = ['Year', 'Runtime']
cat_cols = ["Age", "Directors", "Genres", "Country", "Language"]
my_cols = numerical_cols + cat_cols

# Target is the IMDb rating; every other column is a candidate feature
y = df_movies['IMDb']
X = df_movies.drop(columns=['IMDb'])

# Split into train and test sets (default split proportions, fixed seed)
X_train_full, X_test_full, y_train, y_test = train_test_split(X, y, random_state=0)

# Keep only the selected feature columns, copying so that later edits do not
# touch the full splits
X_train, X_test = (split[my_cols].copy() for split in (X_train_full, X_test_full))

In [None]:
# Preview the first rows of the training features
X_train.head()

In [None]:
# Preview the first rows of the test features
X_test.head()

In [None]:
# Work on copies so X_train / X_test keep their original string values
label_X_train, label_X_test = X_train.copy(), X_test.copy()

# Integer-encode each categorical column. The encoder is refitted per column
# on the train and test values together, so every category seen in either
# split has a code and transform() never meets an unknown label.
label_encoder = LabelEncoder()
for col in cat_cols:
    label_encoder.fit(pd.concat([label_X_train[col], label_X_test[col]], axis=0, sort=False))
    for frame in (label_X_train, label_X_test):
        frame[col] = label_encoder.transform(frame[col])

In [None]:
# Candidate random-forest configurations to compare. random_state is fixed on
# every model so repeated runs give the same results.
model_1 = RandomForestRegressor(n_estimators=50, random_state=1)
model_2 = RandomForestRegressor(n_estimators=100, random_state=1)
# The 'mae' criterion was renamed to 'absolute_error' in scikit-learn 1.0 and
# the old spelling was removed in 1.2, so use the current name.
model_3 = RandomForestRegressor(n_estimators=100, criterion='absolute_error', random_state=1)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=1)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=1)

# List of models, in the order they are scored and reported
models = [model_1, model_2, model_3, model_4, model_5]

In [None]:
# Function for comparing different models
def score_model(model, X_t=label_X_train, X_v=label_X_test, y_t=y_train, y_v=y_test):
    """Fit ``model`` on the training data and return its validation MAE.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.
        X_t: Training features.
        X_v: Validation features.
        y_t: Training target.
        y_v: Validation target.

    Returns:
        The mean absolute error between ``y_v`` and the predictions for ``X_v``.

    Note:
        The defaults are bound once, when this function is defined, so this
        definition must be re-run after the encoded splits are rebuilt.
    """
    model.fit(X_t, y_t)
    return mean_absolute_error(y_v, model.predict(X_v))

# Score every candidate model, printing its MAE and collecting the scores in
# the same order as `models`
mae_scores = []

for i, candidate in enumerate(models):
    mae = score_model(candidate)
    print("Model %d MAE: %f" % (i+1, mae))
    mae_scores.append(mae)

In [None]:
# Lowest mean absolute error among the scored models (lower is better)
best_score=min(mae_scores)
best_score