In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import data


In [None]:
# Load the IMDB 5000 movie metadata from the Kaggle input directory.
DATA_PATH = '/kaggle/input/imdb-5000-movie-dataset/movie_metadata.csv'
df = pd.read_csv(DATA_PATH)

# Exploring the dataset 

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# List the available column names.
df.columns

# Identifying null values 

In [None]:
# Per column: does it contain at least one missing value?
df.isna().any()

### The majority of the data contains null values, which need to be removed or imputed

In [None]:
# Per column: number of missing values.
df.isna().sum()

### The IMDB link is an index variable which is not required for the analysis

In [None]:
# Remove the IMDB link column; it is not used in the analysis.
df.drop(columns=['movie_imdb_link'], inplace=True)

#### Remove the null values row-wise and check for the loss of data

In [None]:
# Drop every row that is missing a value in any of these columns.
required_columns = [
    'director_name', 'num_critic_for_reviews', 'duration',
    'actor_3_facebook_likes', 'actor_2_name', 'actor_1_facebook_likes',
    'actor_1_name', 'actor_3_name', 'facenumber_in_poster',
    'num_user_for_reviews', 'language', 'country',
    'actor_2_facebook_likes', 'plot_keywords',
]
df.dropna(axis=0, subset=required_columns, inplace=True)

In [None]:
# Shape after the row-wise dropna, to see how many rows were lost.
df.shape

In [None]:
# Missing values that remain per column after the row-wise dropna.
df.isna().sum()

#### Investigating genre and director name

In [None]:
# Number of distinct genre strings. nunique() measures cardinality directly;
# value_counts().unique().sum() adds up the distinct *frequencies* instead,
# which is not a count of categories.
df['genres'].nunique()

###### It is clear that genre contains a very large number of unique categories (the count shown above) and thus needs to be investigated further

In [None]:
# Look at a few raw genre strings.
df['genres'].head()

###### It is clear that several genres are attached together for each movie. This is mainly because movies want to cater to a diverse set of individuals and therefore include many genres rather than going for a single theme.

##### We need to split the genres first and then apply one hot encoding to get a sparse matrix for each genre 

In [None]:
# One-hot encode the pipe-separated genre labels. Series.str.get_dummies
# splits on the separator itself, so the split('|') / join('|') round trip is
# unnecessary and the 'genres' column keeps its original strings.
genres = df['genres'].str.get_dummies(sep='|')
# Append the genre indicator columns to the frame.
df = pd.concat([df, genres], axis=1)

In [None]:
# Columns after appending the genre indicator columns.
df.columns

In [None]:
# The raw genres column is no longer needed once the indicators exist.
df.drop(columns=['genres'], inplace=True)

In [None]:
# Columns after dropping the raw genres column.
df.columns

In [None]:
# Number of distinct directors. nunique() gives the cardinality directly;
# value_counts().unique().sum() would add up distinct frequencies instead.
df['director_name'].nunique()

###### Since there are a lot of unique values we need to drop this variable

In [None]:
# Too many distinct directors to encode usefully, so drop the column.
df.drop(columns=['director_name'], inplace=True)

In [None]:
# Distinct content-rating labels (NaN included if present).
df['content_rating'].unique()

In [None]:
# How many rows have no content rating?
df['content_rating'].isna().sum()


In [None]:
# Frequency of each content-rating label, most common first.
df['content_rating'].value_counts()

In [None]:
# Impute missing content ratings with the most frequent label (the mode),
# computed from the data rather than hard-coded.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is chained assignment, which newer pandas versions warn about and
# do not propagate to the parent frame.
most_common_rating = df['content_rating'].mode().iloc[0]
df['content_rating'] = df['content_rating'].fillna(most_common_rating)


In [None]:

# One-hot encode content_rating; the first level is dropped as the baseline.
df = pd.get_dummies(
    df,
    columns=['content_rating'],
    prefix=['content_rating'],
    drop_first=True,
)


In [None]:
# Columns after encoding content_rating.
df.columns

### Splitting the data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate predictors from the target ('gross') and hold out a test set.
features = df.drop(columns=['gross'])
target = df['gross']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, random_state=1234
)

In [None]:
# Shapes of the training and test feature frames.
X_train.shape,X_test.shape

## Exploratory data analysis

In [None]:
# Missing values per training feature that still need imputing.
X_train.isna().sum()

## Budget variable

In [None]:
# Distribution of the training-set budget.
# sns.distplot is deprecated; histplot with kde=True and stat='density'
# draws the equivalent histogram plus density curve.
plt.figure(figsize=(5, 5))
sns.histplot(X_train['budget'], kde=True, stat='density')
plt.title('Budget distribution (training set)')
plt.xticks(rotation=90)
plt.show()

It appears that the budget variable contains a lot of peaks and has a high standard deviation, so we need to check a summary of the distribution.

In [None]:
# Summary statistics of the training-set budget.
X_train['budget'].describe()

The best way to impute the missing values in this scenario is to use the median.

In [None]:
import statistics

In [None]:
# Median budget of the training set. The pandas median skips NaN values;
# statistics.median sorts the raw values, and the column still contains NaN
# at this point, which makes the sort order (and the result) unreliable.
X_train['budget'].median()

In [None]:
# Impute missing budgets with the training-set median (the distribution is skewed).
# The median is computed from X_train only and reused for X_test, instead of
# being typed in as a constant. assign() builds a new frame, which avoids the
# chained in-place fillna on a column selection that newer pandas versions do
# not propagate to the parent frame.
budget_median = X_train['budget'].median()
X_train = X_train.assign(budget=X_train['budget'].fillna(budget_median))
X_test = X_test.assign(budget=X_test['budget'].fillna(budget_median))

## Aspect ratio

In [None]:
# Distribution of the training-set aspect ratio.
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
sns.histplot(X_train['aspect_ratio'], kde=True, stat='density')
plt.title('Aspect ratio distribution (training set)')
plt.show()

Aspect ratio appears to be bimodal, and the distribution is also skewed on top of its two modes, so let's impute the median for this variable as well.

In [None]:
# Median aspect ratio of the training set. The pandas median ignores NaN,
# whereas statistics.median on a column that still contains NaN is unreliable.
X_train['aspect_ratio'].median()

In [None]:
# Impute missing aspect ratios with the training-set median (skewed data).
# The value is computed from X_train and reused for X_test rather than
# hard-coded; assign() avoids chained in-place fillna on a column selection.
aspect_ratio_median = X_train['aspect_ratio'].median()
X_train = X_train.assign(
    aspect_ratio=X_train['aspect_ratio'].fillna(aspect_ratio_median)
)
X_test = X_test.assign(
    aspect_ratio=X_test['aspect_ratio'].fillna(aspect_ratio_median)
)

## Gross (Target variable)

In [None]:
# Distribution of the target (gross) in the training set.
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
sns.histplot(y_train, kde=True, stat='density')
plt.title('Gross distribution (training set)')
plt.show()

Gross shows a skewed distribution, so missing values should be imputed with the median.

In [None]:
# Median gross of the training target. Series.median() skips NaN, whereas
# statistics.median on data that still contains NaN is unreliable.
y_train.median()

In [None]:
# Gross is skewed, so missing target values are imputed with the median.
# The median is computed from y_train (NaN-aware) and reused for y_test
# instead of being hard-coded.
# NOTE(review): imputing the target, in particular in y_test, affects the
# reported scores; dropping rows without 'gross' before the split may be
# preferable - confirm the intent.
gross_median = y_train.median()
y_train = y_train.fillna(gross_median)
y_test = y_test.fillna(gross_median)

In [None]:
# Re-check which training features still contain missing values.
X_train.isna().sum()

## Color 

In [None]:
# Frequency of each value of the 'color' feature in the training set.
# The Series is passed as x=: the first positional argument of countplot is
# `data` in current seaborn, so a bare Series would not be counted per category.
sns.countplot(x=X_train['color'])
plt.title('Color vs. black-and-white (training set)')
plt.show()

In [None]:
# Impute missing 'color' values with the most frequent training-set category.
# The mode is taken from X_train and reused for X_test; assign() avoids the
# chained in-place fillna on a column selection.
color_mode = X_train['color'].mode().iloc[0]
X_train = X_train.assign(color=X_train['color'].fillna(color_mode))
X_test = X_test.assign(color=X_test['color'].fillna(color_mode))

In [None]:
# Final check for missing values in the training features.
X_train.isna().sum()

It appears that no NA values remain.

## Getting dummies for the categorical variables

In [None]:
# One-hot encode 'color' in the training set; the first level is the baseline.
X_train = pd.get_dummies(
    X_train,
    columns=['color'],
    prefix=['color'],
    drop_first=True,
)

In [None]:
# One-hot encode 'color' in the test set and align it to the training columns.
# Encoding the test set independently with drop_first=True can yield different
# columns when a category is absent from (or ordered differently in) the test
# data. Here every level is encoded and the frame is then reindexed to the
# already-encoded X_train columns: the baseline level is dropped and any level
# unseen in the test data becomes an all-zero column.
X_test = pd.get_dummies(data=X_test, columns=['color'], prefix=['color'])
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Training columns after encoding 'color'.
X_train.columns

In [None]:
# movie_title is a categorical variable with a lot of categories, so drop it.
X_train.drop(columns=['movie_title'], inplace=True)


In [None]:
# Drop movie_title from the test set as well.
X_test.drop(columns=['movie_title'], inplace=True)

Movie title was dropped as it only acts as an index variable

In [None]:
# Training columns after dropping movie_title.
X_train.columns

In [None]:
# Test columns after dropping movie_title.
X_test.columns

In [None]:
# plot_keywords has a lot of unique keywords, so it is dropped from both sets.
for features in (X_train, X_test):
    features.drop(columns=['plot_keywords'], inplace=True)

## Country

In [None]:
# Number of training-set movies per country.
# The Series is passed as x=: the first positional argument of countplot is
# `data` in current seaborn, so a bare Series would not be counted per category.
plt.figure(figsize=(8, 5))
sns.countplot(x=X_train['country'])
plt.title('Movies per country (training set)')
plt.xticks(rotation=90)
plt.show()

Most movies appear to be made in the USA, followed by the UK. Other countries contribute far less to the dataset, so we can state that there is a bias towards US-based data.
Let's recode the country variable so that it only represents USA, UK and other.

In [None]:
# Recode country into three levels: the two most frequent training-set
# countries are kept, everything else becomes 'other'.
value_count = X_train['country'].value_counts()
vals = value_count.index[:2]
print(vals)

is_top_country = X_train['country'].isin(vals)
X_train['country'] = X_train['country'].where(is_top_country, 'other')

In [None]:
# Recode the test-set country with the SAME two countries that were selected
# from the training set (`vals`, defined in the previous cell). Picking the
# top two from the test set's own counts could keep different countries than
# the model was trained on and would use test-set information in preprocessing.
print(vals)

X_test['country'] = X_test['country'].where(X_test['country'].isin(vals), 'other')

In [None]:
# One-hot encode country. The training set drops the first level as baseline.
X_train = pd.get_dummies(data=X_train, columns=['country'], prefix=['country'], drop_first=True)
# The test set is encoded with every level and then aligned to the training
# columns, so both frames are guaranteed to share the same columns in the same
# order: the baseline level is dropped and a level missing from the test data
# becomes an all-zero column.
X_test = pd.get_dummies(data=X_test, columns=['country'], prefix=['country'])
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)



## Language

In [None]:
# Number of training-set movies per language.
# The Series is passed as x=: the first positional argument of countplot is
# `data` in current seaborn, so a bare Series would not be counted per category.
plt.figure(figsize=(5, 5))
sns.countplot(x=X_train['language'])
plt.title('Movies per language (training set)')
plt.xticks(rotation=90)
plt.show()

From the above plot it is clear that most movies are in fact English. Hindi movies, which are part of the Bollywood industry (the largest industry at present), lack substantial representation.
Therefore we can recode the language variable as English and other.

In [None]:
# Language frequencies, most common first, for the training set
# (count_lanuage) and the test set (count_lanuage1). A later cell uses them
# to reduce language to its most frequent value versus 'other'.
count_lanuage=X_train["language"].value_counts()
count_lanuage1=X_test["language"].value_counts()

In [None]:
# Most frequent language in the TRAINING set; every other language becomes 'other'.
vals1 = count_lanuage[:1].index

# The test set must be recoded with the language chosen from the training
# data. Deriving it from the test set's own counts could keep a different
# language and would use test-set information in preprocessing. vals2 is kept
# as an alias so the name still exists for any later use.
vals2 = vals1

X_train['language'] = X_train['language'].where(X_train['language'].isin(vals1), 'other')
X_test['language'] = X_test['language'].where(X_test['language'].isin(vals2), 'other')


In [None]:
# Recoded language frequencies: test set first, then training set.
X_test['language'].value_counts(),X_train['language'].value_counts()

In [None]:
# One-hot encode language. The training set drops the first level as baseline.
X_train = pd.get_dummies(data=X_train, columns=['language'], prefix=['language'], drop_first=True)
# Encode every level in the test set, then align it to the training columns
# so both frames share identical columns even if a level is missing from the
# test data (missing levels become all-zero columns).
X_test = pd.get_dummies(data=X_test, columns=['language'], prefix=['language'])
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Training columns after encoding country and language.
X_train.columns

Dropping the actor names as they have too many unique values

In [None]:
# Drop the three actor-name columns from the training set.
actor_name_columns = ['actor_1_name', 'actor_2_name', 'actor_3_name']
X_train.drop(columns=actor_name_columns, inplace=True)


In [None]:
# Drop the three actor-name columns from the test set as well.
X_test.drop(
    columns=['actor_1_name', 'actor_2_name', 'actor_3_name'],
    inplace=True,
)

In [None]:
# Shapes of the final test and training feature frames (column counts should match).
X_test.shape,X_train.shape

In [None]:
# Shapes of the test and training targets.
y_test.shape,y_train.shape

# Advanced analysis

### Importing standard scaler

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler instance used to standardise the features in the next cell.
stdscaler=StandardScaler()

In [None]:
# Fit the scaler on the training features only and transform them in one step.
# Previously X_std was first bound to the fitted scaler object and then
# overwritten; fit_transform removes that misleading intermediate assignment.
X_std = stdscaler.fit_transform(X_train)
# Apply the training-set scaling to the test features (no refitting).
X_std_test = stdscaler.transform(X_test)

## Random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Baseline random forest with default hyperparameters, fitted on the training data.
rtree = RandomForestRegressor(random_state=0)
rtree.fit(X_train, y_train)

In [None]:
# R squared of the baseline random forest on the held-out test set.
rtree.score(X_test,y_test)

The accuracy needs to be improved

In [None]:
# Training-set score: a value far above the test score indicates overfitting.
rtree.score(X_train,y_train)

##### It appears that the model has overfitted, so we need to tune its parameters

### Let's first fit the standardized data as well

In [None]:
# Same default random forest, this time fitted on the standardised features.
rtree1 = RandomForestRegressor(random_state=0)
rtree1.fit(X_std, y_train)

In [None]:
# Test-set score of the forest trained on standardised features.
rtree1.score(X_std_test,y_test)

#### The scaling has not helped as random forest does not require us to do feature scaling

### Let's identify the most important features first

In [None]:
from sklearn.inspection import permutation_importance

In [None]:
# Permutation importance of each feature for the baseline forest, measured on
# the test set. random_state is fixed so the shuffles - and therefore the
# ranking - are reproducible between runs.
perm_importance = permutation_importance(rtree, X_test, y_test, random_state=0)

# Order the features from most to least important.
sorted_idx = perm_importance.importances_mean.argsort()[::-1]
plt.figure(figsize=(5,10))
sns.barplot(y=X_test.columns[sorted_idx], x=perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")
plt.title("Feature importance (random forest, test set)")
plt.show()

##### It is clear that the number of voted users is the most important feature


### Let's do a randomized grid search to find the best parameters for this data set

In [None]:
# Search space for the random forest.
# Number of trees in the forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 1.0 (= all features) replaces 'auto', which meant the same thing for
# regressors but was removed from scikit-learn in version 1.3.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree (None = grow until leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]

In [None]:
# Collect the random forest search space in the mapping RandomizedSearchCV expects.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Use the random grid to search for the best hyperparameters.
# Base model to tune; seeded so that repeated searches give the same result.
rf = RandomForestRegressor(random_state=0)
# Randomised search: 33 sampled parameter combinations, each evaluated with
# 3-fold cross validation (99 fits in total), using all available cores.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 33, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the randomised search.
rf_random.best_params_


In [None]:
# Refit a random forest with the tuned hyperparameters.
# max_features=1.0 (all features) is the equivalent of the former 'auto'
# setting for regressors, which scikit-learn removed in version 1.3.
rtree3 = RandomForestRegressor(
    n_estimators=2000,
    max_depth=20,
    max_features=1.0,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=0,
).fit(X_train, y_train)

In [None]:
# Test-set score of the tuned random forest.
rtree3.score(X_test,y_test)

## Gradient boost regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Baseline gradient boosting model with a slightly raised learning rate.
grad = GradientBoostingRegressor(learning_rate=0.15, random_state=0)
grad.fit(X_train, y_train)

In [None]:
# Scores of the baseline gradient boosting model: test set first, then training set.
grad.score(X_test,y_test),grad.score(X_train,y_train)

##### This appears to have a higher accuracy and has not over fitted the data


##### But before we tune, let's also check the accuracy we obtain with extreme gradient boosting (XGBoost)

In [None]:
from xgboost import XGBRegressor

In [None]:
# XGBoost regressor with default settings, for comparison.
xboost = XGBRegressor()
xboost.fit(X_train, y_train)

In [None]:
# Scores of the XGBoost model: training set first, then test set.
xboost.score(X_train,y_train),xboost.score(X_test,y_test)

### grid search has to be applied for gradient boost as it has better prediction than extreme gradient boost

In [None]:
# Search space for the gradient boosting regressor.
# Number of boosting stages (trees)
n_estimators = [100,250,500,750,1000,1250,1500,1750]
# Number of features to consider at every split.
# None (= all features) replaces 'auto', which meant the same thing for
# GradientBoostingRegressor but was removed from scikit-learn in version 1.3.
max_features = ['sqrt', None, 'log2']
# Learning rate (shrinkage applied to each tree's contribution)
learning_rate=[0.15,0.1,0.05,0.01,0.005,0.001]
# Maximum depth of the individual trees
max_depths = [2,3,4,5,6,7]
# Minimum number of samples required to split a node
min_samples_splits = [2,4,6,8,10,20,40,60,100]
# Minimum number of samples required at a leaf node
min_samples_leafs = [1,3,5,7,9]
# Fraction of samples used to fit each tree
subsample=[0.7,0.75,0.8,0.85,0.9,0.95,1]

In [None]:
# Collect the gradient boosting search space in the mapping RandomizedSearchCV expects.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depths,
    min_samples_split=min_samples_splits,
    min_samples_leaf=min_samples_leafs,
    learning_rate=learning_rate,
    subsample=subsample,
)

In [None]:
# Use the random grid to search for the best hyperparameters.
# Base model to tune; seeded so that repeated searches give the same result.
gdr = GradientBoostingRegressor(random_state=0)
# Randomised search: 33 sampled parameter combinations, each evaluated with
# 3-fold cross validation (99 fits in total), using all available cores.
gdr_random = RandomizedSearchCV(estimator = gdr, param_distributions = random_grid, n_iter = 33, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model to execute the search

gdr_random.fit(X_train, y_train)

In [None]:
# Best estimator found by the randomised search (refit on the full training data).
gdr_random.best_estimator_

In [None]:
# Gradient boosting model with the tuned hyperparameters.
grad1 = GradientBoostingRegressor(
    n_estimators=1250,
    learning_rate=0.01,
    max_depth=7,
    max_features='sqrt',
    min_samples_split=20,
    min_samples_leaf=5,
    subsample=0.7,
    random_state=0,
)
grad1.fit(X_train, y_train)

In [None]:
# Scores of the tuned gradient boosting model: test set first, then training set.
grad1.score(X_test,y_test),grad1.score(X_train,y_train)

## Voting regressor 

In [None]:
from sklearn.ensemble import VotingRegressor

In [None]:
# Ensemble that averages the tuned random forest and gradient boosting models.
er = VotingRegressor(estimators=[('rf', rtree3), ('gb', grad1)])
er.fit(X_train, y_train)

In [None]:
# Scores of the voting ensemble: test set first, then training set.
er.score(X_test,y_test),er.score(X_train,y_train)