# About the dataset

Let us start the project by understanding our data.

The dataset is from the Kaggle website. It contains 28 variables for 5043 movies, spanning 100 years and 66 countries. There are 2399 unique director names and thousands of actors/actresses. “imdb_score” is the response variable, while the other 27 variables are possible predictors.

I have copied this code from "Statistical Approach for Predicting IMDB"

Link: https://www.kaggle.com/bharathraja/statistical-approach-for-predicting-imdb

The author has explained everything very systematically.

# Problem Statement

Given this large body of movie information, it would be interesting to understand which factors make one movie more successful than another. We would therefore like to analyze what kinds of movies are more successful, in other words, which ones receive a higher IMDB score. We also want to present the results of this analysis in an intuitive way by visualizing the outcome using Python.

In this project, we take the IMDB score as the response variable and focus on making predictions by analyzing the remaining variables in the IMDB 5000 movie data. The results can help film companies understand the secret of producing a commercially successful movie.





In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('whitegrid')

In [None]:
# Load the movie metadata CSV into a DataFrame and preview the first rows.
dataset = pd.read_csv('../input/imdb-5000-movie-dataset/movie_metadata.csv')
dataset.head()

In [None]:
# Column names of the loaded frame.
dataset.columns

In [None]:
# Per-column dtype / non-null overview.
dataset.info()

In [None]:
# Number of missing values in each column.
dataset.isnull().sum()

In [None]:
# (rows, columns) before de-duplication.
dataset.shape

In [None]:
# Remove fully duplicated rows in place, then re-check the shape.
dataset.drop_duplicates(inplace=True)
dataset.shape

In [None]:
# Split the column names by dtype: object columns are treated as categorical,
# everything else as numerical.
numerical_cols = [col for col in dataset.columns if dataset[col].dtype != 'object']
categorical_cols = [col for col in dataset.columns if dataset[col].dtype == 'object']

In [None]:
# Summary statistics, transposed so that each described column is a row.
dataset.describe().T

Now let us start by filling in the NA values wherever we can!

In [None]:
# Impute missing `color` values with the most frequent category.
# The filled column is assigned back instead of calling
# `dataset.color.fillna(..., inplace=True)`: that form is chained assignment,
# which pandas 2.2 warns about and which does not update `dataset` once
# Copy-on-Write is enabled.
color_mode = dataset['color'].mode().iloc[0]
dataset['color'] = dataset['color'].fillna(color_mode)
dataset.color.isnull().sum()

In [None]:
# Rows with a missing director_name are dropped rather than imputed.
dataset = dataset.dropna(axis = 0, subset = ['director_name'] )


In [None]:
# Min, max and median of num_critic_for_reviews.
dataset.num_critic_for_reviews.min(), dataset.num_critic_for_reviews.max(), dataset.num_critic_for_reviews.median()

In [None]:
# Impute missing critic-review counts with the column median.
# Assign the result back rather than using fillna(inplace=True) on the
# attribute-accessed column, which is chained assignment and is not
# guaranteed to modify `dataset` (it never does under Copy-on-Write).
num_critic_for_reviews_median = dataset['num_critic_for_reviews'].median()
dataset['num_critic_for_reviews'] = dataset['num_critic_for_reviews'].fillna(num_critic_for_reviews_median)
dataset.num_critic_for_reviews.isnull().sum()

In [None]:
# Impute missing durations with the column median.
# Assigned back explicitly; in-place fillna on `dataset.duration` is chained
# assignment and does not update `dataset` under Copy-on-Write.
duration_median = dataset['duration'].median()
dataset['duration'] = dataset['duration'].fillna(duration_median)
dataset.duration.isnull().sum()

In [None]:
# Impute missing director_facebook_likes with the column mean.
# Assigned back explicitly; in-place fillna on the attribute-accessed column
# is chained assignment and does not update `dataset` under Copy-on-Write.
director_facebook_likes_mean = dataset['director_facebook_likes'].mean()
dataset['director_facebook_likes'] = dataset['director_facebook_likes'].fillna(director_facebook_likes_mean)
dataset.director_facebook_likes.isnull().sum()

In [None]:
# Min, max, median and mean of actor_3_facebook_likes.
dataset.actor_3_facebook_likes.min(), dataset.actor_3_facebook_likes.max(), dataset.actor_3_facebook_likes.median(),dataset.actor_3_facebook_likes.mean()

In [None]:
# Impute missing actor_3_facebook_likes with the column mean.
# Assigned back explicitly; in-place fillna on the attribute-accessed column
# is chained assignment and does not update `dataset` under Copy-on-Write.
actor_3_facebook_likes_mean = dataset['actor_3_facebook_likes'].mean()
dataset['actor_3_facebook_likes'] = dataset['actor_3_facebook_likes'].fillna(actor_3_facebook_likes_mean)
dataset.actor_3_facebook_likes.isnull().sum()

In [None]:
# Drop rows where actor_2_name is missing, then confirm no nulls remain.
dataset = dataset.dropna(axis = 0, subset = ['actor_2_name'])
dataset.actor_2_name.isnull().sum()

In [None]:
# Impute missing actor_1_facebook_likes with the column mean.
# Assigned back explicitly; in-place fillna on the attribute-accessed column
# is chained assignment and does not update `dataset` under Copy-on-Write.
actor_1_facebook_likes_mean = dataset['actor_1_facebook_likes'].mean()
dataset['actor_1_facebook_likes'] = dataset['actor_1_facebook_likes'].fillna(actor_1_facebook_likes_mean)
dataset.actor_1_facebook_likes.isnull().sum()

In [None]:
# Drop rows where gross is missing, then confirm no nulls remain.
dataset = dataset.dropna(axis = 0, subset = ['gross'])
dataset.gross.isnull().sum()

In [None]:
# Only the last expression of a cell is displayed, so the shape on the first
# line is evaluated but not shown; the per-column null counts are.
dataset.shape
dataset.isnull().sum()

In [None]:
# Drop rows where budget is missing, then confirm no nulls remain.
dataset = dataset.dropna(axis = 0, subset = ['budget'])
dataset.budget.isnull().sum()

In [None]:
# Drop rows where actor_3_name is missing, then confirm no nulls remain.
dataset = dataset.dropna(axis = 0, subset = ['actor_3_name'])
dataset.actor_3_name.isnull().sum()

In [None]:
# Impute missing facenumber_in_poster values with the column median.
# Assigned back explicitly; in-place fillna on the attribute-accessed column
# is chained assignment and does not update `dataset` under Copy-on-Write.
facenumber_in_poster_median = dataset['facenumber_in_poster'].median()
dataset['facenumber_in_poster'] = dataset['facenumber_in_poster'].fillna(facenumber_in_poster_median)
dataset.facenumber_in_poster.isnull().sum()

In [None]:
# Impute missing `language` values with the most frequent language.
# Assigned back explicitly; in-place fillna on `dataset.language` is chained
# assignment and does not update `dataset` under Copy-on-Write.
language_mode = dataset['language'].mode().iloc[0]
dataset['language'] = dataset['language'].fillna(language_mode)
dataset.language.isnull().sum()

In [None]:
# Drop rows where plot_keywords is missing, then confirm no nulls remain.
dataset = dataset.dropna(axis = 0, subset = ['plot_keywords'])
dataset.plot_keywords.isnull().sum()

In [None]:
# Distinct content_rating values (including NaN, if any are present).
dataset.content_rating.unique()

In [None]:
# Treat a missing content rating as 'Not Rated'.
# Assigned back explicitly; in-place fillna on `dataset.content_rating` is
# chained assignment and does not update `dataset` under Copy-on-Write.
dataset['content_rating'] = dataset['content_rating'].fillna('Not Rated')

In [None]:
# Distinct aspect_ratio values (including NaN, if any are present).
dataset.aspect_ratio.unique()

In [None]:
# Impute missing aspect ratios with the most frequent value.
# Assigned back explicitly; in-place fillna on `dataset.aspect_ratio` is
# chained assignment and does not update `dataset` under Copy-on-Write.
aspect_ratio_mode = dataset['aspect_ratio'].mode().iloc[0]
dataset['aspect_ratio'] = dataset['aspect_ratio'].fillna(aspect_ratio_mode)

In [None]:
# Remaining missing values per column after the imputation / row drops above.
dataset.isnull().sum()

In [None]:
# Distinct color values and how many there are.
dataset.color.unique(), dataset.color.nunique()

In [None]:
# Encode color as 1 / 0. The key ' Black and White' carries a leading space,
# presumably to match the raw value in the CSV; any value not present in the
# dict is mapped to NaN. NOTE(review): verify against the unique() output above.
dataset['color'] = dataset.color.map({'Color' : 1 , ' Black and White' : 0})

In [None]:
# Distinct director names and how many there are.
dataset.director_name.unique(), dataset.director_name.nunique()

In [None]:
# Frequency table for directors: one row per director_name with the number of
# movies in `dataset` credited to that director.
# rename_axis + reset_index(name=...) yields the columns
# ['director_name', 'director_name_value_counts'] on every pandas version.
# Renaming 'index' / 'director_name' after a bare reset_index() only works on
# pandas < 2.0; from 2.0 the counts column is called 'count' and the rename
# would clobber the key column, breaking the later merge.
director_name_value_counts = dataset.director_name.value_counts()
director_name_value_counts = (
    director_name_value_counts
    .rename_axis('director_name')
    .reset_index(name='director_name_value_counts')
)

In [None]:
# Left-join the per-director counts onto the main frame, keyed on director_name.
dataset = pd.merge(dataset, director_name_value_counts,left_on = 'director_name', right_on = 'director_name', how = 'left')

In [None]:
# The raw name column is dropped once the count column has been attached.
dataset = dataset.drop(columns = 'director_name')

In [None]:
# Frequency table for actor_2_name: one row per name with its number of
# occurrences in `dataset`.
# rename_axis + reset_index(name=...) yields the columns
# ['actor_2_name', 'actor_2_name_value_counts'] on every pandas version; the
# 'index'-based rename only works on pandas < 2.0 and would otherwise rename
# the key column away, breaking the later merge.
actor_2_name_value_counts = dataset.actor_2_name.value_counts()
actor_2_name_value_counts = (
    actor_2_name_value_counts
    .rename_axis('actor_2_name')
    .reset_index(name='actor_2_name_value_counts')
)

In [None]:
# Left-join the actor_2_name counts onto the main frame, keyed on actor_2_name.
dataset = pd.merge(dataset, actor_2_name_value_counts,left_on = 'actor_2_name', right_on = 'actor_2_name', how = 'left')

In [None]:
# The raw name column is dropped once the count column has been attached.
dataset = dataset.drop(columns = 'actor_2_name')

In [None]:
# Distinct genre strings and how many there are.
dataset.genres.unique(), dataset.genres.nunique()

In [None]:
# genres is a '|'-separated string; keep the first entry as the main genre.
dataset['main_genre'] = dataset.genres.str.split('|').str[0]

In [None]:
# Replace the main_genre labels with integer codes.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
dataset['main_genre'] = le.fit_transform(dataset.main_genre)

In [None]:
# Occurrence count of each full genres string (a Series; it is turned into a
# two-column frame in the next cell).
genres_value_counts = dataset.genres.value_counts()

In [None]:
# Turn the genres count Series into a frame with the columns
# ['genres', 'genres_value_counts'] so it can be merged on 'genres'.
# rename_axis + reset_index(name=...) is independent of how value_counts()
# names its index/values; the 'index'-based rename only works on pandas < 2.0
# and would otherwise rename the key column away.
genres_value_counts = genres_value_counts.rename_axis('genres').reset_index(name='genres_value_counts')

In [None]:
# Left-join the per-genres-string counts onto the main frame, keyed on genres.
dataset = pd.merge(dataset, genres_value_counts,left_on = 'genres', right_on = 'genres', how = 'left')

In [None]:
# The raw genres string is dropped; main_genre and genres_value_counts remain.
dataset = dataset.drop(columns = 'genres')

In [None]:
# Distinct actor_1_name values and how many there are.
dataset.actor_1_name.unique(), dataset.actor_1_name.nunique()

In [None]:
# Occurrence count of each actor_1_name (a Series; converted to a frame in the
# next cell).
actor_1_name_value_counts = dataset.actor_1_name.value_counts()

In [None]:
# Turn the actor_1_name count Series into a frame with the columns
# ['actor_1_name', 'actor_1_name_value_counts'] so it can be merged on the name.
# rename_axis + reset_index(name=...) is independent of how value_counts()
# names its index/values; the 'index'-based rename only works on pandas < 2.0
# and would otherwise rename the key column away.
actor_1_name_value_counts = actor_1_name_value_counts.rename_axis('actor_1_name').reset_index(name='actor_1_name_value_counts')

In [None]:
# Left-join the actor_1_name counts onto the main frame, keyed on actor_1_name.
dataset = pd.merge(dataset, actor_1_name_value_counts,left_on = 'actor_1_name', right_on = 'actor_1_name', how = 'left')

In [None]:
# The raw name column is dropped once the count column has been attached.
dataset = dataset.drop(columns = 'actor_1_name')

In [None]:
# movie_title is dropped without deriving any feature from it.
dataset = dataset.drop(columns = 'movie_title')

In [None]:
# Occurrence count of each actor_3_name (a Series; converted to a frame in the
# next cell).
actor_3_name_value_counts = dataset.actor_3_name.value_counts()

In [None]:
# Turn the actor_3_name count Series into a frame with the columns
# ['actor_3_name', 'actor_3_name_value_counts'] so it can be merged on the name.
# rename_axis + reset_index(name=...) is independent of how value_counts()
# names its index/values; the 'index'-based rename only works on pandas < 2.0
# and would otherwise rename the key column away.
actor_3_name_value_counts = actor_3_name_value_counts.rename_axis('actor_3_name').reset_index(name='actor_3_name_value_counts')

In [None]:
# Left-join the actor_3_name counts onto the main frame, keyed on actor_3_name.
dataset= pd.merge(dataset, actor_3_name_value_counts,left_on = 'actor_3_name', right_on = 'actor_3_name', how = 'left')

In [None]:
# The raw name column is dropped once the count column has been attached.
dataset = dataset.drop(columns = 'actor_3_name')

In [None]:
# plot_keywords is a '|'-separated string; keep the first keyword only.
dataset['main_plot_keyword'] = dataset.plot_keywords.str.split('|').str[0]


In [None]:
# The full keyword string is dropped; only main_plot_keyword is carried forward.
dataset = dataset.drop(columns = 'plot_keywords')

In [None]:
# Frequency table for main_plot_keyword: one row per keyword with its number
# of occurrences in `dataset`.
# rename_axis + reset_index(name=...) yields the columns
# ['main_plot_keyword', 'main_plot_keyword_value_counts'] on every pandas
# version; the 'index'-based rename only works on pandas < 2.0 and would
# otherwise rename the key column away, breaking the later merge.
main_plot_keyword_value_counts = dataset.main_plot_keyword.value_counts()
main_plot_keyword_value_counts = (
    main_plot_keyword_value_counts
    .rename_axis('main_plot_keyword')
    .reset_index(name='main_plot_keyword_value_counts')
)

In [None]:
# Left-join the keyword counts onto the main frame, then drop the raw keyword
# column so only the count feature remains.
dataset = pd.merge(dataset, main_plot_keyword_value_counts, left_on = 'main_plot_keyword', right_on = 'main_plot_keyword', how = 'left')
dataset = dataset.drop(columns = 'main_plot_keyword')

In [None]:
# movie_imdb_link is dropped without deriving any feature from it.
dataset = dataset.drop(columns = 'movie_imdb_link')


In [None]:
# Replace the language labels with integer codes (separate encoder: le1).
from sklearn.preprocessing import LabelEncoder
le1 = LabelEncoder()
dataset['language'] = le1.fit_transform(dataset.language)

In [None]:
# Replace the country labels with integer codes (separate encoder: le2).
# NOTE(review): no fill/drop for missing `country` is visible above - confirm
# the column has no NaN at this point.
from sklearn.preprocessing import LabelEncoder
le2 = LabelEncoder()
dataset['country'] = le2.fit_transform(dataset.country)

In [None]:
# Replace the content_rating labels with integer codes (separate encoder: le3).
from sklearn.preprocessing import LabelEncoder
le3 = LabelEncoder()
dataset['content_rating'] = le3.fit_transform(dataset.content_rating)

In [None]:
# First rows of the transformed frame, transposed so each column is a row.
dataset.head().T

In [None]:
# Work on copies so `dataset` itself stays untouched as a reference.
datasetR = dataset.copy() # copy used for the regression model
datasetC = dataset.copy() # copy used for the classification model

In [None]:
# pop() removes imdb_score from datasetR, so X (the same object as datasetR)
# holds only the predictors. 80/20 split with a fixed seed.
from sklearn.model_selection import train_test_split
y = datasetR.pop('imdb_score')
X = datasetR
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 42)

In [None]:
# Fit the min-max scaler on the training predictors only and rebuild a
# DataFrame with the original column names and row index.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train.values), columns=X_train.columns, index=X_train.index)

In [None]:
# Apply the already-fitted scaler to the test predictors (transform, not
# fit_transform), reusing X_train's column names.
X_test = pd.DataFrame(scaler.transform(X_test.values), columns = X_train.columns, index = X_test.index)

In [None]:
def correlation(dataset, threshold):
    """Drop highly correlated columns from ``dataset`` in place.

    Columns are scanned in the order of ``dataset.corr()``. A column is
    removed when its correlation with any earlier column that has not itself
    been removed is greater than or equal to ``threshold``. The comparison is
    on the signed coefficient, so strongly negative correlations are not
    treated as redundant.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to prune; it is modified in place.
    threshold : float
        Correlation value at or above which the later column is dropped.

    Returns
    -------
    set
        Names of the dropped columns, so the same columns can be removed
        from other frames (e.g. a test split).
    """
    col_corr = set()  # names of the columns selected for removal
    corr_matrix = dataset.corr()
    columns = corr_matrix.columns
    for i, colname in enumerate(columns):
        for j in range(i):
            # Compare only against earlier columns that are still kept.
            if corr_matrix.iloc[i, j] >= threshold and columns[j] not in col_corr:
                col_corr.add(colname)
                break  # one hit is enough to drop this column
    # Drop once after the scan; the remaining columns keep their order.
    dataset.drop(columns=[name for name in columns if name in col_corr], inplace=True)
    return col_corr
# Prune X_train in place: drop any column whose correlation with an earlier
# kept column is >= 0.90. X_test is not modified here.
correlation(X_train,0.90)


In [None]:
#importing the required libraries
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

In [None]:
# Run recursive feature elimination, keeping 15 predictors.
lm = LinearRegression()
lm.fit(X_train, y_train)

# n_features_to_select must be passed by keyword: current scikit-learn
# releases reject it as a positional argument (TypeError).
rfe = RFE(lm, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train)

In [None]:
# Names of the columns RFE kept (rfe.support_ is used as a mask over the columns).
col_rfe = X_train.columns[rfe.support_]
col_rfe

In [None]:
# Training predictors restricted to the RFE-selected variables.
X_train_rfe = X_train[col_rfe]

In [None]:
# statsmodels OLS does not add an intercept by itself, so add a constant column.
import statsmodels.api as sm
X_train_rfe_constant = sm.add_constant(X_train_rfe)

In [None]:
# Fit the OLS model; note this rebinds `lm`, previously the sklearn estimator.
lm = sm.OLS(y_train,X_train_rfe_constant).fit()   # Running the linear model
# Print the regression summary of the fitted model.
print(lm.summary())

In [None]:
# Predict on the test split with the fitted OLS model.
X_test_rfe = X_test[col_rfe]
# has_constant='add' forces the intercept column to be added. With the default
# ('skip'), add_constant silently adds nothing when any selected column happens
# to be constant within the test split, and predict() then fails because the
# design matrix no longer matches the one the model was fitted on.
X_test_rfe_constant = sm.add_constant(X_test_rfe, has_constant='add')
y_pred_linear = lm.predict(X_test_rfe_constant)
y_pred_linear.values

In [None]:
# Range of the linear-model predictions.
y_pred_linear.min(), y_pred_linear.max()

In [None]:
# Test-set MSE of the linear model.
# NOTE(review): sklearn's signature is (y_true, y_pred); the arguments are
# swapped here, which does not change the value for MSE.
from sklearn.metrics import mean_squared_error
mean_squared_error(y_pred_linear, y_test)

In [None]:
# Three SVR variants are constructed; only the RBF one is fitted and used for
# prediction in this cell.
from sklearn.svm import SVR
svr_rbf = SVR(kernel='rbf', gamma=0.1)
svr_lin = SVR(kernel='linear', gamma='auto')
svr_poly = SVR(kernel='poly', gamma='auto', degree=3)
svr_rbf.fit(X_train_rfe, y_train)
y_pred_svm_rbf = svr_rbf.predict(X_test_rfe)
y_pred_svm_rbf

In [None]:
# Range of the RBF-SVR predictions.
y_pred_svm_rbf.min(), y_pred_svm_rbf.max()

In [None]:
# Test-set MSE of the RBF-SVR model.
mean_squared_error(y_pred_svm_rbf, y_test)

In [None]:
# Same computation as the previous cell (duplicate).
mean_squared_error(y_pred_svm_rbf, y_test)

In [None]:
svr_lin.fit(X_train_rfe, y_train)
y_pred_svm_lin = svr_lin.predict(X_test_rfe)