In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Understand the dataset**

In [None]:
# Load the IMDB 5000 movie metadata and preview the first ten rows.
movie_csv_path = "/kaggle/input/imdb-5000-movie-dataset/movie_metadata.csv"
df = pd.read_csv(movie_csv_path)
df.head(n=10)

In [None]:
# Non-null entries per column; differing counts reveal which columns have gaps.
df.count()

In [None]:
# Count missing values per column once, then report the table and its worst case.
missing_per_column = df.isna().sum()
print(missing_per_column)
print(missing_per_column.max())

# Data Filtering and Identifying correlation

Let us remove some string features such as `director_name`, `movie_title`, the actor names, etc., as they are correlated with the Facebook likes of the director, the movie, and the actors. Keeping them would not do much to improve the model.

In [None]:
# Name and category text columns that are excluded from the feature set.
text_columns = [
    "director_name", "actor_2_name", "genres", "movie_title", "actor_1_name",
    "actor_3_name", "language", "country", "content_rating",
]
df.drop(columns=text_columns, inplace=True)

In [None]:
# Keyword and URL columns are also excluded from the feature set.
link_columns = ["plot_keywords", "movie_imdb_link"]
df.drop(columns=link_columns, inplace=True)

Let us also remove `cast_total_facebook_likes`, as it is the sum of `actor_1_facebook_likes`, `actor_2_facebook_likes` and `actor_3_facebook_likes`; it is correlated with these and hence redundant.

In [None]:
# cast_total_facebook_likes is treated as redundant with the per-actor like
# counts, so remove it before modelling. errors="ignore" keeps the cell
# re-runnable once the column is already gone.
df.drop(columns=["cast_total_facebook_likes"], inplace=True, errors="ignore")
df

In [None]:
# Frequency of each distinct value in the `color` column (missing values are not counted).
df["color"].value_counts()

In [None]:
# Missing values per remaining column, checked after the column drops above.
df.isna().sum()

Drop rows with no color value specified

In [None]:
# Remove, in place, every row whose `color` entry is missing.
df.dropna(axis="index", subset=["color"], inplace=True)

In [None]:
# Display the frame after the rows without a `color` value were removed.
df

In [None]:
# Missing values per column that remain after filtering on `color`.
df.isna().sum()

We drop every row that contains a `NaN`, because imputing these values would produce a poor dataset: the entries in a column follow no order or pattern and depend solely on the other elements of the same row.

In [None]:
# Remove, in place, every row that has at least one missing value.
df.dropna(axis="index", how="any", inplace=True)

In [None]:
# Display the frame after all rows containing NaN were dropped.
df

In [None]:
# Sanity check: every per-column missing count should now be zero.
df.isna().sum()

Now we have no NaN values

# **Encoding categorical data and Data Standardization**

We will first separate the target from the data

In [None]:
# The IMDB score is the regression target; every other column is a feature.
target_column = "imdb_score"
target = df[target_column]
data = df.drop(columns=[target_column])
target.head(n=10)

In [None]:
# Preview the first ten rows of the feature frame (target column removed).
data.head(10)

Now we encode the categorical color feature to numeric

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Map each distinct `color` label to an integer code and store it as `new_color`.
labelencoder = LabelEncoder()
color_codes = labelencoder.fit_transform(data["color"])
data["new_color"] = color_codes
data["new_color"]

In [None]:
# The text `color` column is replaced by `new_color`, so remove the original in place.
data.drop(columns=["color"], inplace=True)

In [None]:
# Display the feature frame after encoding `color` as `new_color`.
data

Let us standardize all the features (columns), as fields with large values such as `gross` may otherwise skew the model towards them.

In [None]:
from sklearn.preprocessing import scale

In [None]:
# Standardize every feature column of `data`; the result is a NumPy array.
# NOTE(review): the scaling statistics are computed on all rows, before the
# train/test split done later in the notebook, so test rows influence the
# scaling. Consider fitting a scaler on the training split only.
scaled_data = scale(data)
scaled_data

# **Modeling and Evaluation**

We will now split the data into training and test sets

In [None]:


from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing. A fixed random_state makes the split,
# and therefore every score reported below, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_data, target, test_size=0.25, random_state=42
)

Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline: create the estimator and fit it on the training split.
model = LinearRegression().fit(x_train, y_train)
model

In [None]:
#predicting and evaluating the model

# Predict on the held-out rows and line the predictions up against the true scores.
y = model.predict(x_test)
predictions = y_test.to_frame(name="Actual").assign(Predicted=y)
predictions

In [None]:
# Evaluating the model

from sklearn import metrics

# Compute each error once; the RMSE is derived from the MSE already computed.
mae = metrics.mean_absolute_error(y_test, y)
mse = metrics.mean_squared_error(y_test, y)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

We rescale the actual and predicted scores to the (0,1) range to get a better understanding of the size of the error

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Fit the scaler on the actual scores only and apply that same mapping to the
# predictions. Fitting on both columns at once would rescale each column by its
# own min/max, so the two columns would no longer share units and any error
# computed between them would be meaningless.
actual_values = predictions[["Actual"]].to_numpy()
predicted_values = predictions[["Predicted"]].to_numpy()
mmscaler = MinMaxScaler()
mmscaler.fit(actual_values)
# Column 0 = scaled actual, column 1 = scaled predicted. A prediction outside
# the range of the actual scores maps slightly outside [0, 1].
scaled_targets = np.hstack(
    [mmscaler.transform(actual_values), mmscaler.transform(predicted_values)]
)
scaled_targets

In [None]:
# Wrap the scaled two-column array in a labelled frame and preview ten rows.
scaled_columns = ["Actual", "Predicted"]
scaled_targets = pd.DataFrame(data=scaled_targets, columns=scaled_columns)
scaled_targets.head(n=10)

In [None]:

# Same three error measures as before, now on the rescaled columns.
scaled_actual = scaled_targets["Actual"]
scaled_predicted = scaled_targets["Predicted"]
scaled_mae = metrics.mean_absolute_error(scaled_actual, scaled_predicted)
scaled_mse = metrics.mean_squared_error(scaled_actual, scaled_predicted)
print('Mean Absolute Error:', scaled_mae)
print('Mean Squared Error:', scaled_mse)
print('Root Mean Squared Error:', np.sqrt(scaled_mse))

In [None]:
# Score of the linear model on the held-out split (R^2 for scikit-learn regressors).
model.score(x_test,y_test)

Let us try to evaluate it with other models to choose the best model.

In [None]:
from sklearn import linear_model

### Ridge Regression

In [None]:
# Regularisation strengths 0.1 through 0.9; RidgeCV picks the best by cross-validation.
alphas = [step / 10 for step in range(1, 10)]
reg_cv = linear_model.RidgeCV(alphas=alphas).fit(x_train, y_train)
reg_cv

In [None]:
# Report the regularisation strength that RidgeCV selected from `alphas`.
print("Chosen alpha:",reg_cv.alpha_)

In [None]:
# Ridge predictions on the held-out rows, shown next to the true scores.
y = reg_cv.predict(x_test)
predictions = y_test.to_frame(name="Actual").assign(Predicted=y)
predictions

In [None]:
# Score of the ridge model on the held-out split (R^2 for scikit-learn regressors).
reg_cv.score(x_test,y_test)

Ridge regression is giving almost the same score as Linear Regression

### Lasso Regression

In [None]:
# First attempt: cross-validated Lasso over the same alphas with default iteration limits.
lasso = linear_model.LassoCV(alphas=alphas).fit(x_train, y_train)
lasso

The data is not fitting well and the model is failing to converge, so we increase the number of iterations.

In [None]:
# Refit the Lasso with a larger iteration budget so the solver has room to converge.
lasso = linear_model.LassoCV(alphas=alphas, max_iter=10000).fit(x_train, y_train)
lasso

In [None]:
# Lasso predictions on the held-out rows, shown next to the true scores.
y = lasso.predict(x_test)
predictions = y_test.to_frame(name="Actual").assign(Predicted=y)
predictions

In [None]:
# Score of the Lasso model on the held-out split (R^2 for scikit-learn regressors).
lasso.score(x_test,y_test)

This score is lower than those of Ridge Regression and Linear Regression

### Bayesian Regression

In [None]:
# Fit a Bayesian ridge regressor on the training split and score it on the held-out split.
bayesian = linear_model.BayesianRidge().fit(x_train, y_train)
bayesian_score = bayesian.score(x_test, y_test)
bayesian_score

In [None]:
# Predict with the Bayesian ridge model fitted in this section. The previous
# version called lasso.predict here, so the table showed Lasso output under the
# Bayesian heading.
y = bayesian.predict(x_test)

predictions = pd.DataFrame({"Actual":y_test, "Predicted":y})
predictions

This model also performs similarly to Linear Regression and Ridge Regression

Let us try Decision Tree Regression

### Decision Tree Regression


In [None]:
from sklearn import tree

# Depth-limited regression tree. random_state is fixed because the tree
# builder shuffles candidate features, so an unseeded tree can differ
# between runs.
dtree = tree.DecisionTreeRegressor(max_depth=7, random_state=42)
dtree.fit(x_train, y_train)

# Tree predictions on the held-out rows, shown next to the true scores.
y = dtree.predict(x_test)

predictions = pd.DataFrame({"Actual": y_test, "Predicted": y})
predictions

In [None]:
# Score of the decision tree on the held-out split (R^2 for scikit-learn regressors).
dtree.score(x_test,y_test)

Let's apply decision tree for depths 1-15 and check which depth gives the best result

In [None]:
# Candidate tree depths to compare (1 through 15).
l = list(range(1, 16))

def evaluateModel(d, random_state=42):
    """Fit a depth-limited decision tree and return its score on the test split.

    Parameters
    ----------
    d : int
        Value passed as ``max_depth`` to ``DecisionTreeRegressor``.
    random_state : int, optional
        Seed forwarded to the tree so that repeated runs give the same score.

    Returns
    -------
    float
        ``dtree.score(x_test, y_test)`` for the tree fitted on the training split.
    """
    dtree = tree.DecisionTreeRegressor(max_depth=d, random_state=random_state)
    dtree.fit(x_train, y_train)
    return dtree.score(x_test, y_test)

# One score per depth, in the same order as `l`.
scores = [evaluateModel(d) for d in l]
scores

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(np.array(l),np.array(scores))
plt.show()

We see that the score increases up to a depth of 8, where it is maximum, and then decreases. We found that `max_depth = 4 to 7` gives the best model for Decision Tree Regression, with a score (R², not classification accuracy) of up to 35-39%, which is good enough