In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it
for root, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Loading the dataset

In [None]:
# pd.read_csv already returns a DataFrame, so wrapping it in pd.DataFrame(...) is redundant.
df = pd.read_csv('/kaggle/input/movie-metadatacsv/movie_metadata.csv')
df.head()

In [None]:
# Size of the loaded frame as a (rows, columns) tuple
df.shape

In [None]:
# dtype of every column; the object columns are handled separately from the numeric ones later on
df.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Flag every column that holds at least one missing value
df.isna().any()

In [None]:
# Share of missing entries in each column, expressed as a percentage of all rows
df.isnull().sum().div(len(df)).mul(100)

The percentage of NaN values is very low, so we can replace the NaNs in the numeric columns with the mean of their column.

In [None]:
# Impute missing numeric values with the mean of their column.
# numeric_only=True is needed on pandas >= 2.0: DataFrame.mean() no longer skips
# object columns silently and raises TypeError on this mixed-dtype frame otherwise.
# Object (string) columns have no entry in the resulting Series, so their NaNs are left as-is.
df = df.fillna(value=df.mean(numeric_only=True))
df.shape

Now we'll find the correlation between the attributes and the target variable (IMDB score) using a heatmap.

If we build the heatmap from the whole dataset, it only covers the columns with dtype int or float; columns of dtype object are excluded from the correlation.

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
# numeric_only=True restricts corr() to int/float columns; on pandas >= 2.0 the
# default no longer drops object columns and the call fails on this frame without it.
fig, ax = plt.subplots(figsize=(20,20))
# Draw on the axes created above instead of relying on the implicit "current axes".
sns.heatmap(df.corr(numeric_only=True), annot = True, ax=ax)

In [None]:
# Simple function to get the name of most correlated attributes
def get_correlated_col(cor_data, threshold):
    """Keep the entries of a correlation Series whose magnitude exceeds a threshold.

    Parameters
    ----------
    cor_data : pandas.Series
        Correlation values indexed by feature name, e.g. one column of a
        correlation matrix.
    threshold : float
        Entries are kept when ``abs(value) > threshold`` (strict comparison).
        NaN entries never pass the comparison and are dropped.

    Returns
    -------
    pandas.DataFrame
        One row per retained feature, indexed by feature name, with a single
        column named ``'corr value'`` holding the signed correlation.
    """
    # Vectorised filter: compares every entry at once and also copes with
    # duplicate index labels, which a label-by-label lookup cannot handle.
    strong = cor_data[cor_data.abs() > threshold]
    return strong.to_frame(name='corr value')
    


In [None]:
# Numeric columns whose absolute correlation with the target exceeds 0.2.
# numeric_only=True keeps corr() working on pandas >= 2.0, where object columns
# are no longer dropped automatically.
# imdb_score correlates perfectly with itself, so the target is part of the result too.
top_correlated_value = get_correlated_col(df.corr(numeric_only=True)['imdb_score'], 0.2)
top_correlated_value

In [None]:
# Keep only the selected columns. The explicit .copy() makes final_df an independent
# frame, so adding columns to it later neither raises SettingWithCopyWarning nor
# risks writing through to df.
final_df = df[top_correlated_value.index].copy()
final_df

Now let's work with the columns of dtype object.

In [None]:
# Pull out the object (string) columns. The explicit .copy() guarantees an independent
# frame on every pandas version, so overwriting its columns later cannot trigger
# SettingWithCopyWarning or modify df.
cat_df = df.select_dtypes(include=['object']).copy()
cat_df.head()

In [None]:
# Turn every object column into integer codes so that correlations can be computed
from sklearn.preprocessing import LabelEncoder 

cat_col = cat_df.columns
for col_name in cat_col:
    # Cast to str first so missing entries become a label of their own
    string_values = cat_df[col_name].astype('str')
    enc = LabelEncoder()
    cat_df[col_name] = enc.fit_transform(string_values)

In [None]:
# Preview the categorical frame after encoding
cat_df.head()

In [None]:
# Attach the target so the correlation matrix of cat_df gets an imdb_score row/column.
# The assignment aligns on the row index; cat_df was derived from df, so the rows match.
cat_df['imdb_score'] = df['imdb_score'] 

In [None]:
# Preview again to confirm the imdb_score column was added
cat_df.head()

In [None]:
# Annotated heatmap of the correlations among the encoded columns and imdb_score
cat_corr = cat_df.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(cat_corr, annot=True, ax=ax)

In [None]:
# Add the chosen encoded categorical columns to final_df.
# NOTE(review): these four were presumably picked from the heatmap of cat_df — confirm the choice.
encoded_features = ['color', 'director_name', 'genres', 'language']
# assign() builds a new frame instead of writing into a slice of df, which avoids
# SettingWithCopyWarning and gives the same result when the cell is re-run.
final_df = final_df.assign(**{col: cat_df[col] for col in encoded_features})


In [None]:
# Preview final_df with the encoded categorical columns included
final_df.head()

In [None]:
# Separate the target (imdb_score) from the predictor columns
y = final_df['imdb_score']
X = final_df.drop(columns=['imdb_score'])

In [None]:
# Preview the feature matrix (final_df without the imdb_score column)
X.head()

In [None]:
# (rows, number of feature columns) of the feature matrix
X.shape

In [None]:
#Now we'll transform features by scaling each feature to a given range (0 to 1)

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# fit_transform returns a plain NumPy array, so the frame is rebuilt with the original
# column names AND the original row index. Keeping the index guarantees that X still
# lines up with y even if rows are ever filtered out upstream.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so the test
# rows influence the scaling (mild data leakage). Consider fitting on the training split only.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
X.head()

In [None]:
#Now let's split the data into train/test sets with a 0.7/0.3 ratio
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle deterministic, so the split and every metric
# computed from it are identical on each "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

In [None]:
# Train an ordinary least-squares regression on the training split
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and fitting can be chained
LinearReg = LinearRegression().fit(X_train, y_train)
LinearReg

In [None]:
# Predict on the held-out rows and place the predictions next to the true scores
y_pred = LinearReg.predict(X_test)
pred_df = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
pred_df.head()

In [None]:
# Report the error of the predictions on the test split

from sklearn import metrics

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
# RMSE is the square root of the MSE computed above
print('Root Mean Squared Error:', np.sqrt(mse))

Measured against the 10-point IMDB score scale, the mean error in our predictions is less than 15%, which is acceptable.