In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

In [2]:
# Location of the raw movie metadata CSV.  Set the MOVIE_METADATA_CSV
# environment variable to run the notebook on another machine; the original
# author's path is kept as the fallback so existing setups keep working.
DATA_PATH = os.environ.get(
    "MOVIE_METADATA_CSV",
    "C:/Users/Sophie/Desktop/movie_metadata.csv",
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Encode the colour flag numerically: 'Color' -> 0, ' Black and White' -> 1.
# The leading space in ' Black and White' is kept exactly as in the original
# mapping (presumably it matches the raw CSV value -- verify against the data).
#
# The result is assigned back to the column instead of calling
# `df.color.replace(..., inplace=True)`: an in-place replace on a Series
# obtained through attribute access is a chained mutation that does not
# propagate to `df` under pandas Copy-on-Write.
color_codes = {'Color': 0, ' Black and White': 1}
df['color'] = df['color'].replace(color_codes).astype(float)

In [4]:
# Keep only the numeric columns, discard every row that still has a missing
# value, and store everything as float.
df = (
    df.select_dtypes(include=['float64', 'int64', 'int'])
      .dropna()
      .astype(float)
)

# Feature Selection

In [5]:
# Summarise the remaining columns with their non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3800 entries, 0 to 5042
Data columns (total 17 columns):
color                        3800 non-null float64
num_critic_for_reviews       3800 non-null float64
duration                     3800 non-null float64
director_facebook_likes      3800 non-null float64
actor_3_facebook_likes       3800 non-null float64
actor_1_facebook_likes       3800 non-null float64
gross                        3800 non-null float64
num_voted_users              3800 non-null float64
cast_total_facebook_likes    3800 non-null float64
facenumber_in_poster         3800 non-null float64
num_user_for_reviews         3800 non-null float64
budget                       3800 non-null float64
title_year                   3800 non-null float64
actor_2_facebook_likes       3800 non-null float64
imdb_score                   3800 non-null float64
aspect_ratio                 3800 non-null float64
movie_facebook_likes         3800 non-null float64
dtypes: float64(17)
memory usage: 534.4 KB

We need to remove features that should not be fed to the algorithm, as they could degrade the model’s performance. The aspect ratio and the number of faces in a movie’s poster are excluded from the analysis because they are irrelevant to a movie’s rating.

In [6]:
# Remove the two features judged irrelevant to the rating.
# Reassigning the result (rather than inplace=True) combined with
# errors='ignore' keeps this cell re-runnable: executing it a second time no
# longer raises KeyError for columns that are already gone.
df = df.drop(columns=['aspect_ratio', 'facenumber_in_poster'], errors='ignore')

# Confirm which columns are left.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3800 entries, 0 to 5042
Data columns (total 15 columns):
color                        3800 non-null float64
num_critic_for_reviews       3800 non-null float64
duration                     3800 non-null float64
director_facebook_likes      3800 non-null float64
actor_3_facebook_likes       3800 non-null float64
actor_1_facebook_likes       3800 non-null float64
gross                        3800 non-null float64
num_voted_users              3800 non-null float64
cast_total_facebook_likes    3800 non-null float64
num_user_for_reviews         3800 non-null float64
budget                       3800 non-null float64
title_year                   3800 non-null float64
actor_2_facebook_likes       3800 non-null float64
imdb_score                   3800 non-null float64
movie_facebook_likes         3800 non-null float64
dtypes: float64(15)
memory usage: 475.0 KB


As seen in the heatmap in our exploratory data analysis, some of the features in the dataset are correlated. The correlated features must be converted into linearly uncorrelated variables using Principal Component Analysis (PCA). Moreover, training on a dataset with a large number of features like ours can also result in a model with a high error rate, as it is more prone to overfitting. With PCA, we are able to reduce the dimensionality of the data while losing only a small amount of information.

In [7]:
# Split the dataset into the target vector y (the 'imdb_score' column) and the
# feature matrix X (every other column), both as NumPy arrays.

y = df['imdb_score'].to_numpy()
X = df.drop(columns='imdb_score').to_numpy()

In [8]:
# Feature scaling
# Fit a StandardScaler (default settings) on the feature matrix, then replace
# X with its standardised version.

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X = sc.fit(X).transform(X)

In [9]:
from sklearn.decomposition import PCA

# A fractional n_components tells PCA to keep as many components as are needed
# to retain that share of the variance -- here 95%.
pca = PCA(n_components=0.95)

# Fit the PCA on the feature matrix and project it onto the kept components.
X_pca = pca.fit_transform(X)

In [10]:
# Results: compare the feature count before and after the PCA projection.

n_original, n_reduced = X.shape[1], X_pca.shape[1]
print('Original number of features:', n_original)
print('Reduced number of features:', n_reduced)

Original number of features: 14
Reduced number of features: 11
