In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# for Q-Q plots
import scipy.stats as stats
# missing-data imputers from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)

import plotly.express as px

# to split the datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


In [16]:
# Load the TV-shows table from the CSV file in the working directory
# and preview its first rows.
data = pd.read_csv("tv_shows.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type
0,0,1,Breaking Bad,2008,18+,9.4/10,100/100,1,0,0,0,1
1,1,2,Stranger Things,2016,16+,8.7/10,96/100,1,0,0,0,1
2,2,3,Attack on Titan,2013,18+,9.0/10,95/100,1,1,0,0,1
3,3,4,Better Call Saul,2015,18+,8.8/10,94/100,1,0,0,0,1
4,4,5,Dark,2017,16+,8.8/10,93/100,1,0,0,0,1


In [17]:
# Replace the space in the "Rotten Tomatoes" header with an underscore.
# The result is reassigned instead of mutating with inplace=True, so the step
# reads like every other `data = ...` transformation in this notebook.
data = data.rename(columns={"Rotten Tomatoes": "Rotten_Tomatoes"})

In [18]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5368 entries, 0 to 5367
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       5368 non-null   int64 
 1   ID               5368 non-null   int64 
 2   Title            5368 non-null   object
 3   Year             5368 non-null   int64 
 4   Age              3241 non-null   object
 5   IMDb             4406 non-null   object
 6   Rotten_Tomatoes  5368 non-null   object
 7   Netflix          5368 non-null   int64 
 8   Hulu             5368 non-null   int64 
 9   Prime Video      5368 non-null   int64 
 10  Disney+          5368 non-null   int64 
 11  Type             5368 non-null   int64 
dtypes: int64(8), object(4)
memory usage: 503.4+ KB


In [19]:
# Preview the first rows again to confirm the renamed column header.
data.head()

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten_Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type
0,0,1,Breaking Bad,2008,18+,9.4/10,100/100,1,0,0,0,1
1,1,2,Stranger Things,2016,16+,8.7/10,96/100,1,0,0,0,1
2,2,3,Attack on Titan,2013,18+,9.0/10,95/100,1,1,0,0,1
3,3,4,Better Call Saul,2015,18+,8.8/10,94/100,1,0,0,0,1
4,4,5,Dark,2017,16+,8.8/10,93/100,1,0,0,0,1


In [20]:
# Flag, per column, whether at least one value is missing.
data.isna().any()

Unnamed: 0         False
ID                 False
Title              False
Year               False
Age                 True
IMDb                True
Rotten_Tomatoes    False
Netflix            False
Hulu               False
Prime Video        False
Disney+            False
Type               False
dtype: bool

In [21]:
# Print the dtype of every column before any cleaning is applied.
print(data.dtypes)

Unnamed: 0          int64
ID                  int64
Title              object
Year                int64
Age                object
IMDb               object
Rotten_Tomatoes    object
Netflix             int64
Hulu                int64
Prime Video         int64
Disney+             int64
Type                int64
dtype: object


In [22]:
# Split the age rating on "+": the text before the sign stays in Age and
# whatever follows it lands in the temporary restAge column.
age_parts = data["Age"].str.split("+", expand=True)
data[["Age", "restAge"]] = age_parts
data.head()

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten_Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type,restAge
0,0,1,Breaking Bad,2008,18,9.4/10,100/100,1,0,0,0,1,
1,1,2,Stranger Things,2016,16,8.7/10,96/100,1,0,0,0,1,
2,2,3,Attack on Titan,2013,18,9.0/10,95/100,1,1,0,0,1,
3,3,4,Better Call Saul,2015,18,8.8/10,94/100,1,0,0,0,1,
4,4,5,Dark,2017,16,8.8/10,93/100,1,0,0,0,1,


In [23]:
# Split the "score/scale" IMDb string on "/": the score stays in IMDb and
# the scale goes to the temporary restIMDb column.
imdb_parts = data["IMDb"].str.split("/", expand=True)
data[["IMDb", "restIMDb"]] = imdb_parts
data.head()

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten_Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type,restAge,restIMDb
0,0,1,Breaking Bad,2008,18,9.4,100/100,1,0,0,0,1,,10
1,1,2,Stranger Things,2016,16,8.7,96/100,1,0,0,0,1,,10
2,2,3,Attack on Titan,2013,18,9.0,95/100,1,1,0,0,1,,10
3,3,4,Better Call Saul,2015,18,8.8,94/100,1,0,0,0,1,,10
4,4,5,Dark,2017,16,8.8,93/100,1,0,0,0,1,,10


In [24]:
# Split the "score/scale" Rotten Tomatoes string on "/": the score stays in
# Rotten_Tomatoes and the scale goes to the temporary restRotten_Tomatoes column.
rt_parts = data["Rotten_Tomatoes"].str.split("/", expand=True)
data[["Rotten_Tomatoes", "restRotten_Tomatoes"]] = rt_parts
data.head()

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten_Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type,restAge,restIMDb,restRotten_Tomatoes
0,0,1,Breaking Bad,2008,18,9.4,100,1,0,0,0,1,,10,100
1,1,2,Stranger Things,2016,16,8.7,96,1,0,0,0,1,,10,100
2,2,3,Attack on Titan,2013,18,9.0,95,1,1,0,0,1,,10,100
3,3,4,Better Call Saul,2015,18,8.8,94,1,0,0,0,1,,10,100
4,4,5,Dark,2017,16,8.8,93,1,0,0,0,1,,10,100


In [25]:
# The helper columns produced by the three splits are no longer needed;
# remove them in a single drop call.
helper_columns = ["restAge", "restIMDb", "restRotten_Tomatoes"]
data = data.drop(columns=helper_columns)
data.head()

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten_Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type
0,0,1,Breaking Bad,2008,18,9.4,100,1,0,0,0,1
1,1,2,Stranger Things,2016,16,8.7,96,1,0,0,0,1
2,2,3,Attack on Titan,2013,18,9.0,95,1,1,0,0,1
3,3,4,Better Call Saul,2015,18,8.8,94,1,0,0,0,1
4,4,5,Dark,2017,16,8.8,93,1,0,0,0,1


In [36]:
# Check the dtypes after splitting: the cleaned score columns are still strings (object).
data.dtypes

Unnamed: 0          int64
ID                  int64
Title              object
Year                int64
Age                object
IMDb               object
Rotten_Tomatoes    object
Netflix             int64
Hulu                int64
Prime Video         int64
Disney+             int64
Type                int64
dtype: object

In [39]:
# Cast the IMDb score from string to float, then print the dtypes to confirm.
imdb_dtype = {"IMDb": "float"}
data = data.astype(imdb_dtype)
print(data.dtypes)

Unnamed: 0           int64
ID                   int64
Title               object
Year                 int64
Age                 object
IMDb               float64
Rotten_Tomatoes     object
Netflix              int64
Hulu                 int64
Prime Video          int64
Disney+              int64
Type                 int64
dtype: object


In [40]:
# Cast the Rotten Tomatoes score from string to float, then print the dtypes to confirm.
rt_dtype = {"Rotten_Tomatoes": "float"}
data = data.astype(rt_dtype)
print(data.dtypes)

Unnamed: 0           int64
ID                   int64
Title               object
Year                 int64
Age                 object
IMDb               float64
Rotten_Tomatoes    float64
Netflix              int64
Hulu                 int64
Prime Video          int64
Disney+              int64
Type                 int64
dtype: object


In [26]:
# Fraction of missing values per column (mean of the boolean null mask).
data.isna().mean()

Unnamed: 0         0.000000
ID                 0.000000
Title              0.000000
Year               0.000000
Age                0.396237
IMDb               0.179210
Rotten_Tomatoes    0.000000
Netflix            0.000000
Hulu               0.000000
Prime Video        0.000000
Disney+            0.000000
Type               0.000000
dtype: float64

In [27]:
# Names of the columns that contain at least one missing value.
has_missing = data.isnull().any()
data.columns[has_missing]

Index(['Age', 'IMDb'], dtype='object')

In [28]:
# Collect the names of the columns with missing values into a plain list
# and display just those columns.
sel = data.columns[data.isnull().any()].tolist()
data[sel]

Unnamed: 0,Age,IMDb
0,18,9.4
1,16,8.7
2,18,9.0
3,18,8.8
4,16,8.8
...,...,...
5363,,
5364,,
5365,,
5366,,


In [29]:
# (rows, columns) of the cleaned frame before separating features and target.
data.shape

(5368, 12)

In [30]:
# Separate the frame into the feature matrix X (every column except Title)
# and the target y (the Title column), then report both shapes.
# NOTE(review): Title is used as the target here — confirm this is intended.
X = data.drop(columns=["Title"])
y = data["Title"]
# Fixed the "varialbes" typo in the printed message.
print("The shape of the data set with training variables is: {}".format(X.shape))
print("The shape of the target variable is: {}".format(y.shape))

The shape of the data set with training varialbes is: (5368, 11)
The shape of the target variable is: (5368,)


In [31]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# Report how many rows and columns ended up in each split.
train_shape = X_train.shape
test_shape = X_test.shape
print("The shape of the training sample is: {}".format(train_shape))
print("The shape of the test sample is: {}".format(test_shape))

The shape of the training sample is: (4294, 11)
The shape of the test sample is: (1074, 11)


In [32]:
# Report the columns with missing values separately for each split.
# Each mask is computed from the split itself (not from the full X), so a
# column is listed only when that particular split actually contains a
# missing value in it.
train_missing = X_train.columns[X_train.isnull().any()]
test_missing = X_test.columns[X_test.isnull().any()]
print("In the training data set, we have missing values in the following variables: {}".format(train_missing))
print("In the test data set, we have missing values in the following variables: {}".format(test_missing))

In the training data set, we have missing values in the following variables: Index(['Age', 'IMDb'], dtype='object')
In the test data set, we have missing values in the following variables: Index(['Age', 'IMDb'], dtype='object')


In [None]:
# Plot the distribution of the Age variable in the training split.
fig = px.histogram(data_frame=X_train, x="Age")
fig.show()

In [34]:
# Number of distinct Age values in the training split, as returned by unique().
age_values = X_train["Age"].unique()
len(age_values)

6

In [35]:
# Relative frequency of each Age category in the training split.
X_train.value_counts(subset=["Age"], normalize=True)

Age
16     0.301205
18     0.268169
7      0.260785
all    0.166731
13     0.003109
dtype: float64

In [16]:
# Configure a missing-value indicator for the Age variable.
age_variables = ["Age"]
imputer = AddMissingIndicator(variables=age_variables, missing_only=True)

In [17]:
# Fit the Age indicator on the training split only.
imputer.fit(X_train)

In [18]:
# Inspect the fitted variable list of the Age indicator.
imputer.variables_

['Age']

In [19]:
# Apply the indicator fitted on the training split to both splits,
# then preview the transformed training rows.
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

X_train.head()

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Hulu,Prime Video,Disney+,Type,Age_na
4771,4771,5089,Curious?,2017,,,13/100,0,1,0,1,1
1239,1239,1242,Prank Encounters,2019,16+,5.2/10,48/100,0,0,0,1,0
3662,3662,3885,Dilbert,1999,7+,7.3/10,58/100,0,1,0,1,0
2838,2838,2953,Little Mosque on the Prairie,2007,7+,5.7/10,49/100,1,0,0,1,0
2635,2635,2736,The Titan Games,2019,all,7.0/10,55/100,1,0,0,1,0


In [20]:
# Share of missing values in Age and in its Age_na indicator column.
age_columns = ["Age", "Age_na"]
X_train[age_columns].isna().mean()

Age       0.39473
Age_na    0.00000
dtype: float64

In [21]:
# Plot the distribution of the IMDb variable in the training split.
fig = px.histogram(data_frame=X_train, x="IMDb")
fig.show()

In [22]:
# Number of distinct IMDb values in the training split, as returned by unique().
imdb_values = X_train["IMDb"].unique()
len(imdb_values)

77

In [23]:
# Configure a missing-value indicator for the IMDb variable.
imdb_variables = ["IMDb"]
imputer_IMDb = AddMissingIndicator(variables=imdb_variables, missing_only=True)

In [24]:
# Shape of the training split before fitting the IMDb indicator.
X_train.shape

(3757, 12)

In [25]:
# Fit the IMDb indicator on the training split only.
imputer_IMDb.fit(X_train)

In [26]:
# Inspect the fitted variable list of the IMDb indicator. `variables_` (trailing
# underscore) is the attribute set by fit(), whereas `variables` only echoes the
# constructor argument; this also matches the check done on the Age imputer.
imputer_IMDb.variables_

['IMDb']

In [27]:
# Shape of the training split after fitting, before the IMDb indicator is applied.
X_train.shape

(3757, 12)

In [28]:
# Apply the IMDb indicator fitted on the training split to both splits,
# then preview the transformed training rows.
X_train, X_test = imputer_IMDb.transform(X_train), imputer_IMDb.transform(X_test)

X_train.head()

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Hulu,Prime Video,Disney+,Type,Age_na,IMDb_na
4771,4771,5089,Curious?,2017,,,13/100,0,1,0,1,1,1
1239,1239,1242,Prank Encounters,2019,16+,5.2/10,48/100,0,0,0,1,0,0
3662,3662,3885,Dilbert,1999,7+,7.3/10,58/100,0,1,0,1,0,0
2838,2838,2953,Little Mosque on the Prairie,2007,7+,5.7/10,49/100,1,0,0,1,0,0
2635,2635,2736,The Titan Games,2019,all,7.0/10,55/100,1,0,0,1,0,0


In [29]:
# List, for each split, the columns that still contain missing values after
# the indicator columns were added.
train_na_cols = X_train.columns[X_train.isnull().any()]
test_na_cols = X_test.columns[X_test.isnull().any()]
print("Variables with missing values in train: {}".format(train_na_cols))
print("Variables with missing values in test: {}".format(test_na_cols))

Variables with missing values in train: Index(['Age', 'IMDb'], dtype='object')
Variables with missing values in test: Index(['Age', 'IMDb'], dtype='object')


In [30]:
# Write the four splits to CSV files in the working directory, without the index column.
exports = [
    (X_train, "Xtrain_tvshows.csv"),
    (X_test, "Xtest_tvshows.csv"),
    (y_train, "ytrain_tvshows.csv"),
    (y_test, "ytest_tvshows.csv"),
]
for frame, csv_name in exports:
    frame.to_csv(csv_name, index=False)