In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# for Q-Q plots
import scipy.stats as stats
# imputation transformers from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
    RandomSampleImputer
)

import plotly.express as px

# to split the datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


In [2]:
# Load the TV-shows dataset from a CSV file given by a relative path.
data = pd.read_csv("tv_shows.csv")
# Preview the first rows to check that the file was parsed as expected.
data.head()

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type
0,0,1,Breaking Bad,2008,18+,9.4/10,100/100,1,0,0,0,1
1,1,2,Stranger Things,2016,16+,8.7/10,96/100,1,0,0,0,1
2,2,3,Attack on Titan,2013,18+,9.0/10,95/100,1,1,0,0,1
3,3,4,Better Call Saul,2015,18+,8.8/10,94/100,1,0,0,0,1
4,4,5,Dark,2017,16+,8.8/10,93/100,1,0,0,0,1


In [3]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5368 entries, 0 to 5367
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       5368 non-null   int64 
 1   ID               5368 non-null   int64 
 2   Title            5368 non-null   object
 3   Year             5368 non-null   int64 
 4   Age              3241 non-null   object
 5   IMDb             4406 non-null   object
 6   Rotten Tomatoes  5368 non-null   object
 7   Netflix          5368 non-null   int64 
 8   Hulu             5368 non-null   int64 
 9   Prime Video      5368 non-null   int64 
 10  Disney+          5368 non-null   int64 
 11  Type             5368 non-null   int64 
dtypes: int64(8), object(4)
memory usage: 503.4+ KB


In [4]:
# Preview the first rows of the raw frame again before any cleaning is applied.
data.head()

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type
0,0,1,Breaking Bad,2008,18+,9.4/10,100/100,1,0,0,0,1
1,1,2,Stranger Things,2016,16+,8.7/10,96/100,1,0,0,0,1
2,2,3,Attack on Titan,2013,18+,9.0/10,95/100,1,1,0,0,1
3,3,4,Better Call Saul,2015,18+,8.8/10,94/100,1,0,0,0,1
4,4,5,Dark,2017,16+,8.8/10,93/100,1,0,0,0,1


In [5]:
# Flag every column that contains at least one missing value.
data.isna().any()

Unnamed: 0         False
ID                 False
Title              False
Year               False
Age                 True
IMDb                True
Rotten Tomatoes    False
Netflix            False
Hulu               False
Prime Video        False
Disney+            False
Type               False
dtype: bool

In [6]:
# Print the dtype of each column as plain text.
print(data.dtypes)

Unnamed: 0          int64
ID                  int64
Title              object
Year                int64
Age                object
IMDb               object
Rotten Tomatoes    object
Netflix             int64
Hulu                int64
Prime Video         int64
Disney+             int64
Type                int64
dtype: object


In [7]:
#data[['Age','restAge']] = data['Age'].str.split("+", expand = True)
#data.head()
# Keep the data as categories, since these are not numeric values but rather categories

In [8]:
# Split the "score/scale" strings (e.g. "9.4/10") on "/" into two columns:
# the part before the slash stays in IMDb, the part after goes to restIMDb.
imdb_parts = data['IMDb'].str.split("/", expand=True)
data['IMDb'] = imdb_parts[0]
data['restIMDb'] = imdb_parts[1]
data.head()

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type,restIMDb
0,0,1,Breaking Bad,2008,18+,9.4,100/100,1,0,0,0,1,10
1,1,2,Stranger Things,2016,16+,8.7,96/100,1,0,0,0,1,10
2,2,3,Attack on Titan,2013,18+,9.0,95/100,1,1,0,0,1,10
3,3,4,Better Call Saul,2015,18+,8.8,94/100,1,0,0,0,1,10
4,4,5,Dark,2017,16+,8.8,93/100,1,0,0,0,1,10


In [9]:
# Rename the column so its name contains no space. Rebinding `data` instead of
# using inplace=True avoids mutating a frame that earlier cells have displayed.
data = data.rename(columns={"Rotten Tomatoes": "Rotten_Tomatoes"})
# Split the "score/scale" strings (e.g. "100/100") on "/" into two columns:
# the part before the slash stays in Rotten_Tomatoes, the rest goes to
# restRotten_Tomatoes.
data[['Rotten_Tomatoes','restRotten_Tomatoes']] = data['Rotten_Tomatoes'].str.split("/", expand = True)
data.head()

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten_Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type,restIMDb,restRotten_Tomatoes
0,0,1,Breaking Bad,2008,18+,9.4,100,1,0,0,0,1,10,100
1,1,2,Stranger Things,2016,16+,8.7,96,1,0,0,0,1,10,100
2,2,3,Attack on Titan,2013,18+,9.0,95,1,1,0,0,1,10,100
3,3,4,Better Call Saul,2015,18+,8.8,94,1,0,0,0,1,10,100
4,4,5,Dark,2017,16+,8.8,93,1,0,0,0,1,10,100


In [10]:
# Count how often each value of the Type column occurs.
data["Type"].value_counts()

1    5368
Name: Type, dtype: int64

In [11]:
# Remove, in one call, the leftover scale columns from the rating splits, the
# Type column, and the index/identifier/title columns.
columns_to_drop = [
    'restIMDb',
    'restRotten_Tomatoes',
    'Type',
    'Unnamed: 0',
    'ID',
    'Title',
]
data = data.drop(columns=columns_to_drop)
data.head()

Unnamed: 0,Year,Age,IMDb,Rotten_Tomatoes,Netflix,Hulu,Prime Video,Disney+
0,2008,18+,9.4,100,1,0,0,0
1,2016,16+,8.7,96,1,0,0,0
2,2013,18+,9.0,95,1,1,0,0
3,2015,18+,8.8,94,1,0,0,0
4,2017,16+,8.8,93,1,0,0,0


In [12]:
# Check the dtypes of the remaining columns before casting the rating columns.
data.dtypes

Year                int64
Age                object
IMDb               object
Rotten_Tomatoes    object
Netflix             int64
Hulu                int64
Prime Video         int64
Disney+             int64
dtype: object

In [13]:
# Cast the IMDb score from string to float so it can be treated as numeric.
data = data.assign(IMDb=data["IMDb"].astype("float"))
print(data.dtypes)

Year                 int64
Age                 object
IMDb               float64
Rotten_Tomatoes     object
Netflix              int64
Hulu                 int64
Prime Video          int64
Disney+              int64
dtype: object


In [14]:
# Cast the Rotten Tomatoes score from string to float as well.
data = data.assign(Rotten_Tomatoes=data["Rotten_Tomatoes"].astype("float"))
print(data.dtypes)

Year                 int64
Age                 object
IMDb               float64
Rotten_Tomatoes    float64
Netflix              int64
Hulu                 int64
Prime Video          int64
Disney+              int64
dtype: object


In [15]:
# Fraction of missing values per column.
data.isna().mean()

Year               0.000000
Age                0.396237
IMDb               0.179210
Rotten_Tomatoes    0.000000
Netflix            0.000000
Hulu               0.000000
Prime Video        0.000000
Disney+            0.000000
dtype: float64

In [16]:
# Names of the columns that contain at least one missing value.
data.columns[data.isna().any()]

Index(['Age', 'IMDb'], dtype='object')

In [17]:
# Collect the names of the columns with missing values via a boolean mask,
# then show only those columns.
sel = data.columns[data.isnull().any()].tolist()
data[sel]

Unnamed: 0,Age,IMDb
0,18+,9.4
1,16+,8.7
2,18+,9.0
3,18+,8.8
4,16+,8.8
...,...,...
5363,,
5364,,
5365,,
5366,,


In [18]:
# (rows, columns) of the cleaned frame.
data.shape

(5368, 8)

In [19]:
# Preview the frame after the rating columns were cast to float.
data.head()

Unnamed: 0,Year,Age,IMDb,Rotten_Tomatoes,Netflix,Hulu,Prime Video,Disney+
0,2008,18+,9.4,100.0,1,0,0,0
1,2016,16+,8.7,96.0,1,0,0,0
2,2013,18+,9.0,95.0,1,1,0,0
3,2015,18+,8.8,94.0,1,0,0,0
4,2017,16+,8.8,93.0,1,0,0,0


In [20]:
# Fraction of rows in which Age is missing.
data["Age"].isna().mean()

0.39623695976154993

In [21]:
# Add an indicator column (Age_na) that marks the rows where Age is missing.
imputer = AddMissingIndicator(variables=['Age'], missing_only=True)
imputer.fit(data)
# Bare expression in the middle of a cell: evaluated, but not displayed.
imputer.variables_

data = imputer.transform(data)

data.head()


Unnamed: 0,Year,Age,IMDb,Rotten_Tomatoes,Netflix,Hulu,Prime Video,Disney+,Age_na
0,2008,18+,9.4,100.0,1,0,0,0,0
1,2016,16+,8.7,96.0,1,0,0,0,0
2,2013,18+,9.0,95.0,1,1,0,0,0
3,2015,18+,8.8,94.0,1,0,0,0,0
4,2017,16+,8.8,93.0,1,0,0,0,0


In [22]:
# The Age column itself is no longer needed once its missing-indicator exists.
data = data.drop(['Age'], axis=1)
data.head()

Unnamed: 0,Year,IMDb,Rotten_Tomatoes,Netflix,Hulu,Prime Video,Disney+,Age_na
0,2008,9.4,100.0,1,0,0,0,0
1,2016,8.7,96.0,1,0,0,0,0
2,2013,9.0,95.0,1,1,0,0,0
3,2015,8.8,94.0,1,0,0,0,0
4,2017,8.8,93.0,1,0,0,0,0


In [40]:
# Features are every column except the Age-missing indicator; the indicator
# itself is used as the target.
X = data.drop(columns = ['Age_na'])
y = data['Age_na']
# Fixed typo in the message ("varialbes" -> "variables").
print("The shape of the data set with training variables is: {}".format(X.shape))
print("The shape of the target variable is: {}".format(y.shape))

The shape of the data set with training varialbes is: (5368, 7)
The shape of the target variable is: (5368,)


In [41]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Report the size of each split.
print("The shape of the training sample is: {}".format(X_train.shape))
print("The shape of the test sample is: {}".format(X_test.shape))

The shape of the training sample is: (4294, 7)
The shape of the test sample is: (1074, 7)


In [42]:
# Report which variables contain missing values in each split. The mask is
# computed from the split itself (not from the full X), so each line reflects
# the missingness of exactly the rows it talks about.
print("In the training data set, we have missing values in the following variables: {}".format(X_train.columns[X_train.isnull().any()]))
print("In the test data set, we have missing values in the following variables: {}".format(X_test.columns[X_test.isnull().any()]))

In the training data set, we have missing values in the following variables: Index(['IMDb'], dtype='object')
In the test data set, we have missing values in the following variables: Index(['IMDb'], dtype='object')


In [43]:
# Bind additional names to the target splits (no copy is made).
y_train_missing, y_test_missing = y_train, y_test

In [44]:
# Histogram of the IMDb score in the training split, to inspect its distribution.
fig = px.histogram(data_frame=X_train, x="IMDb")
fig.show()

In [45]:
# Number of distinct IMDb values in the training split; dropna=False keeps
# NaN counted as a value, as unique() does.
X_train["IMDb"].nunique(dropna=False)

78

In [46]:
# Imputer for the IMDb column; the fixed random_state keeps the imputation
# reproducible.
imputer_IMDb = RandomSampleImputer(variables=['IMDb'], random_state=1)

In [47]:
# Shape of the training split before fitting the imputer.
X_train.shape

(4294, 7)

In [48]:
# Fit the imputer on the training split only.
imputer_IMDb.fit(X_train)

In [49]:
# Show the variables the imputer was configured with (the constructor argument).
imputer_IMDb.variables

['IMDb']

In [50]:
# Shape of the training split after fitting the imputer, for comparison with
# the shape displayed before the fit.
X_train.shape

(4294, 7)

In [51]:
# Apply the imputer (fitted on the training split) to both splits, train first.
X_train, X_test = (imputer_IMDb.transform(split) for split in (X_train, X_test))

X_train.head()

Unnamed: 0,Year,IMDb,Rotten_Tomatoes,Netflix,Hulu,Prime Video,Disney+
4168,2015,6.2,38.0,0,0,1,0
2141,1988,7.0,73.0,0,1,1,0
271,2016,7.2,71.0,1,0,0,0
4848,2017,6.8,10.0,0,0,1,0
2749,2021,6.0,52.0,0,1,0,0


In [52]:
# After imputation, list any variables that still contain missing values.
print("Variables with missing values in train: {}".format(
    X_train.columns[X_train.isna().any()]
))
print("Variables with missing values in test: {}".format(
    X_test.columns[X_test.isna().any()]
))

Variables with missing values in train: Index([], dtype='object')
Variables with missing values in test: Index([], dtype='object')


In [53]:
# Write the feature and target splits to CSV files without the index column.
for split_frame, csv_path in (
    (X_train, "Xtrain_tvshows.csv"),
    (X_test, "Xtest_tvshows.csv"),
    (y_train_missing, "ytrain_tvshows.csv"),
    (y_test_missing, "ytest_tvshows.csv"),
):
    split_frame.to_csv(csv_path, index=False)