In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [50]:
def label_encode_columns(df, *columns):
    label_encoder = LabelEncoder()
    for col in columns:
        df[col] = label_encoder.fit_transform(df[col])
    return df

In [51]:
def drop_column(df, column):
    return df.drop(columns=[column])

In [52]:
def drop_columns(df:pd.DataFrame, *columns):
    for col in columns: 
        df =  df.drop(col, axis=1)   
    return df

In [53]:
df_train = pd.read_csv("titanic/train.csv", index_col="PassengerId")
df_test = pd.read_csv("titanic/test.csv", index_col="PassengerId")

In [54]:
df_train = df_train.pipe(label_encode_columns, "Sex", "Embarked").pipe(drop_columns, "Name", "Fare", "Cabin", "Ticket")
df_train.head(2)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,1,22.0,1,0,2
2,1,1,0,38.0,1,0,0


In [55]:
df_test = df_test.pipe(label_encode_columns, "Sex", "Embarked").pipe(drop_columns, "Name", "Fare", "Cabin", "Ticket")
df_test.head(2)

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
892,3,1,34.5,0,0,1
893,3,0,47.0,1,0,2


In [56]:
df_test.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
count,418.0,418.0,332.0,418.0,418.0,418.0
mean,2.26555,0.636364,30.27259,0.447368,0.392344,1.401914
std,0.841838,0.481622,14.181209,0.89676,0.981429,0.854496
min,1.0,0.0,0.17,0.0,0.0,0.0
25%,1.0,0.0,21.0,0.0,0.0,1.0
50%,3.0,1.0,27.0,0.0,0.0,2.0
75%,3.0,1.0,39.0,1.0,0.0,2.0
max,3.0,1.0,76.0,8.0,9.0,2.0


In [57]:
df_test

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
892,3,1,34.5,0,0,1
893,3,0,47.0,1,0,2
894,2,1,62.0,0,0,1
895,3,1,27.0,0,0,2
896,3,0,22.0,1,1,2
...,...,...,...,...,...,...
1305,3,1,,0,0,2
1306,1,0,39.0,0,0,0
1307,3,1,38.5,0,0,2
1308,3,1,,0,0,2


In [58]:
from sklearn.preprocessing import StandardScaler

standard_scaler = StandardScaler()

In [60]:
scaled_df_test = standard_scaler.fit_transform(df_test)
type(scaled_df_test)

numpy.ndarray

In [None]:
pd.DataFrame(scaled_df_test)

Unnamed: 0,0,1,2,3,4,5
0,0.873482,0.755929,0.298549,-0.499470,-0.400248,-0.470915
1,0.873482,-1.322876,1.181328,0.616992,-0.400248,0.700767
2,-0.315819,0.755929,2.240662,-0.499470,-0.400248,-0.470915
3,0.873482,0.755929,-0.231118,-0.499470,-0.400248,0.700767
4,0.873482,-1.322876,-0.584229,0.616992,0.619896,0.700767
...,...,...,...,...,...,...
413,0.873482,0.755929,,-0.499470,-0.400248,0.700767
414,-1.505120,-1.322876,0.616350,-0.499470,-0.400248,-1.642598
415,0.873482,0.755929,0.581038,-0.499470,-0.400248,0.700767
416,0.873482,0.755929,,-0.499470,-0.400248,0.700767


In [63]:
scaled_df_test = pd.DataFrame(scaled_df_test, columns=df_test.keys())
scaled_df_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0.873482,0.755929,0.298549,-0.499470,-0.400248,-0.470915
1,0.873482,-1.322876,1.181328,0.616992,-0.400248,0.700767
2,-0.315819,0.755929,2.240662,-0.499470,-0.400248,-0.470915
3,0.873482,0.755929,-0.231118,-0.499470,-0.400248,0.700767
4,0.873482,-1.322876,-0.584229,0.616992,0.619896,0.700767
...,...,...,...,...,...,...
413,0.873482,0.755929,,-0.499470,-0.400248,0.700767
414,-1.505120,-1.322876,0.616350,-0.499470,-0.400248,-1.642598
415,0.873482,0.755929,0.581038,-0.499470,-0.400248,0.700767
416,0.873482,0.755929,,-0.499470,-0.400248,0.700767


In [64]:
scaled_df_test["Sex"].unique()

array([ 0.75592895, -1.32287566])

In [65]:
scaled_df_test["Embarked"].unique()

array([-0.47091535,  0.70076689, -1.64259759])

In [66]:
scaled_df_test["Age"].min(), scaled_df_test["Age"].max()

(np.float64(-2.125913789059458), np.float64(3.2293742671753187))