In [111]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# %matplotlib inline

excel_file_path = "../EDA/Titanic_EDA/train.csv"
df = pd.read_csv(excel_file_path, encoding="latin-1")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [112]:
# preparing training and test data
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    df.drop(["Survived", "Cabin", "PassengerId", "Name", "Ticket"], axis=1),
    df["Survived"],
    test_size=0.3,
)
x_train.shape, x_test.shape

((623, 7), (268, 7))

In [113]:
x_train.isnull().sum()

Pclass        0
Sex           0
Age         128
SibSp         0
Parch         0
Fare          0
Embarked      1
dtype: int64

In [114]:
x_train["Pclass"]

70     2
852    3
136    1
580    2
367    3
      ..
460    1
528    3
679    1
267    3
470    3
Name: Pclass, Length: 623, dtype: int64

In [115]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

transformer = ColumnTransformer(
    transformers=[
        ("tnf1", SimpleImputer(), ["Age"]),
        ("tnf2", SimpleImputer(strategy='most_frequent'), ["Embarked"]),
        ("tnf3", OrdinalEncoder(categories=[sorted(df['Pclass'].unique())]), ["Pclass"]),
        ("tnf4", OneHotEncoder(sparse=False, drop="first", handle_unknown='ignore'), ["Sex", "Embarked"]),
    ],
    remainder="passthrough",
    verbose=True,
)

In [116]:
x_train_transformed = transformer.fit_transform(x_train)
print(x_train.shape, x_train_transformed.shape)
# Convert the transformed data to a DataFrame
# It specifies the column names for the new DataFrame
x_train_transformed_df = pd.DataFrame(
    x_train_transformed, 
    columns=transformer.get_feature_names_out(x_train.columns)
)
df = pd.concat([x_train, x_train_transformed_df], axis=1)
df.sample(5)

[ColumnTransformer] .......... (1 of 5) Processing tnf1, total=   0.0s
[ColumnTransformer] .......... (2 of 5) Processing tnf2, total=   0.0s
[ColumnTransformer] .......... (3 of 5) Processing tnf3, total=   0.0s
[ColumnTransformer] .......... (4 of 5) Processing tnf4, total=   0.0s
[ColumnTransformer] ..... (5 of 5) Processing remainder, total=   0.0s
(623, 7) (623, 10)




Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,tnf1__Age,tnf2__Embarked,tnf3__Pclass,tnf4__Sex_male,tnf4__Embarked_Q,tnf4__Embarked_S,tnf4__Embarked_nan,remainder__SibSp,remainder__Parch,remainder__Fare
446,2.0,female,13.0,0.0,1.0,19.5,S,36.0,S,0.0,1.0,0.0,1.0,0.0,1.0,0.0,78.85
577,,,,,,,,29.827616,S,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
678,3.0,female,43.0,1.0,6.0,46.9,S,,,,,,,,,,
517,3.0,male,,0.0,0.0,24.15,Q,21.0,S,2.0,1.0,0.0,1.0,0.0,0.0,0.0,7.925
441,3.0,male,20.0,0.0,0.0,9.5,S,31.0,S,0.0,1.0,0.0,1.0,0.0,0.0,0.0,50.4958
