In [118]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# %matplotlib inline

excel_file_path = "../EDA/Titanic_EDA/train.csv"
df = pd.read_csv(excel_file_path, encoding="latin-1")
print(df.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [119]:
# preparing training and test data
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    df.drop(["Survived", "Cabin"], axis=1), df["Survived"], test_size=0.3
)
x_train.shape, x_test.shape

((623, 10), (268, 10))

In [120]:
x_train.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            117
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         1
dtype: int64

In [121]:
x_train['Pclass']

692    3
112    3
58     2
679    1
718    3
      ..
805    3
391    3
517    3
212    3
561    3
Name: Pclass, Length: 623, dtype: int64

In [122]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

transformer = ColumnTransformer(
    transformers=[
        ("tnf1", SimpleImputer(), ["Age"]),
        ("tnf2", OrdinalEncoder(categories=[[1, 2, 3]]), ["Pclass"]),
        ('tnf3', OneHotEncoder(sparse=False, drop='first'),['Sex'])
    ],
    remainder="passthrough",
    verbose=True
)

In [123]:
x_train_transformed = transformer.fit_transform(x_train)
print(x_train.shape, x_train_transformed.shape)
# Convert the transformed data to a DataFrame
x_train_transformed_df = pd.DataFrame(x_train_transformed, columns=transformer.get_feature_names_out(x_train.columns))
x_train = pd.concat([x_train,x_train_transformed_df])
x_train.sample(5)

[ColumnTransformer] .......... (1 of 4) Processing tnf1, total=   0.0s
[ColumnTransformer] .......... (2 of 4) Processing tnf2, total=   0.0s
[ColumnTransformer] .......... (3 of 4) Processing tnf3, total=   0.0s
[ColumnTransformer] ..... (4 of 4) Processing remainder, total=   0.0s
(623, 10) (623, 10)




Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,tnf1__Age,tnf2__Pclass,tnf3__Sex_male,remainder__PassengerId,remainder__Name,remainder__SibSp,remainder__Parch,remainder__Ticket,remainder__Fare,remainder__Embarked
869,870.0,3.0,"Johnson, Master. Harold Theodor",male,4.0,1.0,1.0,347742,11.1333,S,,,,,,,,,,
513,,,,,,,,,,,24.0,1.0,0.0,317.0,"Kantor, Mrs. Sinai (Miriam Sternin)",1.0,0.0,244367.0,26.0,S
497,498.0,3.0,"Shellard, Mr. Frederick William",male,,0.0,0.0,C.A. 6212,15.1,S,,,,,,,,,,
511,512.0,3.0,"Webber, Mr. James",male,,0.0,0.0,SOTON/OQ 3101316,8.05,S,,,,,,,,,,
740,741.0,1.0,"Hawksford, Mr. Walter James",male,,0.0,0.0,16988,30.0,S,,,,,,,,,,
