# Titanic data preprocessing using Scikit-learn

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, MinMaxScaler, OrdinalEncoder
from sklearn import set_config

# Download version 1 of the "titanic" dataset from OpenML, backed by pandas objects
titanic = fetch_openml(name="titanic", version=1, as_frame=True)

# Pretty-print the metadata that OpenML attaches to the dataset
pprint(titanic.details)

{'default_target_attribute': 'survived',
 'description_version': '9',
 'file_id': '16826755',
 'format': 'ARFF',
 'id': '40945',
 'licence': 'Public',
 'md5_checksum': '60ac7205eee0ba5045c90b3bba95b1c4',
 'minio_url': 'https://openml1.win.tue.nl/datasets/0004/40945/dataset_40945.pq',
 'name': 'Titanic',
 'parquet_url': 'https://openml1.win.tue.nl/datasets/0004/40945/dataset_40945.pq',
 'processing_date': '2018-10-04 07:19:36',
 'status': 'active',
 'tag': ['Computational Universe', 'Manufacturing', 'text_data'],
 'upload_date': '2017-10-16T01:17:36',
 'url': 'https://api.openml.org/data/v1/download/16826755/Titanic.arff',
 'version': '1',
 'visibility': 'public'}


In [2]:
# Pull the full pandas DataFrame (features and target together) out of the fetched dataset
df = titanic.frame

# Summarize the columns: non-null counts and dtypes (this does not print any rows)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1309 non-null   int64   
 1   survived   1309 non-null   category
 2   name       1309 non-null   object  
 3   sex        1309 non-null   category
 4   age        1046 non-null   float64 
 5   sibsp      1309 non-null   int64   
 6   parch      1309 non-null   int64   
 7   ticket     1309 non-null   object  
 8   fare       1308 non-null   float64 
 9   cabin      295 non-null    object  
 10  embarked   1307 non-null   category
 11  boat       486 non-null    object  
 12  body       121 non-null    float64 
 13  home.dest  745 non-null    object  
dtypes: category(3), float64(3), int64(3), object(5)
memory usage: 116.8+ KB


## Train/test split, missing values and imputation

In [3]:
# Hold out 20% of the rows for testing. The split is stratified on passenger
# class and seeded so that it is reproducible.
train_data, test_data = train_test_split(
    df, test_size=0.2, stratify=df['pclass'], random_state=18
)

# Work on independent copies of the two partitions
titanic_train, titanic_test = train_data.copy(), test_data.copy()

# Report the size of each partition, then the column summary of the training set
print(f"Train set has dimensions {titanic_train.shape}")
print(f"Test set has dimensions {titanic_test.shape} ")
print('\n------------------------------------------\n')
titanic_train.info()

Train set has dimensions (1047, 14)
Test set has dimensions (262, 14) 

------------------------------------------

<class 'pandas.core.frame.DataFrame'>
Index: 1047 entries, 448 to 1210
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1047 non-null   int64   
 1   survived   1047 non-null   category
 2   name       1047 non-null   object  
 3   sex        1047 non-null   category
 4   age        838 non-null    float64 
 5   sibsp      1047 non-null   int64   
 6   parch      1047 non-null   int64   
 7   ticket     1047 non-null   object  
 8   fare       1046 non-null   float64 
 9   cabin      235 non-null    object  
 10  embarked   1045 non-null   category
 11  boat       397 non-null    object  
 12  body       100 non-null    float64 
 13  home.dest  591 non-null    object  
dtypes: category(3), float64(3), int64(3), object(5)
memory usage: 101.6+ KB


In [4]:
# Per-column count of missing values and per-column dtype of the training set
no_missing = titanic_train.isna().sum().to_list()
dtype = list(titanic_train.dtypes)

# One row per column, with the columns that have the most gaps listed first
missing_df = pd.DataFrame({
    'column': titanic_train.columns,
    '# missing': no_missing,
    'dtype': dtype,
})
missing_df = missing_df.sort_values(by='# missing', ascending=False)
print(missing_df)

       column  # missing     dtype
12       body        947   float64
9       cabin        812    object
11       boat        650    object
13  home.dest        456    object
4         age        209   float64
10   embarked          2  category
8        fare          1   float64
0      pclass          0     int64
1    survived          0  category
2        name          0    object
3         sex          0  category
5       sibsp          0     int64
6       parch          0     int64
7      ticket          0    object


In [5]:
# Fit a mean imputer on the numeric columns of the training set only
titanic_num = titanic_train.select_dtypes(include="number")
num_imputer = SimpleImputer(strategy='mean')
num_imputer.fit(titanic_num)

In [6]:
# Sanity check: the statistics learned by the imputer should equal the column means
imputer_means = list(num_imputer.statistics_)
column_means = titanic_num.mean().to_list()
print(f"Imputer: {imputer_means}")
print(f"Calculated: {column_means}")

Imputer: [2.2951289398280803, 29.68963806682578, 0.5023877745940784, 0.39255014326647564, 32.89519130019121, 164.21]
Calculated: [2.2951289398280803, 29.68963806682578, 0.5023877745940784, 0.39255014326647564, 32.89519130019121, 164.21]


In [7]:
# Learn the most frequent value of every column (numeric and non-numeric alike)
imputer = SimpleImputer(strategy='most_frequent').fit(titanic_train)

# Fill the gaps; the result is a plain array, so the labels are re-attached below
X_inp = imputer.transform(titanic_train)
titanic_inp = pd.DataFrame(
    data=X_inp,
    index=titanic_train.index,
    columns=titanic_train.columns,
)
titanic_inp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1047 entries, 448 to 1210
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pclass     1047 non-null   object
 1   survived   1047 non-null   object
 2   name       1047 non-null   object
 3   sex        1047 non-null   object
 4   age        1047 non-null   object
 5   sibsp      1047 non-null   object
 6   parch      1047 non-null   object
 7   ticket     1047 non-null   object
 8   fare       1047 non-null   object
 9   cabin      1047 non-null   object
 10  embarked   1047 non-null   object
 11  boat       1047 non-null   object
 12  body       1047 non-null   object
 13  home.dest  1047 non-null   object
dtypes: object(14)
memory usage: 122.7+ KB


## Preprocessing pipeline

In [7]:
# Impute with k-nearest neighbours, apply log(1 + x), then rescale to [0, 1].
# "one-to-one" keeps the input column names as the output feature names.
log_pipeline = make_pipeline(
    KNNImputer(),
    FunctionTransformer(func=np.log1p, feature_names_out="one-to-one"),
    MinMaxScaler(feature_range=(0, 1)),
)

In [8]:
# Fill gaps with the most frequent category, then expand into one indicator column per category
one_hot_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(),
)

In [9]:
# Fill gaps with the most frequent category, then map the categories to
# integers in the explicit order given here: S -> 0, C -> 1, Q -> 2.
ordinal_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(categories=[['S', 'C', 'Q']]),
)

In [10]:
# Impute with k-nearest neighbours, then rescale to [0, 1].
# Despite the variable name, no k-means clustering is involved: the imputer is KNNImputer.
kmeans_pipeline = make_pipeline(
    KNNImputer(),
    MinMaxScaler(feature_range=(0, 1)),
)

In [11]:
# Default numeric treatment: fill gaps with the column median, then rescale to [0, 1]
default_num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    MinMaxScaler(feature_range=(0, 1)),
)

In [12]:
def column_sum(X):
    """Add the first two columns of a 2-D array element-wise.

    Both columns are selected with a list index so they stay two-dimensional;
    the result therefore has one column and as many rows as ``X``.
    """
    first, second = X[:, [0]], X[:, [1]]
    return first + second

def sum_name(function_transformer, feature_names_in):
    """Return the output feature names for the summing transformer.

    Both arguments are ignored: the result is always the single name "sum".
    """
    names_out = ["sum"]
    return names_out

# Fill gaps with the column mean, collapse the two input columns into their
# sum (exposed under the feature name "sum"), then rescale to [0, 1].
sum_pipeline = make_pipeline(
    SimpleImputer(strategy="mean"),
    FunctionTransformer(func=column_sum, feature_names_out=sum_name),
    MinMaxScaler(feature_range=(0, 1)),
)

In [13]:
# Each entry is (name, transformer, columns). The name becomes the prefix of
# the generated feature names, e.g. "Log__fare".
# NOTE(review): 'survived' is the dataset's default target attribute; passing it
# through keeps it inside the transformed matrix, and its non-numeric dtype is
# presumably why the resulting array is object-typed. Confirm this is intended.
column_steps = [
    ("Relatives", sum_pipeline, ['parch', 'sibsp']),
    ("Log", log_pipeline, ["fare"]),
    ("One_hot", one_hot_pipeline, ["sex"]),
    ("Ordinal", ordinal_pipeline, ["embarked"]),
    ("Numeric", default_num_pipeline, ['sibsp', 'parch']),
    ("KNN", kmeans_pipeline, ['age']),
    ("Pass", "passthrough", ['pclass', 'survived']),
]

# Columns not listed above are dropped from the output
preprocessing = ColumnTransformer(transformers=column_steps, remainder="drop")

preprocessing

In [14]:
# Learn every preprocessing parameter (imputation values, scaling ranges,
# encoder categories) from the training data only, and transform it.
X_train = preprocessing.fit_transform(titanic_train)

# Reuse the already-fitted transformer on the test data. Calling fit_transform
# here would re-fit on the test set, which leaks test statistics into the
# preprocessing and puts X_test on a different scale than X_train.
X_test = preprocessing.transform(titanic_test)

In [15]:
# Wrap the transformed training array in a DataFrame labelled with the
# feature names generated by the column transformer
feature_names = preprocessing.get_feature_names_out()
X_train_df = pd.DataFrame(data=X_train, columns=feature_names)
X_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1047 entries, 0 to 1046
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Relatives__sum       1047 non-null   object
 1   Log__fare            1047 non-null   object
 2   One_hot__sex_female  1047 non-null   object
 3   One_hot__sex_male    1047 non-null   object
 4   Ordinal__embarked    1047 non-null   object
 5   Numeric__sibsp       1047 non-null   object
 6   Numeric__parch       1047 non-null   object
 7   KNN__age             1047 non-null   object
 8   Pass__pclass         1047 non-null   object
 9   Pass__survived       1047 non-null   object
dtypes: object(10)
memory usage: 81.9+ KB
