# Feature Matrix Preparation

# <img src="../references//images/Titanic_2.png" width="800"/>

In [190]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, MinMaxScaler, OrdinalEncoder
from sklearn import set_config




## Create a feature matrix and target column

In [191]:
data = pd.read_csv('../data/interim/strat_train.csv')
test_data = pd.read_csv('../data/interim/strat_test.csv')

train_data, dev_data = train_test_split(
    data,
    test_size=0.2,
    stratify=data['Pclass'],
    random_state=42
)

titanic_train = train_data.copy()
titanic_dev = dev_data.copy()

print(f"Train set has dimensions {titanic_train.shape}")
print(f"Dev set has dimensions {titanic_dev.shape} ")
print('\n------------------------------------------\n')
titanic_train.info()

Train set has dimensions (569, 14)
Dev set has dimensions (143, 14) 

------------------------------------------

<class 'pandas.core.frame.DataFrame'>
Index: 569 entries, 394 to 568
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Survived       569 non-null    int64  
 1   Pclass         569 non-null    int64  
 2   Sex            569 non-null    int64  
 3   Age            569 non-null    float64
 4   SibSp          569 non-null    int64  
 5   Parch          569 non-null    int64  
 6   Fare           569 non-null    float64
 7   Embarked       569 non-null    int64  
 8   Title          569 non-null    int64  
 9   FamilySize     569 non-null    int64  
 10  IsAlone        569 non-null    int64  
 11  FarePerPerson  569 non-null    float64
 12  AgeGroup       569 non-null    int64  
 13  FareGroup      569 non-null    int64  
dtypes: float64(3), int64(11)
memory usage: 66.7 KB


In [192]:
titanic_train.head(20)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone,FarePerPerson,AgeGroup,FareGroup
394,1,3,0,21.773973,0,0,7.8792,1,8,1,1,7.8792,3,1
634,1,2,0,29.0,1,0,26.0,2,12,2,0,13.0,3,0
455,0,1,1,37.0,0,1,29.7,0,11,2,0,14.85,0,0
178,0,3,1,24.0,0,0,7.05,2,11,1,1,7.05,3,1
584,0,2,1,23.0,0,0,13.0,2,11,1,1,13.0,3,2
597,1,3,0,29.0,0,2,15.2458,0,12,3,0,5.081933,3,2
490,0,2,1,30.0,0,0,13.0,2,11,1,1,13.0,3,2
361,0,3,0,18.0,1,0,17.8,2,12,2,0,8.9,1,2
668,1,1,0,24.0,0,0,49.5042,0,9,1,1,49.5042,3,3
442,1,2,1,1.0,2,1,39.0,2,7,4,0,9.75,1,3


In [193]:
columns = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

for column in columns:
    print(column)
    print(titanic_train[column].unique())

Pclass
[3 2 1]
Sex
[0 1]
SibSp
[0 1 2 4 3 5 8]
Parch
[0 1 2 5 4 3]
Embarked
[1 2 0]


## Pipelines

### Missing values

First, we have to deal with missing values. Even though in the training dataset we do not have too many missing values, this might be different in the samples for predictions. That is why we have to find a strategy to deal with missing values in all columns.

In [194]:
no_missing = list(titanic_train.isna().sum())
dtype = titanic_train.dtypes.tolist()
missing_df = pd.DataFrame(
  {
    'column':titanic_train.columns, 
    '# missing':no_missing, 
    'dtype':dtype}
).sort_values(by='# missing', ascending=False)
print(missing_df)

           column  # missing    dtype
0        Survived          0    int64
1          Pclass          0    int64
2             Sex          0    int64
3             Age          0  float64
4           SibSp          0    int64
5           Parch          0    int64
6            Fare          0  float64
7        Embarked          0    int64
8           Title          0    int64
9      FamilySize          0    int64
10        IsAlone          0    int64
11  FarePerPerson          0  float64
12       AgeGroup          0    int64
13      FareGroup          0    int64


In [195]:
# Simple inputer fit
num_imputer = SimpleImputer(strategy='mean')
titanic_num = titanic_train.select_dtypes(include=[np.number])
num_imputer.fit(titanic_num)

In [196]:
print(f"Imputer: {list(num_imputer.statistics_)}")
print(f"Calculated: {titanic_num.mean().to_list()}")

Imputer: [0.39191564147627417, 2.3075571177504393, 0.655536028119508, 29.227557106391806, 0.5307557117750439, 0.38137082601054484, 33.20832583479789, 1.5254833040421794, 10.219683655536029, 1.9121265377855887, 0.5992970123022847, 20.505120262212888, 1.383128295254833, 1.6854130052724077]
Calculated: [0.39191564147627417, 2.3075571177504393, 0.655536028119508, 29.227557106391806, 0.5307557117750439, 0.38137082601054484, 33.20832583479789, 1.5254833040421794, 10.219683655536029, 1.9121265377855887, 0.5992970123022847, 20.505120262212888, 1.383128295254833, 1.6854130052724077]


In [197]:
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(titanic_train) # learn parameters for the imputer
X_inp = imputer.transform(titanic_train)
titanic_inp = pd.DataFrame(X_inp, columns=titanic_train.columns, index=titanic_train.index)
titanic_inp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 569 entries, 394 to 568
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Survived       569 non-null    float64
 1   Pclass         569 non-null    float64
 2   Sex            569 non-null    float64
 3   Age            569 non-null    float64
 4   SibSp          569 non-null    float64
 5   Parch          569 non-null    float64
 6   Fare           569 non-null    float64
 7   Embarked       569 non-null    float64
 8   Title          569 non-null    float64
 9   FamilySize     569 non-null    float64
 10  IsAlone        569 non-null    float64
 11  FarePerPerson  569 non-null    float64
 12  AgeGroup       569 non-null    float64
 13  FareGroup      569 non-null    float64
dtypes: float64(14)
memory usage: 66.7 KB


In [198]:
def column_sum(X):
    return X[:, [0]] + X[:, [1]]

def sum_name(function_transformer, feature_names_in):
    return ["sum"]  # feature names out

In [199]:
log_pipeline = make_pipeline(
    KNNImputer(),
    FunctionTransformer(np.log1p, feature_names_out="one-to-one"),
    MinMaxScaler(feature_range=(0, 1)))

In [200]:
one_hot_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder())

In [201]:
ordinal_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(categories=[['S', 'C', 'Q']]))

In [202]:
kmeans_pipeline = make_pipeline(
    KNNImputer(),
    MinMaxScaler(feature_range=(0, 1)))

In [203]:
default_num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    MinMaxScaler(feature_range=(0, 1)))

In [204]:
sum_pipeline = make_pipeline(
    SimpleImputer(strategy="mean"),
    FunctionTransformer(column_sum, feature_names_out=sum_name),
    MinMaxScaler(feature_range=(0, 1)))

In [205]:
titanic_inp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 569 entries, 394 to 568
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Survived       569 non-null    float64
 1   Pclass         569 non-null    float64
 2   Sex            569 non-null    float64
 3   Age            569 non-null    float64
 4   SibSp          569 non-null    float64
 5   Parch          569 non-null    float64
 6   Fare           569 non-null    float64
 7   Embarked       569 non-null    float64
 8   Title          569 non-null    float64
 9   FamilySize     569 non-null    float64
 10  IsAlone        569 non-null    float64
 11  FarePerPerson  569 non-null    float64
 12  AgeGroup       569 non-null    float64
 13  FareGroup      569 non-null    float64
dtypes: float64(14)
memory usage: 66.7 KB


In [206]:
# Convert the columns of titanic_inp to an integer
titanic_inp.astype(int)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone,FarePerPerson,AgeGroup,FareGroup
394,1,3,0,21,0,0,7,1,8,1,1,7,3,1
634,1,2,0,29,1,0,26,2,12,2,0,13,3,0
455,0,1,1,37,0,1,29,0,11,2,0,14,0,0
178,0,3,1,24,0,0,7,2,11,1,1,7,3,1
584,0,2,1,23,0,0,13,2,11,1,1,13,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,1,1,0,29,0,0,211,2,8,1,1,211,3,4
362,1,3,0,21,0,0,7,1,8,1,1,7,3,1
139,0,1,1,32,0,0,30,0,11,1,1,30,0,3
472,1,1,0,58,0,0,26,2,8,1,1,26,2,0


In [207]:
titanic_inp.isnull().sum()

Survived         0
Pclass           0
Sex              0
Age              0
SibSp            0
Parch            0
Fare             0
Embarked         0
Title            0
FamilySize       0
IsAlone          0
FarePerPerson    0
AgeGroup         0
FareGroup        0
dtype: int64

In [208]:
titanic_test = test_data.copy()


In [209]:
# Check for empty columns before fitting the SimpleImputer
titanic_train.isna().sum()


Survived         0
Pclass           0
Sex              0
Age              0
SibSp            0
Parch            0
Fare             0
Embarked         0
Title            0
FamilySize       0
IsAlone          0
FarePerPerson    0
AgeGroup         0
FareGroup        0
dtype: int64

In [210]:
titanic_train.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone,FarePerPerson,AgeGroup,FareGroup
394,1,3,0,21.773973,0,0,7.8792,1,8,1,1,7.8792,3,1
634,1,2,0,29.0,1,0,26.0,2,12,2,0,13.0,3,0
455,0,1,1,37.0,0,1,29.7,0,11,2,0,14.85,0,0
178,0,3,1,24.0,0,0,7.05,2,11,1,1,7.05,3,1
584,0,2,1,23.0,0,0,13.0,2,11,1,1,13.0,3,2
597,1,3,0,29.0,0,2,15.2458,0,12,3,0,5.081933,3,2
490,0,2,1,30.0,0,0,13.0,2,11,1,1,13.0,3,2
361,0,3,0,18.0,1,0,17.8,2,12,2,0,8.9,1,2
668,1,1,0,24.0,0,0,49.5042,0,9,1,1,49.5042,3,3
442,1,2,1,1.0,2,1,39.0,2,7,4,0,9.75,1,3


In [211]:
# Convert the columns of titanic_train to an integer
titanic_train.astype(int)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone,FarePerPerson,AgeGroup,FareGroup
394,1,3,0,21,0,0,7,1,8,1,1,7,3,1
634,1,2,0,29,1,0,26,2,12,2,0,13,3,0
455,0,1,1,37,0,1,29,0,11,2,0,14,0,0
178,0,3,1,24,0,0,7,2,11,1,1,7,3,1
584,0,2,1,23,0,0,13,2,11,1,1,13,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,1,1,0,29,0,0,211,2,8,1,1,211,3,4
362,1,3,0,21,0,0,7,1,8,1,1,7,3,1
139,0,1,1,32,0,0,30,0,11,1,1,30,0,3
472,1,1,0,58,0,0,26,2,8,1,1,26,2,0


In [212]:
print("Columns in the dataset:")
print(titanic_train.columns)

Columns in the dataset:
Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked', 'Title', 'FamilySize', 'IsAlone', 'FarePerPerson',
       'AgeGroup', 'FareGroup'],
      dtype='object')


In [213]:
# Check for empty columns
print("Checking for empty columns in the training data:")
for name, pipeline, columns in preprocessing.transformers:
    if isinstance(columns, list):
        data = titanic_train[columns]
        if data.empty:
            print(f"Empty data found for transformer '{name}' with columns {columns}")
        else:
            print(f"Data for transformer '{name}' with columns {columns} is not empty")

# Fit and transform the data
try:
    X_train = preprocessing.fit_transform(titanic_train)
    print("Transformation successful")
except ValueError as e:
    print(f"Error during transformation: {e}")

Checking for empty columns in the training data:
Data for transformer 'Relatives' with columns ['Parch', 'SibSp'] is not empty
Data for transformer 'Log' with columns ['Fare'] is not empty
Data for transformer 'One_hot' with columns ['Sex'] is not empty
Data for transformer 'Ordinal' with columns ['Embarked'] is not empty
Data for transformer 'Numeric' with columns ['SibSp', 'Parch'] is not empty
Data for transformer 'KNN' with columns ['Age'] is not empty
Data for transformer 'Pass' with columns ['Pclass', 'Survived'] is not empty
Transformation successful


In [214]:
# Inspect the values in the 'Embarked' column
print("Unique values in the 'Embarked' column:")
print(titanic_train['Embarked'].unique())

# Check for empty columns and inspect data being passed to each transformer
print("Checking for empty columns in the training data and inspecting data:")
for name, pipeline, columns in preprocessing.transformers:
    if isinstance(columns, list):
        data = titanic_train[columns]
        if data.empty:
            print(f"Empty data found for transformer '{name}' with columns {columns}")
        else:
            print(f"Data for transformer '{name}' with columns {columns} is not empty")
            print(f"First few rows of data for transformer '{name}':")
            print(data.head())

# Fit and transform the data
try:
    X_train = preprocessing.fit_transform(titanic_train)
    print("Transformation successful")
except ValueError as e:
    print(f"Error during transformation: {e}")

Unique values in the 'Embarked' column:
[1 2 0]
Checking for empty columns in the training data and inspecting data:
Data for transformer 'Relatives' with columns ['Parch', 'SibSp'] is not empty
First few rows of data for transformer 'Relatives':
     Parch  SibSp
394      0      0
634      0      1
455      1      0
178      0      0
584      0      0
Data for transformer 'Log' with columns ['Fare'] is not empty
First few rows of data for transformer 'Log':
        Fare
394   7.8792
634  26.0000
455  29.7000
178   7.0500
584  13.0000
Data for transformer 'One_hot' with columns ['Sex'] is not empty
First few rows of data for transformer 'One_hot':
     Sex
394    0
634    0
455    1
178    1
584    1
Data for transformer 'Ordinal' with columns ['Embarked'] is not empty
First few rows of data for transformer 'Ordinal':
     Embarked
394         1
634         2
455         0
178         2
584         2
Data for transformer 'Numeric' with columns ['SibSp', 'Parch'] is not empty
First few 

In [215]:
titanic_train['Title'].unique()

array([ 8, 12, 11,  9,  7, 10, 14,  2,  0,  1,  5,  3, 16,  4, 15,  6],
      dtype=int64)

In [216]:
# Inspect the data types of the columns
print("Data types of the columns in the dataset:")
print(titanic_train.dtypes)


Data types of the columns in the dataset:
Survived           int64
Pclass             int64
Sex                int64
Age              float64
SibSp              int64
Parch              int64
Fare             float64
Embarked           int64
Title              int64
FamilySize         int64
IsAlone            int64
FarePerPerson    float64
AgeGroup           int64
FareGroup          int64
dtype: object


In [217]:
# Define individual transformers for testing
transformers = [
    ("Relatives", sum_pipeline, ['Parch', 'SibSp']),
    ("Log", log_pipeline, ["Fare"]),
    ("One_hot", one_hot_pipeline, ["Sex"]),
    ("Ordinal", ordinal_pipeline, ["Embarked"]),
    ("Numeric", default_num_pipeline, ['SibSp', 'Parch']),
    ("KNN", kmeans_pipeline, ['Age']),
    ("Pass", "passthrough", ['Pclass', 'Survived'])
]
transformers

# Apply each transformer individually
for name, pipeline, columns in transformers:
    print(f"Applying transformer '{name}' with columns {columns}")
    try:
        if pipeline == "passthrough":
            print(f"Transformer '{name}' is passthrough, skipping transformation")
            transformed_data = titanic_train[columns]
        else:
            if isinstance(columns, list):
                data = titanic_train[columns]
                transformed_data = pipeline.fit_transform(data)
                print(f"Transformation successful for transformer '{name}'")
    except ValueError as e:
        print(f"Error during transformation for transformer '{name}': {e}")
    except Exception as e:
        print(f"Unexpected error during transformation for transformer '{name}': {e}")

Applying transformer 'Relatives' with columns ['Parch', 'SibSp']
Transformation successful for transformer 'Relatives'
Applying transformer 'Log' with columns ['Fare']
Transformation successful for transformer 'Log'
Applying transformer 'One_hot' with columns ['Sex']
Transformation successful for transformer 'One_hot'
Applying transformer 'Ordinal' with columns ['Embarked']
Error during transformation for transformer 'Ordinal': invalid literal for int() with base 10: 'S'
Applying transformer 'Numeric' with columns ['SibSp', 'Parch']
Transformation successful for transformer 'Numeric'
Applying transformer 'KNN' with columns ['Age']
Transformation successful for transformer 'KNN'
Applying transformer 'Pass' with columns ['Pclass', 'Survived']
Transformer 'Pass' is passthrough, skipping transformation


In [218]:
# import pipeline
from sklearn.pipeline import Pipeline

# Example definition of ordinal_pipeline
ordinal_pipeline = Pipeline([
    ('ordinal', OrdinalEncoder())
])

# Inspect the values in the 'Embarked' column
print("Unique values in the 'Embarked' column:")
print(titanic_train['Embarked'].unique())

# Apply the ordinal_pipeline to the 'Embarked' column
try:
    transformed_embarked = ordinal_pipeline.fit_transform(titanic_train[['Embarked']])
    print("Transformation successful for 'Embarked' column")
    print("Transformed 'Embarked' values:")
    print(transformed_embarked[:10])
except ValueError as e:
    print(f"Error during transformation for 'Embarked' column: {e}")
except Exception as e:
    print(f"Unexpected error during transformation for 'Embarked' column: {e}")

Unique values in the 'Embarked' column:
[1 2 0]
Transformation successful for 'Embarked' column
Transformed 'Embarked' values:
[[1.]
 [2.]
 [0.]
 [2.]
 [2.]
 [0.]
 [2.]
 [2.]
 [0.]
 [2.]]


In [221]:
# Define the ordinal_pipeline
ordinal_pipeline = Pipeline([
    ('ordinal', OrdinalEncoder())
])

# Define the ColumnTransformer
preprocessing = ColumnTransformer([
    ("Relatives", sum_pipeline, ['Parch', 'SibSp']),
    ("Log", log_pipeline, ["Fare"]),
    ("One_hot", one_hot_pipeline, ["Sex"]),
    ("Ordinal", ordinal_pipeline, ["Embarked"]),
    ("Numeric", default_num_pipeline, ['SibSp', 'Parch']),
    ("KNN", kmeans_pipeline, ['Age']),
    ("Pass", "passthrough", ['Pclass', 'Survived'])
], 

remainder="drop")

preprocessing


In [222]:
# Transform the development and test datasets
try:
    X_train = preprocessing.fit_transform(titanic_train)
    X_dev = preprocessing.transform(titanic_dev)
    X_test = preprocessing.transform(titanic_test)
    print("Transformation successful for development and test datasets")
except ValueError as e:
    print(f"Error during transformation: {e}")
except Exception as e:
    print(f"Unexpected error during transformation: {e}")

# Save the processed data
titanic_train.to_csv('../data/processed/titanic_train.csv', index=False)
titanic_dev.to_csv('../data/processed/titanic_dev.csv', index=False)
titanic_test.to_csv('../data/processed/titanic_test.csv', index=False)

Transformation successful for development and test datasets
