# Tutorial 2 - Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (StandardScaler, OneHotEncoder)
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
# Load seaborn's 'titanic' example dataset as a pandas DataFrame
import seaborn as sns
df = sns.load_dataset('titanic')

In [3]:
# Column dtypes and non-null counts: age, embarked, deck and embark_town have gaps
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [4]:
# Preview the first five rows
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
# Number of missing values in each column
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [6]:
# Summary statistics for every column, numeric and non-numeric alike
df.describe(include='all')

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
count,891.0,891.0,891,714.0,891.0,891.0,891.0,889,891,891,891,203,889,891,891
unique,,,2,,,,,3,3,3,2,7,3,2,2
top,,,male,,,,,S,Third,man,True,C,Southampton,no,True
freq,,,577,,,,,644,491,537,537,59,644,549,537
mean,0.383838,2.308642,,29.699118,0.523008,0.381594,32.204208,,,,,,,,
std,0.486592,0.836071,,14.526497,1.102743,0.806057,49.693429,,,,,,,,
min,0.0,1.0,,0.42,0.0,0.0,0.0,,,,,,,,
25%,0.0,2.0,,20.125,0.0,0.0,7.9104,,,,,,,,
50%,0.0,3.0,,28.0,0.0,0.0,14.4542,,,,,,,,
75%,1.0,3.0,,38.0,1.0,0.0,31.0,,,,,,,,


## 1. Train-test split

In [7]:
# Features: every column except the target. 'alive' is removed as well because
# it is the same outcome as 'survived' spelled yes/no and would leak the label.
X = df.drop(columns=['survived', 'alive'])

# Target: 1 = survived, 0 = did not survive
y = df['survived']

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# First rows of the training features (original row labels are kept, in shuffled order)
X_train.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
331,1,male,45.5,0,0,28.5,S,First,man,True,C,Southampton,True
733,2,male,23.0,0,0,13.0,S,Second,man,True,,Southampton,True
382,3,male,32.0,0,0,7.925,S,Third,man,True,,Southampton,True
704,3,male,26.0,1,0,7.8542,S,Third,man,True,,Southampton,False
813,3,female,6.0,4,2,31.275,S,Third,child,False,,Southampton,False


In [10]:
# First rows of the held-out test features
X_test.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
709,3,male,,1,1,15.2458,C,Third,man,True,,Cherbourg,False
439,2,male,31.0,0,0,10.5,S,Second,man,True,,Southampton,True
840,3,male,20.0,0,0,7.925,S,Third,man,True,,Southampton,True
720,2,female,6.0,0,1,33.0,S,Second,child,False,,Southampton,False
39,3,female,14.0,1,0,11.2417,C,Third,child,False,,Cherbourg,False


In [11]:
# Training labels; their index matches the rows of X_train
y_train.head()

331    0
733    0
382    0
704    0
813    0
Name: survived, dtype: int64

## 2. Steps of a Data Preprocessing Pipeline
- Dealing with missing values/outliers
- Encoding categorical variables
- Feature engineering
- Dimensionality reduction

In [12]:
# Example Pipeline

# Sort the feature columns by dtype in a single pass:
#   object          -> categorical
#   int64 / float64 -> numerical
# Columns of any other dtype (here the 'category' and 'bool' ones) land in
# neither list.
categorical_columns = []
numerical_columns = []
for column_name, column_dtype in X_train.dtypes.items():
    if column_dtype == 'object':
        categorical_columns.append(column_name)
    elif column_dtype in ['int64', 'float64']:
        numerical_columns.append(column_name)

print(categorical_columns)
print(numerical_columns)

['sex', 'embarked', 'who', 'embark_town']
['pclass', 'age', 'sibsp', 'parch', 'fare']


In [13]:
# Define the pipeline

# Categorical columns: replace missing entries with the most frequent value,
# then one-hot encode. handle_unknown='ignore' encodes a category that was not
# seen during fit as all zeros instead of raising at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical columns: replace missing entries with the column median, then
# standardise to zero mean and unit variance.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Route each column group to its transformer. The output is laid out in the
# order listed here (numerical block first, then the one-hot block); columns
# that belong to neither group are dropped (ColumnTransformer's default).
pipeline = ColumnTransformer([
    ('num', numerical_transformer, numerical_columns),
    ('cat', categorical_transformer, categorical_columns),
])

In [14]:
# Learn imputation values, scaling statistics and categories from the training
# split only, then apply the already-fitted transformers to the test split.
X_train_transformed = pipeline.fit_transform(X_train)
X_test_transformed = pipeline.transform(X_test)

# Column names follow the transformer order: numerical columns first, then the
# one-hot columns. The encoder reports generic prefixes (x0_, x1_, ...) that
# correspond to the entries of categorical_columns in order.
feature_names = numerical_columns + list(
    pipeline.named_transformers_['cat']['onehot'].get_feature_names_out()
)

# Wrap both arrays in DataFrames sharing the same column names.
# Note: the rows get a fresh 0..n-1 index, not the index of X_train / X_test.
X_train_transformed = pd.DataFrame(X_train_transformed, columns=feature_names)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=feature_names)

In [15]:
# Print the first rows of the transformed training data as plain text
print(f"Sample of transformed data:\n {X_train_transformed.head()}")

Sample of transformed data:
      pclass       age     sibsp     parch      fare  x0_female  x0_male  x1_C  \
0 -1.614136  1.253641 -0.470722 -0.479342 -0.078684        0.0      1.0   0.0   
1 -0.400551 -0.477284 -0.470722 -0.479342 -0.377145        0.0      1.0   0.0   
2  0.813034  0.215086 -0.470722 -0.479342 -0.474867        0.0      1.0   0.0   
3  0.813034 -0.246494  0.379923 -0.479342 -0.476230        0.0      1.0   0.0   
4  0.813034 -1.785093  2.931860  2.048742 -0.025249        1.0      0.0   0.0   

   x1_Q  x1_S  x2_child  x2_man  x2_woman  x3_Cherbourg  x3_Queenstown  \
0   0.0   1.0       0.0     1.0       0.0           0.0            0.0   
1   0.0   1.0       0.0     1.0       0.0           0.0            0.0   
2   0.0   1.0       0.0     1.0       0.0           0.0            0.0   
3   0.0   1.0       0.0     1.0       0.0           0.0            0.0   
4   0.0   1.0       1.0     0.0       0.0           0.0            0.0   

   x3_Southampton  
0             1.0  

In [17]:
# Sanity-check the scaling: the numerical columns should show mean ~0 and std ~1
X_train_transformed.describe(include='all')

Unnamed: 0,pclass,age,sibsp,parch,fare,x0_female,x0_male,x1_C,x1_Q,x1_S,x2_child,x2_man,x2_woman,x3_Cherbourg,x3_Queenstown,x3_Southampton
count,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0,712.0
mean,9.355812e-17,1.746418e-17,1.746418e-17,2.245395e-17,5.363999e-17,0.344101,0.655899,0.175562,0.08427,0.740169,0.09691,0.606742,0.296348,0.175562,0.08427,0.740169
std,1.000703,1.000703,1.000703,1.000703,1.000703,0.475408,0.475408,0.380714,0.277987,0.43885,0.296043,0.488817,0.456967,0.380714,0.277987,0.43885
min,-1.614136,-2.214363,-0.4707224,-0.4793416,-0.6274674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.4005512,-0.5542135,-0.4707224,-0.4793416,-0.474867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.8130337,-0.09263364,-0.4707224,-0.4793416,-0.3491435,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
75%,0.8130337,0.4458762,0.3799232,-0.4793416,-0.04017244,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
max,0.8130337,3.907725,6.334442,7.104908,9.237724,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
# Confirm that no missing values remain after imputation
X_train_transformed.isnull().sum()

pclass            0
age               0
sibsp             0
parch             0
fare              0
x0_female         0
x0_male           0
x1_C              0
x1_Q              0
x1_S              0
x2_child          0
x2_man            0
x2_woman          0
x3_Cherbourg      0
x3_Queenstown     0
x3_Southampton    0
dtype: int64

Check out the scikit-learn documentation for more details:
- [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)
- [ColumnTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html)