In [23]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder

train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')

# Get rid of columns that are not useful
train_data.drop(columns= ['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace = True)
test_data.drop(columns= ['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace = True)

# Seperate X and y
# store feature matrix in "X"
X = train_data.iloc[:, 1:]   
# store response vector in "y"
y = train_data.iloc[:,0]    

#converting integer classes to Letters and prepare for One Hot Encoding
X['Pclass'] = X['Pclass'].map({1: 'AC', 2: 'BC', 3 : 'CC'})
data_tr = X
X.info(),X.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    object 
 1   Sex       891 non-null    object 
 2   Age       714 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  889 non-null    object 
dtypes: float64(2), int64(2), object(3)
memory usage: 48.9+ KB


(None,
   Pclass     Sex   Age  SibSp  Parch     Fare Embarked
 0     CC    male  22.0      1      0   7.2500        S
 1     AC  female  38.0      1      0  71.2833        C
 2     CC  female  26.0      0      0   7.9250        S
 3     AC  female  35.0      1      0  53.1000        S
 4     CC    male  35.0      0      0   8.0500        S)

In [24]:
## Handling missing values
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")

# Creating pipeline of the tasks

In [25]:
## Using Pipeline class
from sklearn.pipeline import Pipeline
## Using StandardScaler to scale all the numerical attributes
from sklearn.preprocessing import StandardScaler

numerics = ['float64', 'int64']
num_data = data_tr.select_dtypes(include=numerics)

## pipeline for numerical attributes
## imputing -> Scale them

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scalar', StandardScaler()),
])

num_data_tr = num_pipeline.fit_transform(num_data)
num_data.iloc[0],num_data_tr[0]

(Age      22.00
 SibSp     1.00
 Parch     0.00
 Fare      7.25
 Name: 0, dtype: float64,
 array([-0.56573646,  0.43279337, -0.47367361, -0.50244517]))

In [26]:
num_data.iloc[1],num_data_tr[1]

(Age      38.0000
 SibSp     1.0000
 Parch     0.0000
 Fare     71.2833
 Name: 1, dtype: float64,
 array([ 0.66386103,  0.43279337, -0.47367361,  0.78684529]))

# Transforming Numerical and Categorical Attributes

In [29]:
## Transform different columns or subsets using ColumnColumnTransformer
from sklearn.compose import ColumnTransformer

num_attrs = list(num_data)
cat_attrs = ['Pclass',"Sex","Embarked"]

## Complete pipeline to transform
## both Num and Cat attributes
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attrs),
    ("cat", OneHotEncoder(), cat_attrs),
])
prepared_data = full_pipeline.fit_transform(X)
prepared_data[0]

array([-0.56573646,  0.43279337, -0.47367361, -0.50244517,  0.        ,
        0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  1.        ,  0.        ])