In [1]:
import sklearn
import pandas as pd
sklearn.__version__

'1.1.1'

In [2]:
# load titanic dataset
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/titanic.csv'
titanic = pd.read_csv(url ,index_col = 'PassengerId')
titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


$\textbf{Features:}$
- Survived: 0 = No; 1 = Yes
- Pclass: Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
- Name: Name
- Sex: Sex
- Age: Age
- SibSp: Number of Siblings/Spouses Aboard
- Parch: Number of Parents/Children Aboard
- Ticket: Ticket Number
- Fare: Passenger Fare
- Cabin: Cabin
- Embarked: Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)

$\textbf{Categorical features:}$ sex, ticket, cabin, embarked.

$\textbf{Numerical features:}$ Pclass, Age, SibSp, Parch, Fare

In [3]:
titanic.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

## Ordinal Encoding

features can be efficiently coded as integers. To convert categorical features to such integer codes, we can use the OrdinalEncoder.

In [4]:
from sklearn.preprocessing import OrdinalEncoder as oe

In [5]:
encoder = oe()

In [6]:
# ordingal encoding of the "Sex" feature  
titanic.Sex.unique()

array(['male', 'female'], dtype=object)

In [7]:
encoder.fit(titanic[['Sex']])
encoder.transform(titanic[['Sex']])[:10] 

array([[1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.]])

In [8]:
encoder.categories_ #female:0, male:1

[array(['female', 'male'], dtype=object)]

In [9]:
#Ordinal encoding assumes that there is an orderig of the categories. We can change the "order"
genders = ['male','female']
encoder = oe(categories=[genders])
encoder.fit_transform(titanic[['Sex']])[:10] 

array([[0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.]])

In [10]:
# ordinal encoding of the "Embarked" feature
titanic.Embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [11]:
titanic.Embarked.isna().sum()

2

In [14]:
encoder = oe(handle_unknown='use_encoded_value', unknown_value=5)
encoder.fit(titanic[['Embarked']])
encoder.transform(titanic[['Embarked']])

array([[2.],
       [0.],
       [2.],
       [2.],
       [2.],
       [1.],
       [2.],
       [2.],
       [2.],
       [0.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [1.],
       [2.],
       [2.],
       [0.],
       [2.],
       [2.],
       [1.],
       [2.],
       [2.],
       [2.],
       [0.],
       [2.],
       [1.],
       [2.],
       [0.],
       [0.],
       [1.],
       [2.],
       [0.],
       [2.],
       [0.],
       [2.],
       [2.],
       [0.],
       [2.],
       [2.],
       [0.],
       [0.],
       [1.],
       [2.],
       [1.],
       [1.],
       [0.],
       [2.],
       [2.],
       [2.],
       [0.],
       [2.],
       [0.],
       [2.],
       [2.],
       [0.],
       [2.],
       [2.],
       [0.],
       [2.],
       [2.],
       [0.],
       [0.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [0.],
       [2.],
       [2.],
       [2.],
       [2.],

In [15]:
# unknown values
new_data = pd.DataFrame(data=['C','C','S','Q','The Moon'],columns=['Embarked'])
new_data

Unnamed: 0,Embarked
0,C
1,C
2,S
3,Q
4,The Moon


In [16]:
encoder.transform(new_data)

array([[0.],
       [0.],
       [2.],
       [1.],
       [5.]])

In [17]:
# # ordinal encoding of the "Sex" and "Embarked" features
encoder.fit(titanic[['Sex','Embarked']])
encoder.transform(titanic[['Sex','Embarked']])

array([[1., 2.],
       [0., 0.],
       [0., 2.],
       ...,
       [0., 2.],
       [1., 0.],
       [1., 1.]])

In [18]:
encoder.categories_

[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]

## One hot  Encoding

In [19]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False) #set sparse to False if you want to visualize the encoding 

In [23]:
encoder.fit_transform(titanic[['Sex']])

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [0., 1.]])

In [24]:
encoder.categories_

[array(['female', 'male'], dtype=object)]

In [21]:
encoder.fit_transform(titanic[['Embarked']])

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [22]:
encoder.categories_

[array(['C', 'Q', 'S'], dtype=object)]

In [25]:
# one hot encoding of the "Sex" and "Embarked" features
encoder.fit_transform(titanic[['Sex','Embarked']])

array([[0., 1., 0., 0., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 1., 0., 1., 0.]])

In [26]:
encoder.categories_

[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]

## Normalization/Standardization (Scaling)

- min-max scaling
    - value $-->$ $\frac{\text{value - min value}}{\text{max value - min value}} \in [0,1]$
- max-abs scaling 
    - value $--> \frac{\text{value}}{|\text{max value}|} \in [-1,1]$
- standard scaling 
    - value $--> \frac{\text{value - mean value}}{\text{standard deviation}}$

In [None]:
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler

In [27]:
X = titanic[['Pclass','Age','Fare']] # feature matrix
X

Unnamed: 0_level_0,Pclass,Age,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3,22.0,7.2500
2,1,38.0,71.2833
3,3,26.0,7.9250
4,1,35.0,53.1000
5,3,35.0,8.0500
...,...,...,...
887,2,27.0,13.0000
888,1,19.0,30.0000
889,3,,23.4500
890,1,26.0,30.0000


In [None]:
scaler = MinMaxScaler()
scaler.fit_transform(titanic[['Age','Fare']])

### Normalization 

Normalization is the process of converting an actual range of values which a numerical feature can take, into a standard range of values, typically in the interval [-1,1] or [0,1]. This can be achieved using MinMaxScaler or MaxAbsScaler, respectively.

Normalizing the data is not a strict requirement. However, in practice, it can lead to an increased speed of training.

In [28]:
# min-max scaler scales data to the [0, 1] range
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X)
scaler.transform(X)

array([[1.        , 0.27117366, 0.01415106],
       [0.        , 0.4722292 , 0.13913574],
       [1.        , 0.32143755, 0.01546857],
       ...,
       [1.        ,        nan, 0.04577135],
       [0.        , 0.32143755, 0.0585561 ],
       [1.        , 0.39683338, 0.01512699]])

In [29]:
# max-abs scaler scales data to the [-1, 1] range 
from sklearn.preprocessing import MaxAbsScaler
scaler = MaxAbsScaler()
scaler.fit(X)
scaler.transform(X)

array([[1.        , 0.275     , 0.01415106],
       [0.33333333, 0.475     , 0.13913574],
       [1.        , 0.325     , 0.01546857],
       ...,
       [1.        ,        nan, 0.04577135],
       [0.33333333, 0.325     , 0.0585561 ],
       [1.        , 0.4       , 0.01512699]])

### Standardization 

Standardization (or mean removal and variance scaling) is the procedure during which the feature values are rescaled so that they have the properties of a standard normal distribution with mean 0 and standard deviation 1.

In [30]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
scaler.transform(X)

array([[ 0.82520863, -0.52766856, -0.50023975],
       [-1.57221121,  0.57709388,  0.78894661],
       [ 0.82520863, -0.25147795, -0.48664993],
       ...,
       [ 0.82520863,         nan, -0.17408416],
       [-1.57221121, -0.25147795, -0.0422126 ],
       [ 0.82520863,  0.16280796, -0.49017322]])

In [42]:
imputer = SimpleImputer(strategy='mean')
imputer.fit(X)
imputed_X = imputer.transform(X)
imputed_X

array([[ 3.       , 22.       ,  7.25     ],
       [ 1.       , 38.       , 71.2833   ],
       [ 3.       , 26.       ,  7.925    ],
       ...,
       [ 3.       , 29.6420927, 23.45     ],
       [ 1.       , 26.       , 30.       ],
       [ 3.       , 32.       ,  7.75     ]])

## Data Imputation 

In [31]:
X.isnull().sum()

Pclass      0
Age       177
Fare        0
dtype: int64

In [34]:
from sklearn.impute import SimpleImputer

In [35]:
titanic.Age.mean()

29.64209269662921

In [36]:
# missing values can be imputed with: a constant value, mean, median, most frequent, etc. 
imputer = SimpleImputer(strategy='mean')
imputer.fit_transform(titanic[['Age']])

array([[22.       ],
       [38.       ],
       [26.       ],
       [35.       ],
       [35.       ],
       [29.6420927],
       [54.       ],
       [ 2.       ],
       [27.       ],
       [14.       ],
       [ 4.       ],
       [58.       ],
       [20.       ],
       [39.       ],
       [14.       ],
       [55.       ],
       [ 2.       ],
       [29.6420927],
       [31.       ],
       [29.6420927],
       [35.       ],
       [34.       ],
       [15.       ],
       [28.       ],
       [ 8.       ],
       [38.       ],
       [29.6420927],
       [19.       ],
       [29.6420927],
       [29.6420927],
       [40.       ],
       [29.6420927],
       [29.6420927],
       [66.       ],
       [28.       ],
       [42.       ],
       [29.6420927],
       [21.       ],
       [18.       ],
       [14.       ],
       [40.       ],
       [27.       ],
       [29.6420927],
       [ 3.       ],
       [19.       ],
       [29.6420927],
       [29.6420927],
       [29.64

In [37]:
titanic.Age.mean()

29.64209269662921

In [38]:
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit_transform(titanic[['Embarked']])

array([['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['Q'],
       ['S'],
       ['C'],
       ['C'],
       ['Q'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['C'],
       ['Q'],
       ['S'],
       ['Q'],
       ['Q'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
      

## Polynomial features

Often it's useful to add complexity to the model by considering nonlinear features of the input data. A simple and common method to use is polynomial features, which can get features' high-order and interaction terms. It is implemented in PolynomialFeatures

In [40]:
from sklearn.preprocessing import PolynomialFeatures

In [43]:
poly = PolynomialFeatures(degree=2)
poly.fit(imputed_X)
poly.transform(imputed_X)

array([[1.00000000e+00, 3.00000000e+00, 2.20000000e+01, ...,
        4.84000000e+02, 1.59500000e+02, 5.25625000e+01],
       [1.00000000e+00, 1.00000000e+00, 3.80000000e+01, ...,
        1.44400000e+03, 2.70876540e+03, 5.08130886e+03],
       [1.00000000e+00, 3.00000000e+00, 2.60000000e+01, ...,
        6.76000000e+02, 2.06050000e+02, 6.28056250e+01],
       ...,
       [1.00000000e+00, 3.00000000e+00, 2.96420927e+01, ...,
        8.78653659e+02, 6.95107074e+02, 5.49902500e+02],
       [1.00000000e+00, 1.00000000e+00, 2.60000000e+01, ...,
        6.76000000e+02, 7.80000000e+02, 9.00000000e+02],
       [1.00000000e+00, 3.00000000e+00, 3.20000000e+01, ...,
        1.02400000e+03, 2.48000000e+02, 6.00625000e+01]])