# Scikit-learn Pre-processing

In [1]:
import sklearn
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer

from sklearn import set_config

#Sets output of transform and fit_transform to pandas dataframe output.
set_config(transform_output = "pandas")

## Scaling

In [2]:
tiny_data = np.array([[ 1., -1.,  2.],
                     [ 2.,  0.,  0.],
                     [ 0.,  1., -1.]])

scaler = StandardScaler().fit(tiny_data)
scaler

In [3]:
scaler.mean_

array([1.        , 0.        , 0.33333333])

In [4]:
scaler.scale_

array([0.81649658, 0.81649658, 1.24721913])

In [5]:
X_scaled = scaler.transform(tiny_data)

In [6]:
X_scaled.mean()

x0    0.0
x1    0.0
x2    0.0
dtype: float64

In [7]:
X_scaled.std()

x0    1.224745
x1    1.224745
x2    1.224745
dtype: float64

## Ordinal Encoding

In [8]:
# example of a ordinal encoding
from numpy import asarray

In [9]:
# define data
data = asarray([['data'], ['wrangling'], ['rocks']])
print(data)

[['data']
 ['wrangling']
 ['rocks']]


In [10]:
# define ordinal encoding
encoder = OrdinalEncoder()
# transform data
encoder.fit_transform(data)

Unnamed: 0,x0
0,0.0
1,2.0
2,1.0


## One Hot Encoding

In [11]:
# define one hot encoding
encoder = OneHotEncoder(sparse_output=False)
# transform data
encoder.fit_transform(data)

Unnamed: 0,x0_data,x0_rocks,x0_wrangling
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,0.0,1.0,0.0


## Imputing missing values

In [35]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True, parser='auto')

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [36]:
X.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [37]:
y_train.head()

221     0
1245    0
1274    0
1136    0
1305    0
Name: survived, dtype: category
Categories (2, object): ['0', '1']

In [38]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1309 non-null   int64   
 1   name       1309 non-null   object  
 2   sex        1309 non-null   category
 3   age        1046 non-null   float64 
 4   sibsp      1309 non-null   int64   
 5   parch      1309 non-null   int64   
 6   ticket     1309 non-null   object  
 7   fare       1308 non-null   float64 
 8   cabin      295 non-null    object  
 9   embarked   1307 non-null   category
 10  boat       486 non-null    object  
 11  body       121 non-null    float64 
 12  home.dest  745 non-null    object  
dtypes: category(2), float64(3), int64(3), object(5)
memory usage: 115.4+ KB


In [39]:
missing = X_test.isnull().sum()
missing = missing[missing > 0].sort_values(ascending = False)

In [40]:
missing

body         291
cabin        253
boat         208
home.dest    141
age           62
embarked       1
dtype: int64

In [41]:
simple_imp = SimpleImputer(missing_values = np.nan, strategy = 'mean')
simple_imputed = simple_imp.fit_transform(X_test[['age', 'body']])

In [42]:
#Replace the age and body columns in the original X_Test dataframe
#with the imputed values
X_test[['age', 'body']] = simple_imputed

#Repeat the above code to get number of NA values- note how the 'age' and 
#'body' columns disappear
missing = X_test.isnull().sum()
missing = missing[missing > 0].sort_values(ascending = False)
missing

cabin        253
boat         208
home.dest    141
embarked       1
dtype: int64

## Putting it all together

In [43]:
X_train.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
221,1,"Ostby, Mr. Engelhart Cornelius",male,65.0,0,1,113509,61.9792,B30,C,,234.0,"Providence, RI"
1245,3,"Thomson, Mr. Alexander Morrison",male,,0,0,32302,8.05,,S,,,
1274,3,"Vander Planke, Mr. Julius",male,31.0,3,0,345763,18.0,,S,,,
1136,3,"Rasmussen, Mrs. (Lena Jacobsen Solvang)",female,,0,0,65305,8.1125,,S,,,
1305,3,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,


In [44]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

# Here we use `StandardScaler` for continuous variables; 
# then we impute for missing data (check the documentation for the imputation method)
# We use `OneHotEncoder` for categorical variables
# NOTE: we are using a subset of the features (not all the columns)

ct = make_column_transformer((make_pipeline(SimpleImputer(),
                                            StandardScaler()), ["age", "fare"]),
                             (OneHotEncoder(sparse_output=False), ["embarked", "sex", "pclass"]), 
                             verbose_feature_names_out=False)

# Note: click on pipeline elements to see more details
clf = make_pipeline(ct, LogisticRegression())
clf

In [45]:
clf.fit(X_train, y_train)
clf.score(X_train, y_train)

0.7828746177370031

In [46]:
# Let's remove the last step in the pipeline (which is LogisticRegression()) & transform the X_test data
clf[:-1].transform(X_test)

Unnamed: 0,age,fare,embarked_C,embarked_Q,embarked_S,embarked_nan,sex_female,sex_male,pclass_1,pclass_2,pclass_3
697,0.043169,-0.488170,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
213,0.123980,1.602394,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
493,0.123980,0.078208,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1056,-1.411443,-0.436627,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
268,-0.441702,0.537753,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
383,-0.684137,-0.431465,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1268,0.131981,-0.471433,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
190,-0.684137,0.896629,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
835,0.131981,-0.500410,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
