# Feature Engineering



In [58]:
%matplotlib inline

import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score
from sklearn.datasets import fetch_california_housing
from sklearn.tree import export_graphviz
from sklearn.model_selection import GridSearchCV
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import graphviz
#import pydotplus

from IPython.display import Image
import warnings
warnings.filterwarnings("ignore")


# Increase viewable area of Pandas tables, numpy arrays, plots
#pd.set_option('max_rows', 15, 'max_columns', 500, 'max_colwidth', 1, 'precision', 2)
np.set_printoptions(linewidth=10000, precision=4, edgeitems=20, suppress=True)
sns.set()
plt.rcParams['figure.figsize'] = [16, 6]

### Data  - Titanic data set

Сегодня будем работать с данными о пассажирах Титаника (Titatic dataset).

In [59]:
#data = pd.read_csv('titanic.csv')
data = pd.read_csv('https://grantmlong.com/data/titanic.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Выкинем колонки заведомо ненужные колонки.

In [60]:
cols_2_drop = ['PassengerId', 'Ticket', 'Cabin', 'Name']
data = data.drop(cols_2_drop, axis=1)


# Basic EDA

In [61]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [62]:
data.Embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [63]:
data[pd.isnull(data.Embarked)]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
61,1,1,female,38.0,0,0,80.0,
829,1,1,female,62.0,0,0,80.0,


Мы видим, что поле `Age` содержит много пропусков. Заполним пропуски для этого признака специальным значение `-999`. Остальные объекты, содержащие пропуски просто выкинем из рассмотрения.

In [64]:
data.loc[:, 'Age'] = data.Age.fillna(-999)
data = data.dropna()
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


В данных два категориальных признака:  **Sex** и **Embarked**. Воспользуемся изученными методами кодирования, чтобы их закодировать. Будет кодировать признаки разными способами и сравнивать качество для разных методов.

In [65]:
data[['Sex','Embarked']].nunique()

Sex         2
Embarked    3
dtype: int64

Зададим функцию, которая кодирует категориальные признаки выбранным методом.

In [66]:
def encode_func(data, enc, cols = ['Sex','Embarked']):
  data_enc = data.copy()
  data_enc[cols]= enc.fit_transform(data_enc[cols])
  return data_enc


Будем оценивать качество на алгоритме kNN.

Качество будем оценивать по F1 мере.

In [67]:
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

def evaluate_encoding(data_enc):
  knn = KNeighborsClassifier()
  rf = RandomForestClassifier(random_state = 123)
  lr = LogisticRegression(random_state = 123)
  scaler = StandardScaler()
  scaled_data = scaler.fit_transform(data_enc.drop('Survived', axis = 1))
  lr_res = np.mean(cross_val_score(lr, scaled_data, y = data_enc.Survived, cv = 5, scoring = 'f1'))
  knn_res = np.mean(cross_val_score(knn, scaled_data, y = data_enc.Survived, cv = 5, scoring = 'f1'))
  rf_res = np.mean(cross_val_score(rf, scaled_data, y = data_enc.Survived, cv = 5, scoring = 'f1'))
  print('F1 results')
  print('LogReg', round(lr_res,4))
  print('KNN', round(knn_res,4))
  print('RF', round(rf_res,4))
  return [lr_res, knn_res, rf_res]


In [68]:
results = {}

# Ordinary Encoder

In [69]:
from sklearn.preprocessing import OrdinalEncoder
OrdinalEncoder?

In [70]:
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
data_enc = encode_func(data, enc)
data_enc.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1.0,22.0,1,0,7.25,2.0
1,1,1,0.0,38.0,1,0,71.2833,0.0
2,1,3,0.0,26.0,0,0,7.925,2.0
3,1,1,0.0,35.0,1,0,53.1,2.0
4,0,3,1.0,35.0,0,0,8.05,2.0


In [71]:
data_enc[['Sex','Embarked']].nunique()

Sex         2
Embarked    3
dtype: int64

In [72]:
results['Ordinal_encoding'] = evaluate_encoding(data_enc)

F1 results
LogReg 0.7139
KNN 0.7144
RF 0.7484


# CountEncoder



In [73]:
!pip install category_encoders



In [74]:
from category_encoders import CountEncoder
enc = CountEncoder()
data_enc = encode_func(data, enc)
data_enc.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,577,22.0,1,0,7.25,644
1,1,1,312,38.0,1,0,71.2833,168
2,1,3,312,26.0,0,0,7.925,644
3,1,1,312,35.0,1,0,53.1,644
4,0,3,577,35.0,0,0,8.05,644


Мы видим, что поле `Sex` было закодировано с помощью чисел 577 и 312. Действительно, ведь в данных 312 женщин и 577 мужчин.

In [75]:
data[['Embarked']].value_counts()

Embarked
S           644
C           168
Q            77
Name: count, dtype: int64

In [76]:
results['Count_encoding'] = evaluate_encoding(data_enc)

F1 results
LogReg 0.7144
KNN 0.7081
RF 0.7488


In [77]:
data_enc['Sex'] = data_enc['Sex']/data.shape[0]
data_enc['Embarked'] = data_enc['Embarked']/data.shape[0]
data_enc.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0.649044,22.0,1,0,7.25,0.724409
1,1,1,0.350956,38.0,1,0,71.2833,0.188976
2,1,3,0.350956,26.0,0,0,7.925,0.724409
3,1,1,0.350956,35.0,1,0,53.1,0.724409
4,0,3,0.649044,35.0,0,0,8.05,0.724409


In [78]:
results['Frequency_encoding'] = evaluate_encoding(data_enc)

F1 results
LogReg 0.7144
KNN 0.7081
RF 0.7488


# OneHotEncoding

In [79]:
from category_encoders import OrdinalEncoder, OneHotEncoder
enc = OneHotEncoder()
enc.fit_transform(data[['Sex', 'Embarked']]).head()

Unnamed: 0,Sex_1,Sex_2,Embarked_1,Embarked_2,Embarked_3
0,1,0,1,0,0
1,0,1,0,1,0
2,0,1,1,0,0
3,0,1,1,0,0
4,1,0,1,0,0


In [80]:
data_enc = data.drop(['Sex', 'Embarked'], axis = 1).join(enc.fit_transform(data[['Sex', 'Embarked']], axis = 0))
data_enc.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_1,Sex_2,Embarked_1,Embarked_2,Embarked_3
0,0,3,22.0,1,0,7.25,1,0,1,0,0
1,1,1,38.0,1,0,71.2833,0,1,0,1,0
2,1,3,26.0,0,0,7.925,0,1,1,0,0
3,1,1,35.0,1,0,53.1,0,1,1,0,0
4,0,3,35.0,0,0,8.05,1,0,1,0,0


In [81]:
results['One_hot_encoding'] = evaluate_encoding(data_enc)

F1 results
LogReg 0.7152
KNN 0.7087
RF 0.7374


На самом деле, при кодировании с помощью `OneHotEncoding` часть колонок является избыточными. Выкинем их и посмотрим, как поменяется результат.

In [82]:
data_enc = data.drop(['Sex', 'Embarked'], axis = 1).join(enc.fit_transform(data[['Sex', 'Embarked']], axis = 0)).drop(['Sex_2', 'Embarked_3'], axis = 1)
data_enc.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_1,Embarked_1,Embarked_2
0,0,3,22.0,1,0,7.25,1,1,0
1,1,1,38.0,1,0,71.2833,0,0,1
2,1,3,26.0,0,0,7.925,0,1,0
3,1,1,35.0,1,0,53.1,0,1,0
4,0,3,35.0,0,0,8.05,1,1,0


In [83]:
results['One_hot_encoding_short'] = evaluate_encoding(data_enc)

F1 results
LogReg 0.7171
KNN 0.711
RF 0.7275


# Объединим результаты

In [84]:
pd.DataFrame(results, index = ['Logistic Regression', 'KNN', 'Random Forest']).T*100

Unnamed: 0,Logistic Regression,KNN,Random Forest
Ordinal_encoding,71.386903,71.438251,74.836136
Count_encoding,71.43568,70.813333,74.883211
Frequency_encoding,71.43568,70.813333,74.883211
One_hot_encoding,71.522564,70.870953,73.744904
One_hot_encoding_short,71.712106,71.101504,72.754164
