## Pipeline 전처리

### Import

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from collections import Counter

### Data Load

In [2]:
data_df = pd.read_csv('../titanic.csv')

In [3]:
feature_cols = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Embarked']
target_col = 'Cabin'

In [4]:
cabin_cap = data_df[target_col].dropna().apply(lambda x : x[0])

t_deck_index = cabin_cap[cabin_cap == 'T'].index[0]

del cabin_cap[t_deck_index]

In [5]:
deck_X = data_df.loc[list(data_df['Cabin'].dropna().index)][feature_cols]

deck_X = np.array(deck_X.drop(t_deck_index))

In [6]:
deck_label_encoder = LabelEncoder()

deck_label_encoder.fit(list(Counter(cabin_cap).keys()))

deck_data = deck_label_encoder.transform(cabin_cap)

### Train / Test Split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(deck_X, deck_data, test_size=0.2, random_state=2, stratify=deck_data)

### Create Pipeline Object

In [8]:
numeric_features = ['SibSp', 'Parch', 'Fare']
numeric_transformer = RobustScaler() # StandardScaler()

categorical_features = ['Pclass', 'Embarked']
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore') 

preprocessor = ColumnTransformer(
    transformers=[ # List of (name, transformer, column(s))
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [9]:
preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])

### Fit & Transform

In [10]:
preprocessor_pipe.fit(pd.DataFrame(X_train, columns=feature_cols))

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', RobustScaler(),
                                                  ['SibSp', 'Parch', 'Fare']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Pclass', 'Embarked'])]))])

In [11]:
X_train_transformed = preprocessor_pipe.transform(pd.DataFrame(X_train, columns=feature_cols))
X_test_transformed = preprocessor_pipe.transform(pd.DataFrame(X_test, columns=feature_cols))

In [12]:
X_train_transformed[0]

array([ 0.        ,  0.        , -0.49326395,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ])

### Grid Search

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# 아래 param_grid dict 의 C & gamma 에 후보 Hyper-params 값들을 리스트업합니다.
param_grid = {'C' : [0.1, 1, 10, 100, 1000], 
             'gamma' : [1, 0.1, 0.01, 0.001, 0.0001],
             'kernel' : ['rbf']}

grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2)

grid.fit(X_train_transformed, y_train)
print('The best parameters are ', grid.best_params_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=0.0

[CV] END .....................C=1000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END .....................C=1000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END .....................C=1000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ....................C=1000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV] END ....................C=1000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV] END ....................C=1000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV] END ....................C=1000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV] END ....................C=1000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV] END ...................C=1000, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV] END ...................C=1000, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV] END ...................C=1000, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV] END ...................C=1000, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV] END ...................

In [14]:
from sklearn.metrics import classification_report

grid_predictions = grid.predict(X_test_transformed)
print(classification_report(y_test, grid_predictions)) # Precision, Recall, F1-score 등을 확인할 수 있습니다.

print("Accuracy on Training set: {:.3f}".format(grid.score(X_train_transformed, y_train)))
print("Accuracy on Test set: {:.3f}".format(grid.score(X_test_transformed, y_test)))

              precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.58      0.78      0.67         9
           2       0.47      0.58      0.52        12
           3       0.67      0.29      0.40         7
           4       0.17      0.17      0.17         6
           5       0.33      0.33      0.33         3
           6       1.00      1.00      1.00         1

    accuracy                           0.49        41
   macro avg       0.60      0.50      0.51        41
weighted avg       0.52      0.49      0.48        41

Accuracy on Training set: 0.765
Accuracy on Test set: 0.488


In [15]:
from sklearn.svm import LinearSVC

# 아래 param_grid dict 의 C & gamma 에 후보 Hyper-params 값들을 리스트업합니다.
param_grid = {'C' : [0.1, 1, 10, 100, 1000],
             'random_state' : [2]}

grid = GridSearchCV(LinearSVC(), param_grid, refit=True, verbose=2)

grid.fit(X_train_transformed, y_train)
print('The best parameters are ', grid.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END ..............................C=0.1, random_state=2; total time=   0.0s
[CV] END ..............................C=0.1, random_state=2; total time=   0.0s
[CV] END ..............................C=0.1, random_state=2; total time=   0.0s
[CV] END ..............................C=0.1, random_state=2; total time=   0.0s
[CV] END ..............................C=0.1, random_state=2; total time=   0.0s
[CV] END ................................C=1, random_state=2; total time=   0.0s
[CV] END ................................C=1, random_state=2; total time=   0.0s
[CV] END ................................C=1, random_state=2; total time=   0.0s
[CV] END ................................C=1, random_state=2; total time=   0.0s
[CV] END ................................C=1, random_state=2; total time=   0.0s
[CV] END ...............................C=10, random_state=2; total time=   0.0s
[CV] END ...............................C=10, ran

In [16]:
LinearSVC().get_params().keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'loss', 'max_iter', 'multi_class', 'penalty', 'random_state', 'tol', 'verbose'])

In [17]:
grid_predictions = grid.predict(X_test_transformed)
print(classification_report(y_test, grid_predictions))

print("Accuracy on Training set: {:.3f}".format(grid.score(X_train_transformed, y_train)))
print("Accuracy on Test set: {:.3f}".format(grid.score(X_test_transformed, y_test)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.30      0.33      0.32         9
           2       0.33      0.50      0.40        12
           3       0.00      0.00      0.00         7
           4       0.17      0.17      0.17         6
           5       0.40      0.67      0.50         3
           6       0.50      1.00      0.67         1

    accuracy                           0.32        41
   macro avg       0.24      0.38      0.29        41
weighted avg       0.23      0.32      0.26        41

Accuracy on Training set: 0.426
Accuracy on Test set: 0.317


### Over-sampling

In [18]:
from imblearn.over_sampling import SMOTE

In [19]:
smote = SMOTE(k_neighbors=1)

In [20]:
X_train_over, y_train_over = smote.fit_sample(X_train_transformed, y_train)

In [21]:
len(X_train_over)

329

### Grid Search after Over-sampling

In [22]:
param_grid = {'C' : [0.1, 1, 10, 100, 1000], 
             'gamma' : [1, 0.1, 0.01, 0.001, 0.0001],
             'kernel' : ['rbf']}

grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2)

grid.fit(X_train_over, y_train_over)
print('The best parameters are ', grid.best_params_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=0.0

[CV] END ......................C=1000, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END ......................C=1000, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .....................C=1000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END .....................C=1000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END .....................C=1000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END .....................C=1000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END .....................C=1000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ....................C=1000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV] END ....................C=1000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV] END ....................C=1000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV] END ....................C=1000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV] END ....................C=1000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV] END ...................

In [23]:
grid_predictions = grid.predict(X_test_transformed)
print(classification_report(y_test, grid_predictions)) # Precision, Recall, F1-score 등을 확인할 수 있습니다.

print("Accuracy on Training set: {:.3f}".format(grid.score(X_train_over, y_train_over)))
print("Accuracy on Test set: {:.3f}".format(grid.score(X_test_transformed, y_test)))

              precision    recall  f1-score   support

           0       0.20      0.33      0.25         3
           1       0.64      0.78      0.70         9
           2       0.62      0.42      0.50        12
           3       0.67      0.29      0.40         7
           4       0.30      0.50      0.37         6
           5       0.33      0.33      0.33         3
           6       1.00      1.00      1.00         1

    accuracy                           0.49        41
   macro avg       0.54      0.52      0.51        41
weighted avg       0.54      0.49      0.49        41

Accuracy on Training set: 0.839
Accuracy on Test set: 0.488


In [24]:
param_grid = {'C' : [0.1, 1, 10, 100, 1000],
             'random_state' : [2]}

grid = GridSearchCV(LinearSVC(), param_grid, refit=True, verbose=2)

grid.fit(X_train_over, y_train_over)
print('The best parameters are ', grid.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END ..............................C=0.1, random_state=2; total time=   0.0s
[CV] END ..............................C=0.1, random_state=2; total time=   0.0s
[CV] END ..............................C=0.1, random_state=2; total time=   0.0s
[CV] END ..............................C=0.1, random_state=2; total time=   0.0s
[CV] END ..............................C=0.1, random_state=2; total time=   0.0s
[CV] END ................................C=1, random_state=2; total time=   0.0s
[CV] END ................................C=1, random_state=2; total time=   0.0s
[CV] END ................................C=1, random_state=2; total time=   0.0s
[CV] END ................................C=1, random_state=2; total time=   0.0s
[CV] END ................................C=1, random_state=2; total time=   0.0s
[CV] END ...............................C=10, random_state=2; total time=   0.0s
[CV] END ...............................C=10, ran

In [25]:
grid_predictions = grid.predict(X_test_transformed)
print(classification_report(y_test, grid_predictions))

print("Accuracy on Training set: {:.3f}".format(grid.score(X_train_over, y_train_over)))
print("Accuracy on Test set: {:.3f}".format(grid.score(X_test_transformed, y_test)))

              precision    recall  f1-score   support

           0       0.17      0.67      0.27         3
           1       0.43      0.33      0.38         9
           2       0.42      0.42      0.42        12
           3       0.50      0.14      0.22         7
           4       0.00      0.00      0.00         6
           5       0.33      0.67      0.44         3
           6       0.50      1.00      0.67         1

    accuracy                           0.34        41
   macro avg       0.34      0.46      0.34        41
weighted avg       0.35      0.34      0.31        41

Accuracy on Training set: 0.587
Accuracy on Test set: 0.341


## 기타 모델 테스트

### 모델 객체 생성

In [26]:
from sklearn import neighbors, cluster, svm

In [27]:
knn_model = neighbors.KNeighborsClassifier(1)
kmeans_model = cluster.KMeans(n_clusters=8)
svm_model = svm.SVC()
lsvm_model = svm.LinearSVC()

### 학습 & 테스트

In [28]:
knn_model.fit(X_train_transformed, y_train)
kmeans_model.fit(X_train_transformed, y_train)
svm_model.fit(X_train_transformed, y_train)
lsvm_model.fit(X_train_transformed, y_train)

print(knn_model.predict(X_test_transformed))
print(knn_model.score(X_test_transformed, y_test))

print()

print(kmeans_model.predict(X_test_transformed))
print(kmeans_model.score(X_test_transformed, y_test))

print()

print(svm_model.predict(X_test_transformed))
print(svm_model.score(X_test_transformed, y_test))

print()

print(lsvm_model.predict(X_test_transformed))
print(lsvm_model.score(X_test_transformed, y_test))

[3 2 1 4 2 2 3 1 5 4 1 3 4 6 4 1 2 2 2 2 1 1 4 1 3 2 1 2 1 6 3 2 5 1 1 2 0
 0 4 3 3]
0.6829268292682927

[7 0 7 0 0 0 6 0 6 2 0 2 0 6 5 0 7 1 0 0 0 3 6 0 0 2 1 0 1 6 6 0 6 7 5 1 0
 0 7 0 5]
-35.51295898718044

[1 4 1 4 4 2 5 1 5 2 2 2 2 5 2 4 1 2 2 2 4 1 5 2 2 2 2 2 2 5 5 2 5 1 2 2 4
 2 1 4 1]
0.3170731707317073

[1 4 1 4 4 2 5 1 5 2 2 2 2 6 2 4 1 2 2 2 4 2 5 2 2 2 2 2 1 6 5 2 5 1 2 1 4
 2 1 1 1]
0.3170731707317073


In [29]:
knn_model = neighbors.KNeighborsClassifier(1)
kmeans_model = cluster.KMeans(n_clusters=8)
svm_model = svm.SVC()
lsvm_model = svm.LinearSVC()

In [30]:
knn_model.fit(X_train_over, y_train_over)
kmeans_model.fit(X_train_over, y_train_over)
svm_model.fit(X_train_over, y_train_over)
lsvm_model.fit(X_train_over, y_train_over)

print(knn_model.predict(X_train_over))
print(knn_model.score(X_train_over, y_train_over))

print()

print(kmeans_model.predict(X_train_over))
print(kmeans_model.score(X_train_over, y_train_over))

print()

print(svm_model.predict(X_train_over))
print(svm_model.score(X_train_over, y_train_over))

print()

print(lsvm_model.predict(X_train_over))
print(lsvm_model.score(X_train_over, y_train_over))

[4 1 0 3 2 4 4 2 4 2 4 2 1 3 1 4 2 4 3 4 2 2 6 6 4 0 2 1 2 3 3 2 3 1 3 2 1
 2 2 2 4 1 4 2 1 1 1 2 0 1 2 0 0 1 1 1 5 4 2 2 3 2 4 1 1 4 1 2 4 1 3 2 1 0
 2 4 1 3 2 3 2 0 4 4 4 2 4 4 2 3 1 4 1 6 0 3 4 1 3 2 1 4 0 5 3 2 3 4 3 2 2
 2 1 4 1 1 0 2 0 0 1 3 5 4 2 4 2 2 3 1 2 2 1 5 5 1 1 4 5 3 1 2 3 2 4 5 1 3
 4 4 2 3 4 0 1 4 2 5 0 2 1 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 5 5 4 4 5 4 5 4 5 5 5
 5 5 5 5 4 5 5 5 5 5 5 5 4 5 5 5 5 4 5 5 5 5 5 5 5 5 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6]
0.9331306990881459

[4 6 2 7 3 2 7 4 2 6 4 1 6 4 7 0 2 4 6 4 3 1 0 0 2 4 4 6 7 4 2 2 2 3 4 4 6
 4 4 2 5 1 6 7 4 7 6 6 6 7 6 4 6 1 2 4 0 4 2 1 7 4 4 2 4 4 4 4 5 4 4 7 7 7
 4 4 1 6 3 7 4 4 4 7 4 2 5 4 2 4 6 4 4 0 6 7 7 6 4 1 2 4 6 5 7 1 4 4 7 2 4
 2 2 4 6 3 6 4 4 4 7 2 5 5 7 4 3 2 2 2 4 4 6 0 0 6 7 0 5 6 6 6 2 2 5 0 4 2
 4 4 4 6 7 4

### Cross Validation with KNN

In [31]:
from sklearn import model_selection

In [32]:
kf = model_selection.KFold(n_splits=10, shuffle=True, random_state=2)

valid_scores_kf = model_selection.cross_val_score(neighbors.KNeighborsClassifier(1), X_train_over, y_train_over,
                                                  cv=kf, verbose=1, n_jobs=-1)

stratified_kf = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=2)

valid_scores_s_kf = model_selection.cross_val_score(neighbors.KNeighborsClassifier(1), X_train_over, y_train_over,
                                                    cv=stratified_kf, verbose=1, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   14.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.6s finished


In [33]:
print('Cross-Validation Score (K-Fold): {:.2f}%'.format(np.mean(valid_scores_kf * 100)))
print('Cross-Validation Score (Stratified K-Fold): {:.2f}%'.format(np.mean(valid_scores_s_kf * 100)))

Cross-Validation Score (K-Fold): 69.65%
Cross-Validation Score (Stratified K-Fold): 70.20%


### CV with SVM

In [34]:
kf = model_selection.KFold(n_splits=10, shuffle=True, random_state=2)

params = {'C': 1000, 'gamma': 1, 'kernel': 'rbf'}

valid_scores_kf = model_selection.cross_val_score(svm.SVC(**params), X_train_over, y_train_over,
                                                  cv=kf, verbose=1, n_jobs=-1)

stratified_kf = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=2)

valid_scores_s_kf = model_selection.cross_val_score(svm.SVC(**params), X_train_over, y_train_over,
                                                    cv=stratified_kf, verbose=1, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [35]:
print('Cross-Validation Score (K-Fold): {:.2f}%'.format(np.mean(valid_scores_kf * 100)))
print('Cross-Validation Score (Stratified K-Fold): {:.2f}%'.format(np.mean(valid_scores_s_kf * 100)))

Cross-Validation Score (K-Fold): 70.83%
Cross-Validation Score (Stratified K-Fold): 71.42%


In [36]:
kf = model_selection.KFold(n_splits=10, shuffle=True, random_state=2)

params = {'C': 10, 'random_state': 2}

valid_scores_kf = model_selection.cross_val_score(svm.LinearSVC(**params), X_train_over, y_train_over,
                                                  cv=kf, verbose=1, n_jobs=-1)

stratified_kf = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=2)

valid_scores_s_kf = model_selection.cross_val_score(svm.LinearSVC(**params), X_train_over, y_train_over,
                                                    cv=stratified_kf, verbose=1, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.2s finished


In [37]:
print('Cross-Validation Score (K-Fold): {:.2f}%'.format(np.mean(valid_scores_kf * 100)))
print('Cross-Validation Score (Stratified K-Fold): {:.2f}%'.format(np.mean(valid_scores_s_kf * 100)))

Cross-Validation Score (K-Fold): 53.20%
Cross-Validation Score (Stratified K-Fold): 54.40%


K-Fold 적용시 k=1 인 KNN 또는  'C': 1000, 'gamma': 1, 'kernel': 'rbf'인 SVC를 사용하는것이 가장 나은 결과를 보였지만 70% 정도에 그쳤다

하지만 k=1인 단순 KNN의 경우 0.9 이상의 점수를 보였기에 채택할만 하다고 느껴진다