### PCA 예측

### Import

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE

from collections import Counter

### Load Data

In [2]:
data_df = pd.read_csv('../titanic.csv')

In [3]:
feature_cols = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Embarked']
target_col = 'Cabin'

In [4]:
cabin_cap = data_df[target_col].dropna().apply(lambda x : x[0])

t_deck_index = cabin_cap[cabin_cap == 'T'].index[0]

del cabin_cap[t_deck_index]

In [5]:
deck_X = data_df.loc[list(data_df[target_col].dropna().index)][feature_cols]

deck_X = np.array(deck_X.drop(t_deck_index))

In [6]:
deck_label_encoder = LabelEncoder()

deck_label_encoder.fit(list(Counter(cabin_cap).keys()))

deck_data = deck_label_encoder.transform(cabin_cap)

### Train / Test Split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(deck_X, deck_data, test_size=0.2, random_state=2, stratify=deck_data)

### Create Pipeline Object

In [8]:
numeric_features = ['SibSp', 'Parch', 'Fare']
numeric_transformer = RobustScaler() # StandardScaler()

categorical_features = ['Pclass', 'Embarked']
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore') 

preprocessor = ColumnTransformer(
    transformers=[ # List of (name, transformer, column(s))
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [9]:
preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])

### Fit & Transform

In [10]:
preprocessor_pipe.fit(pd.DataFrame(X_train, columns=feature_cols))

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', RobustScaler(),
                                                  ['SibSp', 'Parch', 'Fare']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Pclass', 'Embarked'])]))])

In [11]:
X_train_transformed = preprocessor_pipe.transform(pd.DataFrame(X_train, columns=feature_cols))
X_test_transformed = preprocessor_pipe.transform(pd.DataFrame(X_test, columns=feature_cols))

### Over-sampling

In [12]:
smote = SMOTE(k_neighbors=1)

In [13]:
X_train_over, y_train_over = smote.fit_sample(X_train_transformed, y_train)

### 모델 객체 생성

In [14]:
from sklearn import decomposition

In [15]:
model = decomposition.PCA(n_components=8)

### 모델 객체 학습

In [16]:
model.fit(X_train_over)
x = model.transform(X_test_transformed)

In [17]:
x

array([[ 5.00031462e-01, -1.03882242e+00,  5.77745248e-01,
         2.63261632e-01, -1.62404690e-02, -5.25090870e-02,
        -3.60513613e-02, -7.02336214e-03],
       [-4.87171497e-01, -4.65470255e-01, -5.29885942e-01,
         2.34818343e-01, -1.83673698e-01,  9.15399115e-02,
        -7.07236248e-04, -5.84210467e-03],
       [ 5.42341820e-01, -1.03753832e+00,  5.69932736e-01,
         2.72990413e-01,  1.80812788e-03, -6.08816092e-02,
        -3.64929228e-02, -7.13312025e-03],
       [-5.48455540e-01, -4.67330191e-01, -5.18569983e-01,
         2.20726781e-01, -2.09816020e-01,  1.03667013e-01,
        -6.76606371e-05, -5.68312655e-03],
       [-5.44636653e-01, -4.67214290e-01, -5.19275132e-01,
         2.21604890e-01, -2.08186973e-01,  1.02911318e-01,
        -1.07515497e-04, -5.69303320e-03],
       [ 6.26001732e-01, -2.41100479e-01, -7.72544184e-01,
        -4.76965389e-01, -2.00632139e-02, -2.21531844e-01,
        -3.38172303e-02, -1.31889611e-03],
       [-1.05811809e+00, -1.096449

In [18]:
np.argmax(np.cumsum(model.explained_variance_ratio_) >= 0.95 ) + 1

6

In [19]:
model.singular_values_

array([20.29666357, 14.09648576,  9.52201229,  8.91605855,  7.28242389,
        6.70140138,  2.72637471,  1.13599199])

## Model Stacking

### Import

In [20]:
from sklearn.metrics import accuracy_score, roc_auc_score

from sklearn import neighbors, cluster, svm, ensemble
from xgboost import XGBClassifier

from vecstack import StackingTransformer

### Stacking Transformer 객체 생성

In [21]:
# models = [ 
#     ensemble.ExtraTreesClassifier(random_state=2, n_jobs=-1, n_estimators=100, max_depth=3, max_leaf_nodes=8)
#     , ensemble.RandomForestClassifier(random_state=2, n_jobs=-1, n_estimators=100, max_depth=3, max_leaf_nodes=8) 
#     , XGBClassifier(seed=0, n_jobs=-1, learning_rate=0.1, n_estimators=100, max_depth = 3, max_leaf_nodes=8)
#     , neighbors.KNeighborsClassifier(1)
#     , cluster.KMeans(n_clusters=8)
#     , svm.SVC(C=1000, gamma=1, kernel='rbf')
#     , svm.LinearSVC(C=10, random_state=2)
# ]
estimators = [ 
    ('ExtraTrees', ensemble.ExtraTreesClassifier(random_state=2, n_jobs=-1, n_estimators=100, max_depth=4, max_leaf_nodes=8))
    , ('RandomForest', ensemble.RandomForestClassifier(random_state=2, n_jobs=-1, n_estimators=100, max_depth=4, max_leaf_nodes=8))
    , ('GB', ensemble.GradientBoostingClassifier(n_estimators=100, max_depth=4, min_samples_split=5, learning_rate=0.01, max_leaf_nodes=8))
    , ('XGB', XGBClassifier(seed = 0, n_jobs = -1, learning_rate = 0.01, n_estimators = 100, max_depth = 3, eval_metric='mlogloss', max_leaf_nodes=8))
    , ('KNN', neighbors.KNeighborsClassifier(1))
    , ('KNN2', neighbors.KNeighborsClassifier(2))
    , ('SVC', svm.SVC(C=1000, gamma=1, kernel='sigmoid', random_state=2))
]

In [22]:
# S_train, S_test = stacking(models, 
#                            X_train_over, y_train_over, X_test_transformed, 
#                            regression = False, 
#                            metric = accuracy_score, 
#                            n_folds = 5, stratified = True, shuffle = True, 
#                            random_state = 2, verbose = 2)
stack = StackingTransformer(estimators, 
                            regression = False, 
                            metric = accuracy_score, 
                            n_folds = 3, stratified = True, shuffle = True, 
                            random_state = 2, verbose = 2)


### Fit

In [23]:
stack = stack.fit(X_train_over, y_train_over)

# stack = stack.fit(X_train_transformed, y_train)

task:         [classification]
n_classes:    [7]
metric:       [accuracy_score]
variant:      [A]
n_estimators: [7]

estimator  0: [ExtraTrees: ExtraTreesClassifier]
    fold  0:  [0.62727273]
    fold  1:  [0.57272727]
    fold  2:  [0.54128440]
    ----
    MEAN:     [0.58042813] + [0.03552441]

estimator  1: [RandomForest: RandomForestClassifier]
    fold  0:  [0.61818182]
    fold  1:  [0.57272727]
    fold  2:  [0.56880734]
    ----
    MEAN:     [0.58657214] + [0.02240863]

estimator  2: [GB: GradientBoostingClassifier]
    fold  0:  [0.75454545]
    fold  1:  [0.70909091]
    fold  2:  [0.69724771]
    ----
    MEAN:     [0.72029469] + [0.02469685]

estimator  3: [XGB: XGBClassifier]
Parameters: { "max_leaf_nodes" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you fi

### Stacked Feature

In [24]:
S_train = stack.transform(X_train_over)
S_test = stack.transform(X_test_transformed)

Train set was detected.
Transforming...

estimator  0: [ExtraTrees: ExtraTreesClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    ----
    DONE

estimator  1: [RandomForest: RandomForestClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    ----
    DONE

estimator  2: [GB: GradientBoostingClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    ----
    DONE

estimator  3: [XGB: XGBClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    ----
    DONE

estimator  4: [KNN: KNeighborsClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    ----
    DONE

estimator  5: [KNN2: KNeighborsClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    ----
    DONE

estimator  6: [SVC: SVC]
    model from fold  0: done


### Second Level Estimator

In [25]:
model = XGBClassifier(seed=0, n_jobs=-1, learning_rate=0.1, n_estimators=100, max_depth=4, eval_metric='mlogloss', max_leaf_nodes=8) 
model = model.fit(S_train, y_train_over) 

Parameters: { "max_leaf_nodes" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




### Prediction

In [26]:
y_pred = model.predict(S_test) 
print('Final prediction score: [%.8f]' % accuracy_score(y_test, y_pred))

Final prediction score: [0.58536585]


스태킹을 Optimize 해도 KNN을 넘기 힘들것 같다