In [53]:
## 모듈 임포트
import numpy as np 
import pandas as pd
import warnings
warnings.simplefilter("ignore")


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split  ## train test 

from sklearn.preprocessing import LabelEncoder      ## object 정수 인코딩
from sklearn.preprocessing import OneHotEncoder     ## object one-hot
from sklearn.preprocessing import StandardScaler    ## 표준 정규 분포 이용

from sklearn.linear_model import LinearRegression ## 회귀 분석
from sklearn.svm import SVC                       ## svm 분류모델
from sklearn.tree import DecisionTreeClassifier   ## Tree 분류모델
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


from sklearn.metrics import accuracy_score       ## 정확도 검증

In [2]:
df = pd.read_csv('https://gist.githubusercontent.com/michhar/2dfd2de0d4f8727f873422c5d959fff5/raw/fa71405126017e6a37bea592440b4bee94bf7b9e/titanic.csv')

In [3]:
## 원본 데이터는 유지 
dt = df.copy()

In [4]:
## Na 데이터 채우기
dt.Age.fillna(dt.Age.mean(),inplace=True)
dt.Embarked.fillna('N',inplace=True)
dt.Cabin.fillna('N',inplace=True)


In [21]:
## object 
## Sex ,Embarked, new_Cabin
gender_enc = LabelEncoder()
gender = gender_enc.fit_transform(dt.Sex)
embark_enc = LabelEncoder()
embark = embark_enc.fit_transform(dt.Embarked)
cabin_enc = LabelEncoder()
cabin = cabin_enc.fit_transform(dt.Cabin.str.slice(0,1))
pclass_enc = OneHotEncoder()
pclass = pclass_enc.fit_transform(dt.Pclass.values.reshape(-1,1)).toarray()
age_enc = LabelEncoder()
age = age_enc.fit_transform(pd.cut(dt.Age,6))
age_oenc = OneHotEncoder()
age = age_oenc.fit_transform(age.reshape(-1,1)).toarray()
enc_data = np.concatenate([gender.reshape(-1,1),embark.reshape(-1,1),
                cabin.reshape(-1,1),pclass,age],axis = 1)
adult = np.where(df.Name.str.contains('Mr','Miss'),1,0)
##

In [22]:
## X,y 
feature_col = ['SibSp','Parch']
dep_col = ['Survived']
X = pd.concat([dt[feature_col],pd.DataFrame(enc_data),pd.Series(adult)],axis=1)
y = dt[dep_col]

In [31]:
## 모델학습 with cross validation 5times

n_split = 5
cross_acc = 0
for i in np.arange(n_split):
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)
    lor_clf.fit(X_train, y_train)
    acc = lor_clf.score(X_test, y_test)
    cross_acc += acc
cross_acc/n_split


0.8011173184357542

# ==========================================================

# **SupportVectorClassifier**

In [165]:
## 모델학습 with cross_validation using sckit-learn function

n_splits = 5
test_size = 0.2
cv = ShuffleSplit(n_splits = n_splits, test_size = test_size)
np.mean(cross_val_score(lor_clf, X, y, cv=cv))

0.7932960893854749

In [166]:
Cs = np.linspace(0.1,20,20)
gammas = np.linspace(0.001,2,10)
kernels = ['linear', 'rbf', 'sigmoid']


params = { 'kernel': kernels, 'C' : Cs, 'gamma' : gammas}
clf = GridSearchCV( SVC(), params, cv = n_splits, n_jobs = -1)
clf.fit(X,y)
params = clf.best_params_
params['probability'] = True
svc_clf = SVC(**params)
svc_clf.fit(X,y)



SVC(C=2.194736842105263, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.22311111111111112,
    kernel='rbf', max_iter=-1, probability=True, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [152]:
n_splits = 5
test_size = 0.2
cv = ShuffleSplit(n_splits = n_splits, test_size = test_size)
np.mean(cross_val_score(svc_clf, X, y, cv=cv))

0.8033519553072626

In [153]:
Cs = np.linspace(0.1,20,20)
gammas = np.linspace(0.001,2,10)
kernels = ['linear', 'rbf', 'sigmoid']

n_splits = 5
test_size = 0.2
param = []

for kernel in kernels:
    for C in Cs:
        for gamma in gammas:
            cv = ShuffleSplit(n_splits = n_splits, test_size = test_size)
            svc_clf = SVC(kernel = kernel, C=C, gamma = gamma)
            score = np.mean(cross_val_score(svc_clf, X, y, cv=cv))
            param.append((kernel, C, gamma, score))
param

KeyboardInterrupt: 

In [64]:
param = pd.DataFrame(param)

In [65]:
param.columns = ['kernel','C','gamma','score']

In [66]:
best_param = param.loc[param.score == param.score.max(), ['kernel','C','gamma']].to_dict('r')[0]
best_param
svc_clf = SVC(**best_param)

In [67]:
svc_clf

SVC(C=16.857894736842105, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

# ==========================================================

# **DecisionTreeClassifier**

In [148]:
dt_clf= DecisionTreeClassifier()
dt_clf.fit(X,y)
dt_clf.score(X,y)

0.8956228956228957

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2)

In [76]:
dt_clf_split = DecisionTreeClassifier()
dt_clf_split.fit(X_train, y_train)
dt_clf_split.score(X_test,y_test)

0.7877094972067039

In [78]:
max_depth = np.arange(3,10)
min_samples_split = np.arange(2,5)
criterion = np.linspace(0.1,20,20)

In [85]:
params = { 'criterion' : ['gini','entropy'], 'max_depth' : max_depth, 'min_samples_split' : min_samples_split}
clf = GridSearchCV( DecisionTreeClassifier(), params, cv = 5, n_jobs = -1)
clf.fit(X,y)
clf.best_score_

0.8249158249158249

In [86]:
dt_clf = DecisionTreeClassifier(**clf.best_params_)
dt_clf.fit(X,y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [92]:
n_splits = 5
test_size = 0.2
cv = ShuffleSplit(n_splits = n_splits, test_size = test_size)
np.mean(cross_val_score(dt_clf, X, y, cv=cv))

0.8212290502793296

# =================================================

# **LogisticRegression**

In [189]:
lor_clf= LogisticRegression()
lor_clf.fit(X,y)
lor_clf.score(X,y)

0.8069584736251403

In [190]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2)

In [191]:
lor_clf_split = LogisticRegression()
lor_clf_split.fit(X_train, y_train)
lor_clf_split.score(X_test,y_test)

0.776536312849162

In [192]:
penalty = ['l1', 'l2']
C = np.linspace(0.1,2,10)
params = { 'penalty' : ['l1', 'l2'], 
         'C' : np.linspace(0.1,2,10)}
clf = GridSearchCV( LogisticRegression(), params, cv = 5, n_jobs = -1)
clf.fit(X,y)
clf.best_score_

0.8013468013468014

In [193]:
lor_clf = LogisticRegression(**clf.best_params_)
lor_clf.fit(X,y)
lor_clf.score(X,y)

0.8092031425364759

In [194]:
n_splits = 5
test_size = 0.2
cv = ShuffleSplit(n_splits = n_splits, test_size = test_size)
np.mean(cross_val_score(lor_clf, X, y, cv=cv))

0.8223463687150836

# ================================================

# **LinearRegression**

In [195]:
lr_clf= LinearRegression()
lr_clf.fit(X,y)
lr_clf.score(X,y)

0.4125608803800451

# =================================================

# **RandomForestClassifier**

In [196]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X,y)
rf_clf.score(X,y)

0.8922558922558923

# ======================================================

# **ensemble 모델 voting**

In [113]:
# svc_clf.score(X,y)
# lor_clf.score(X,y)
# dt_clf.score(X,y)
# rf_clf.score(X,y)
# lr_clf.score(X,y)

0.8742985409652076

In [126]:
def hard_voting(X,y):
    svc_result = svc_clf.predict(X)
    lor_result = lor_clf.predict(X)
    dt_result = dt_clf.predict(X)
    rf_result = rf_clf.predict(X)
    lr_result = np.where(lr_clf.predict(X) > 0.5, 1, 0).reshape(-1,)
    result = svc_result + lor_result + dt_result + rf_result + lr_result
    predict_result = np.where(result/5 > 0.5, 1, 0)
    return accuracy_score(predict_result,y)

In [127]:
hard_voting(X,y)

0.8540965207631874

In [154]:
def soft_voting(X,y):
    svc_result = svc_clf.predict_proba(X)
    lor_result = lor_clf.predict_proba(X)
    dt_result = dt_clf.predict_proba(X)
    rf_result = rf_clf.predict_proba(X)
    lr_result = lr_clf.predict(X)
    result = svc_result + lor_result + dt_result + rf_result + lr_result
    predict_result = np.where(result[:,0] > result[:,1],0,1)
    return accuracy_score(y,predict_result)

In [157]:
a = np.arange(10).reshape(-1,1)
b = np.arange(10).reshape(-1,1)
np.concatenate([a,b],axis =1)

array([[0, 0],
       [1, 1],
       [2, 2],
       [3, 3],
       [4, 4],
       [5, 5],
       [6, 6],
       [7, 7],
       [8, 8],
       [9, 9]])

In [167]:
meta_clf = RandomForestClassifier()

In [183]:
def stacking_ensemble(X,y):
    svc_result = svc_clf.predict(X).reshape(-1,1)
    lor_result = lor_clf.predict(X).reshape(-1,1)
    dt_result = dt_clf.predict(X).reshape(-1,1)
    rf_result = rf_clf.predict(X).reshape(-1,1)
    lr_result = np.where(lr_clf.predict(X) > 0.5, 1, 0)
    result = np.concatenate([svc_result , lor_result,dt_result , rf_result , lr_result],axis = 1)
    meta_clf.fit(result,y)
    return meta_clf.score(result,y)
    

In [184]:
stacking_ensemble(X,y)

0.8945005611672279

In [185]:
meta_clf_soft = RandomForestClassifier()

In [186]:
def stacking_ensemble_soft(X,y):
    svc_result = svc_clf.predict_proba(X)
    lor_result = lor_clf.predict_proba(X)
    dt_result = dt_clf.predict_proba(X)
    rf_result = rf_clf.predict_proba(X)
    lr_result = lr_clf.predict(X)
    result = np.concatenate([svc_result,lor_result , dt_result , rf_result , lr_result],axis =1)
    meta_clf_soft.fit(result,y)
    return meta_clf_soft.score(result,y)

In [187]:
stacking_ensemble_soft(X,y)

0.8945005611672279

In [140]:
from sklearn.ensemble import RandomForestRegressor
rr_clf = RandomForestRegressor()
rr_clf.fit(X,y)
rr_clf.score(X,y)

0.6481397509094067

In [141]:
y_hat = np.where(rr_clf.predict(X) > 0.5, 1, 0)
accuracy_score(y_hat,y)

0.8877665544332211