In [89]:
## 모듈 임포트
import numpy as np 
import pandas as pd
import warnings
warnings.simplefilter("ignore")

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split  ## train test 
from sklearn.preprocessing import LabelEncoder      ## object 정수 인코딩
from sklearn.preprocessing import OneHotEncoder     ## object one-hot
from sklearn.preprocessing import StandardScaler    ## 표준 정규 분포 이용

from sklearn.linear_model import LinearRegression ## 회귀 분석
from sklearn.svm import SVC                       ## svm 분류모델
from sklearn.tree import DecisionTreeClassifier   ## Tree 분류모델
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score       ## 정확도 검증

In [2]:
df = pd.read_csv('https://gist.githubusercontent.com/michhar/2dfd2de0d4f8727f873422c5d959fff5/raw/fa71405126017e6a37bea592440b4bee94bf7b9e/titanic.csv')
df.head(1)
df.columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [16]:
## 원본 데이터는 유지 
dt = df.copy()

In [17]:
## Na 데이터 채우기
dt.Age.fillna(dt.Age.mean(),inplace=True)
dt.Embarked.fillna('N',inplace=True)
dt.Cabin.fillna('N',inplace=True)


In [32]:
## object 
## Sex ,Embarked, new_Cabin
gender_enc = LabelEncoder()
gender = gender_enc.fit_transform(dt.Sex)
embark_enc = LabelEncoder()
embark = embark_enc.fit_transform(dt.Embarked)
cabin_enc = LabelEncoder()
cabin = cabin_enc.fit_transform(dt.Cabin.str.slice(0,1))
pclass_enc = OneHotEncoder()
pclass = pclass_enc.fit_transform(dt.Pclass.values.reshape(-1,1)).toarray()
age_enc = LabelEncoder()
age = age_enc.fit_transform(pd.cut(dt.Age,6))
age_oenc = OneHotEncoder()
age = age_oenc.fit_transform(age.reshape(-1,1)).toarray()
enc_data = np.concatenate([gender.reshape(-1,1),embark.reshape(-1,1),
                cabin.reshape(-1,1),pclass,age],axis = 1)
adult = np.where(df.Name.str.contains('Mr','Miss'),1,0)
##


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [34]:
## X,y 
feature_col = ['SibSp','Parch']
dep_col = ['Survived']
X = pd.concat([dt[feature_col],pd.DataFrame(enc_data),pd.Series(adult)],axis=1)
y = dt[dep_col]
## train, test 분리
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [163]:
## 모델 생성 및 학습
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [164]:
## 모델 평가
y_hat = dt_clf.predict(X_test)
acc = accuracy_score(y_test,y_hat)
acc

0.7877094972067039

In [171]:
## 모델 생성 및 학습
svc_clf = SVC(C=0.3,kernel='linear')
svc_clf.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


SVC(C=0.3, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [68]:
Cs = np.linspace(0.001,2,10)
gammas = np.linspace(0.001,2,10)
kernels = ['linear', 'rbf', 'sigmoid']

param_score = []
for C in Cs:
    for kernel in kernels:
        for gamma in gammas:
            test_acc = [] 
            for _ in np.arange(5):
                X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
                svc_clf = SVC(C=C, gamma = gamma, kernel=kernel,probability=True)
                svc_clf.fit(X_train,y_train)
                acc = accuracy_score(svc_clf.predict(X_test),y_test)
                test_acc.append(acc)
            avg_acc = sum(test_acc)/5
            param = (C,kernel,gamma,avg_acc)
            param_score.append(param)
param = pd.DataFrame(param_score,columns=['C','kernel','gamma','score'])
best_param = param.loc[param.score==param.score.max(),['C','kernel','gamma']].to_dict('r')[0]
best_param['probability'] = True
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
clf = SVC(**best_param)
clf.fit(X_train,y_train)
acc = accuracy_score(clf.predict(X_test),y_test)
acc



0.7932960893854749

In [76]:
rf_acc = accuracy_score(rf.predict(X_test),y_test)
svc_acc = accuracy_score(clf.predict(X_test),y_test)
result = np.where((svc_proba + rf_proba)[:,0] > (svc_proba + rf_proba)[:,1],0,1)
rf_svc_acc = accuracy_score(result, y_test)

In [84]:
rf_proba = rf.predict_proba(X_test)
svc_proba = clf.predict_proba(X_test)
rf_svc_acc = accuracy_score(result, y_test)

In [88]:
rf_svc_acc

0.8491620111731844

In [74]:
abs(clf.predict_proba(X_test)[:,0] - (clf.predict_proba(X_test)[:,1])) < 0.2

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False,  True,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True,  True,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [62]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
acc = accuracy_score(rf.predict(X_test),y_test)
acc

0.8044692737430168

In [98]:
X_train.columns = [  'X'+str(i) for i in np.arange(X_train.columns.size)]
X_test.columns = [  'X'+str(i) for i in np.arange(X_train.columns.size)]
cat_boost = CatBoostClassifier(iterations=3000)
cat_boost.fit(X_train,y_train)

Learning rate set to 0.003254
0:	learn: 0.6912634	total: 800us	remaining: 2.4s
1:	learn: 0.6892663	total: 1.7ms	remaining: 2.54s
2:	learn: 0.6871880	total: 2.5ms	remaining: 2.5s
3:	learn: 0.6851576	total: 3.28ms	remaining: 2.46s
4:	learn: 0.6832022	total: 4.06ms	remaining: 2.43s
5:	learn: 0.6810215	total: 4.86ms	remaining: 2.42s
6:	learn: 0.6792941	total: 5.64ms	remaining: 2.41s
7:	learn: 0.6771452	total: 6.53ms	remaining: 2.44s
8:	learn: 0.6755451	total: 7.82ms	remaining: 2.6s
9:	learn: 0.6732875	total: 8.91ms	remaining: 2.66s
10:	learn: 0.6712315	total: 10.1ms	remaining: 2.74s
11:	learn: 0.6690361	total: 11.1ms	remaining: 2.77s
12:	learn: 0.6672912	total: 12.4ms	remaining: 2.85s
13:	learn: 0.6652502	total: 13.5ms	remaining: 2.88s
14:	learn: 0.6631722	total: 14.4ms	remaining: 2.86s
15:	learn: 0.6613081	total: 15.2ms	remaining: 2.84s
16:	learn: 0.6592678	total: 16.1ms	remaining: 2.83s
17:	learn: 0.6573161	total: 17ms	remaining: 2.81s
18:	learn: 0.6553170	total: 17.9ms	remaining: 2.81s


<catboost.core.CatBoostClassifier at 0x2ceb97c89c8>

In [99]:
accuracy_score(cat_boost.predict(X_test),y_test)

0.7877094972067039

In [92]:
X_train

Unnamed: 0,SibSp,Parch,0,1,2,3,4,5,6,7,8,9,10,11,0.1
220,0,0,1.0,3.0,7.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1
259,0,1,0.0,3.0,7.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
154,0,0,1.0,3.0,7.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1
656,0,0,1.0,3.0,7.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1
860,2,0,1.0,3.0,7.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
844,0,0,1.0,3.0,7.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1
439,0,0,1.0,3.0,7.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
640,0,0,1.0,3.0,7.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1
191,0,0,1.0,3.0,7.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1


In [58]:
param.loc[param.score==param.score.max(),['C','kernel','gamma']].to_dict('r')[0]

{'C': 2.0, 'kernel': 'rbf', 'gamma': 0.22311111111111112}

In [44]:
pd.DataFrame(param_score)[pd.DataFrame(param_score)[3]==pd.DataFrame(param_score)[3].max()]

Unnamed: 0,0,1,2,3
281,2.0,rbf,0.223111,0.832402


In [172]:
y_hat = svc_clf.predict(X_test)
acc = accuracy_score(y_test,y_hat)
acc

0.8044692737430168

In [37]:
svc_clf.get_params()

{'C': 1.0,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'auto_deprecated',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [106]:
x = pd.DataFrame(np.random.randint(0,3,5))

In [113]:

np.zeros((5,3)) = 1

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [126]:
y = np.zeros((5,3))

In [122]:
y[1,] 

0.0

In [119]:
y[0,0] = 1

In [127]:
for k,v in enumerate(x[0]):
    y[k,v] = 1

In [128]:
y

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])