In [151]:
import warnings
warnings.filterwarnings(action='ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [None]:
## enumerate함수 :열거하다
## 많은 양의 데이터인 list를 다룰 때 편리하게 쓸 수 있는 기능을 제공
## for-each, for
# for x in list: 리스트에 있는 것을 하나씩 꺼내서 처리하기가 아주 간단
#                    index가 없어서 불편!
#                    하나씩 꺼내서 출력, 연산하기는 용이
#                    리스트에 입력은 안된다.!
# for i in range(n): 리스트의 입력과 출력이 다 됨.
#                    인덱스로 리스트에 있는 값들을 접근해야하기 때문에
#                    코드가 복잡하게 보인다.

In [37]:
num_list = [100, 200, 300]

In [38]:
for x in num_list:
    print(x + 100)

200
300
400


In [39]:
for x in enumerate(num_list):
    print(x)

(0, 100)
(1, 200)
(2, 300)


In [40]:
for x in enumerate(num_list):
    print(x,':',x)

(0, 100) : (0, 100)
(1, 200) : (1, 200)
(2, 300) : (2, 300)


In [41]:
for i, x in enumerate(num_list):
    num_list[i] = x + 100

In [42]:
num_list

[200, 300, 400]

In [1]:
from sklearn.model_selection import KFold

In [2]:
kfold = KFold(n_splits=5)

In [3]:
kfold

KFold(n_splits=5, random_state=None, shuffle=False)

In [105]:
titanic_df = pd.read_csv('../titanic_train.csv')
titanic_df.head(2)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,body,home.dest
0,2,1,"Mellinger, Miss. Madeleine Violet",female,13.0,0,1,250644,19.5,,S,,"England / Bennington, VT"
1,2,1,"Wells, Miss. Joan",female,4.0,1,1,29103,23.0,,S,,"Cornwall / Akron, OH"


In [106]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 915 entries, 0 to 914
Data columns (total 13 columns):
pclass       915 non-null int64
survived     915 non-null int64
name         915 non-null object
sex          915 non-null object
age          740 non-null float64
sibsp        915 non-null int64
parch        915 non-null int64
ticket       915 non-null object
fare         915 non-null float64
cabin        214 non-null object
embarked     913 non-null object
body         85 non-null float64
home.dest    526 non-null object
dtypes: float64(3), int64(4), object(6)
memory usage: 93.1+ KB


In [107]:
# 함수(drop처리)
# name, ticket,body, home.dest
def drop_features(df):
    df.drop(columns=['name', 'ticket', 'body', 'home.dest'], inplace=True)
    return df

In [108]:
titanic_df =  drop_features(titanic_df)
titanic_df

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,2,1,female,13.00,0,1,19.5000,,S
1,2,1,female,4.00,1,1,23.0000,,S
2,2,1,female,30.00,1,0,13.8583,,C
3,3,0,male,,0,0,7.7250,,Q
4,3,1,female,22.00,0,0,7.7250,,Q
...,...,...,...,...,...,...,...,...,...
910,3,1,male,,0,0,7.7750,,S
911,3,1,female,0.17,1,2,20.5750,,S
912,3,0,male,,0,0,8.0500,,S
913,3,1,female,,0,0,7.7333,,Q


In [109]:
# 함수(na처리)
# na처리
# age는 평균으로 채워주기
# cabin은 n으로 채워주기
# emvarked는 n으로 채워주기 (fillna())
# na 개수가 몇개인지 전체 카운트
def fillna(df):
    df['age'].fillna(df['age'].mean(), inplace=True)
    df['cabin'].fillna('N', inplace=True)
    df['embarked'].fillna('N', inplace=True)
    return df

In [110]:
titanic_df = fillna(titanic_df)
titanic_df

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,2,1,female,13.000000,0,1,19.5000,N,S
1,2,1,female,4.000000,1,1,23.0000,N,S
2,2,1,female,30.000000,1,0,13.8583,N,C
3,3,0,male,30.229054,0,0,7.7250,N,Q
4,3,1,female,22.000000,0,0,7.7250,N,Q
...,...,...,...,...,...,...,...,...,...
910,3,1,male,30.229054,0,0,7.7750,N,S
911,3,1,female,0.170000,1,2,20.5750,N,S
912,3,0,male,30.229054,0,0,8.0500,N,S
913,3,1,female,30.229054,0,0,7.7333,N,Q


In [111]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 915 entries, 0 to 914
Data columns (total 9 columns):
pclass      915 non-null int64
survived    915 non-null int64
sex         915 non-null object
age         915 non-null float64
sibsp       915 non-null int64
parch       915 non-null int64
fare        915 non-null float64
cabin       915 non-null object
embarked    915 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 64.5+ KB


In [112]:
## 라벨인코딩 : sex, cabin, embarked

In [113]:
titanic_df['cabin'][:20]

0           N
1           N
2           N
3           N
4           N
5           N
6         E36
7           N
8         C68
9         E24
10          N
11          N
12    C22 C26
13          N
14          N
15          N
16          N
17          N
18          N
19          N
Name: cabin, dtype: object

In [114]:
# titanic_df['cabin'] = titanic_df['cabin'].str[:1]
# titanic_df['cabin'][:20]

In [115]:
def label_encoding(df):
    df['cabin'] = df['cabin'].str[:1]
    print(list(titanic_df['cabin'])[:10])
    le = LabelEncoder()
    features = ['sex', 'cabin', 'embarked']
    for f in features:
        le.fit(df[f])
        print('라벨링할 목록 => ', le.classes_)
        df[f] = le.transform(df[f])
    return df

In [116]:
titanic_df = label_encoding(titanic_df)
titanic_df

['N', 'N', 'N', 'N', 'N', 'N', 'E', 'N', 'C', 'E']
라벨링할 목록 =>  ['female' 'male']
라벨링할 목록 =>  ['A' 'B' 'C' 'D' 'E' 'F' 'G' 'N' 'T']
라벨링할 목록 =>  ['C' 'N' 'Q' 'S']


Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,2,1,0,13.000000,0,1,19.5000,7,3
1,2,1,0,4.000000,1,1,23.0000,7,3
2,2,1,0,30.000000,1,0,13.8583,7,0
3,3,0,1,30.229054,0,0,7.7250,7,2
4,3,1,0,22.000000,0,0,7.7250,7,2
...,...,...,...,...,...,...,...,...,...
910,3,1,1,30.229054,0,0,7.7750,7,3
911,3,1,0,0.170000,1,2,20.5750,7,3
912,3,0,1,30.229054,0,0,8.0500,7,3
913,3,1,0,30.229054,0,0,7.7333,7,2


In [117]:
titanic_df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,2,1,0,13.0,0,1,19.5,7,3
1,2,1,0,4.0,1,1,23.0,7,3
2,2,1,0,30.0,1,0,13.8583,7,0
3,3,0,1,30.229054,0,0,7.725,7,2
4,3,1,0,22.0,0,0,7.725,7,2


In [126]:
# 전처리 담당 함수 만들기
def preprocessing(df):
    df2 = fillna(df)
    df3 = drop_features(df2)
    df4 = label_encoding(df3)
    return df4

In [127]:
titanic_df2 = pd.read_csv('../titanic_train.csv')
titanic_df2.head(2)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,body,home.dest
0,2,1,"Mellinger, Miss. Madeleine Violet",female,13.0,0,1,250644,19.5,,S,,"England / Bennington, VT"
1,2,1,"Wells, Miss. Joan",female,4.0,1,1,29103,23.0,,S,,"Cornwall / Akron, OH"


In [128]:
# 데이터 전처리 호출
titanic_df3 = preprocessing(titanic_df2)
titanic_df3

[7, 7, 7, 7, 7, 7, 4, 7, 2, 4]
라벨링할 목록 =>  ['female' 'male']
라벨링할 목록 =>  ['A' 'B' 'C' 'D' 'E' 'F' 'G' 'N' 'T']
라벨링할 목록 =>  ['C' 'N' 'Q' 'S']


Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,2,1,0,13.000000,0,1,19.5000,7,3
1,2,1,0,4.000000,1,1,23.0000,7,3
2,2,1,0,30.000000,1,0,13.8583,7,0
3,3,0,1,30.229054,0,0,7.7250,7,2
4,3,1,0,22.000000,0,0,7.7250,7,2
...,...,...,...,...,...,...,...,...,...
910,3,1,1,30.229054,0,0,7.7750,7,3
911,3,1,0,0.170000,1,2,20.5750,7,3
912,3,0,1,30.229054,0,0,8.0500,7,3
913,3,1,0,30.229054,0,0,7.7333,7,2


In [168]:
# train_test data split
data = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'cabin', 'embarked']
X_train, X_test, y_train, y_test = train_test_split(
    titanic_df3[data], # 문제
    titanic_df3['survived'], # 정답
    test_size = 0.2, # test-size비율
    random_state = 200 # 시드값 설정
)

In [169]:
# 8개의 변수로 1개의 target을 분류하는 문제
# y = ax1 + bx2 + cx3 + dx4 + ex5 + fx6 + gx7 + hx8
# ['pclass', survived,'sex', 'age', 'sibsp', 'parch', 'fare', 'cabin', 'embarked']
# survived = a*pclass + b*sex + c*age + d*sibsp + e*parch +
#            f*fare + g*cabin + h*embarked + y절편

In [170]:
df_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
svc_clf = SVC()
clf_list = [df_clf, rf_clf, svc_clf]

In [171]:
# train데이터 3가지 방법으로 훈련데이터 훈련시켜서 정확도 비교
for clf in clf_list:
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    print(format(accuracy_score(y_test, pred)))

0.7923497267759563
0.7650273224043715
0.6721311475409836


In [172]:
kfold = KFold(n_splits=5)

In [176]:
for i, (train_index, test_index) in enumerate(kfold.split(titanic_df3[data])):
    print(i, "----------------")
    print('test_index>> ', test_index)
    print('train_index>> ', train_index)

0 ----------------
test_index>>  [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182]
train_index>>  [183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218
 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 2

In [187]:
titanic_df3[data].values[[0,3]]

array([[ 2.        ,  0.        , 13.        ,  0.        ,  1.        ,
        19.5       ,  7.        ,  3.        ],
       [ 3.        ,  1.        , 30.22905405,  0.        ,  0.        ,
         7.725     ,  7.        ,  2.        ]])

In [188]:
titanic_df3[data].values[test_index]

array([[ 3.        ,  1.        , 30.22905405, ..., 15.2458    ,
         7.        ,  0.        ],
       [ 3.        ,  1.        , 30.22905405, ..., 21.6792    ,
         7.        ,  0.        ],
       [ 3.        ,  0.        , 45.        , ..., 27.9       ,
         7.        ,  3.        ],
       ...,
       [ 3.        ,  1.        , 30.22905405, ...,  8.05      ,
         7.        ,  3.        ],
       [ 3.        ,  0.        , 30.22905405, ...,  7.7333    ,
         7.        ,  2.        ],
       [ 2.        ,  0.        , 20.        , ..., 36.75      ,
         7.        ,  3.        ]])

In [181]:
titanic_df3[data].values[train_index]

array([[ 2.        ,  0.        , 13.        , ..., 19.5       ,
         7.        ,  3.        ],
       [ 2.        ,  0.        ,  4.        , ..., 23.        ,
         7.        ,  3.        ],
       [ 2.        ,  0.        , 30.        , ..., 13.8583    ,
         7.        ,  0.        ],
       ...,
       [ 3.        ,  1.        , 30.22905405, ..., 69.55      ,
         7.        ,  3.        ],
       [ 3.        ,  0.        ,  6.        , ..., 31.275     ,
         7.        ,  3.        ],
       [ 1.        ,  1.        , 37.        , ..., 29.7       ,
         2.        ,  0.        ]])

In [193]:
titanic_df3['survived'].values[0:5] # 리스트 중 인덱스 0~4, 5개 추출

array([1, 1, 1, 0, 1], dtype=int64)

In [194]:
titanic_df3['survived'].values[[0,2,5]]

array([1, 1, 0], dtype=int64)

In [250]:
def exec_kfold(clf, k):
    acc_list = []
    kfold = KFold(n_splits=k)
    for i, (train_index, test_index) in enumerate(kfold.split(titanic_df3[data])):
        X_train2, X_test2 = titanic_df3[data].values[train_index], titanic_df3[data].values[test_index]
        y_train2, y_test2 = titanic_df3['survived'].values[train_index], titanic_df3['survived'].values[test_index]
        
        clf.fit(X_train2, y_train2)
        pred2 = clf.predict(X_test2)
        acc = accuracy_score(y_test2, pred2)
        acc_list.append(acc)
        print(i, ': 정확도>> {0:.2f}'.format(acc))
    print('평균 정확도>> {0:.2f}'.format(np.mean(acc_list)))

In [251]:
exec_kfold(df_clf, 10) 
# decision tree model을 가지고 crooss-validation(교차검증, k-fold)

0 : 정확도>> 0.83
1 : 정확도>> 0.77
2 : 정확도>> 0.77
3 : 정확도>> 0.73
4 : 정확도>> 0.84
5 : 정확도>> 0.71
6 : 정확도>> 0.78
7 : 정확도>> 0.79
8 : 정확도>> 0.80
9 : 정확도>> 0.73
평균 정확도>> 0.77


In [252]:
#svm, rf 비교(5, 10) => vudrbs

In [253]:
# model list로 교차검증이 한꺼번에 되게 코드를 변경
# model_list 중 평균값의 교차 검증 결과 중 최대인 모델의 평균값을 구하기
# 평균값이 제일 높은 모델을 선정
# -----------------------------------------------------------------------
# 교차 검증 결과 
# 제일 정확도가 높은 모델은 랜덤포레스트이고 정확도는 88입니다.

In [259]:
titanic_df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 915 entries, 0 to 914
Data columns (total 9 columns):
pclass      915 non-null int64
survived    915 non-null int64
sex         915 non-null int32
age         915 non-null float64
sibsp       915 non-null int64
parch       915 non-null int64
fare        915 non-null float64
cabin       915 non-null int32
embarked    915 non-null int32
dtypes: float64(2), int32(3), int64(4)
memory usage: 53.7 KB


In [260]:
#int형이라 안될 때!
pd.get_dummies(titanic_df3.loc[:, ['sex', 'cabin', 'embarked']])

Unnamed: 0,sex,cabin,embarked
0,0,7,3
1,0,7,3
2,0,7,0
3,1,7,2
4,0,7,2
...,...,...,...
910,1,7,3
911,0,7,3
912,1,7,3
913,0,7,2


In [261]:
as_cate = titanic_df3[['sex', 'cabin', 'embarked']].astype('category')
as_cate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 915 entries, 0 to 914
Data columns (total 3 columns):
sex         915 non-null category
cabin       915 non-null category
embarked    915 non-null category
dtypes: category(3)
memory usage: 3.5 KB


In [262]:
titanic_df3.drop(columns=['sex', 'cabin', 'embarked'], inplace=True)

In [263]:
titanic_df4 = pd.concat([titanic_df3, as_cate], axis=1)
titanic_df4.columns

Index(['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'sex', 'cabin',
       'embarked'],
      dtype='object')

In [264]:
titanic_df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 915 entries, 0 to 914
Data columns (total 9 columns):
pclass      915 non-null int64
survived    915 non-null int64
age         915 non-null float64
sibsp       915 non-null int64
parch       915 non-null int64
fare        915 non-null float64
sex         915 non-null category
cabin       915 non-null category
embarked    915 non-null category
dtypes: category(3), float64(2), int64(4)
memory usage: 46.4 KB


In [265]:
# category로 변경 후 다시!
# category로 변경하는 이유
# 데이터타입이 int형이면 원핫인코딩 불가
pd.get_dummies(titanic_df4)

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,sex_0,sex_1,cabin_0,cabin_1,...,cabin_3,cabin_4,cabin_5,cabin_6,cabin_7,cabin_8,embarked_0,embarked_1,embarked_2,embarked_3
0,2,1,13.000000,0,1,19.5000,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,2,1,4.000000,1,1,23.0000,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,2,1,30.000000,1,0,13.8583,1,0,0,0,...,0,0,0,0,1,0,1,0,0,0
3,3,0,30.229054,0,0,7.7250,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
4,3,1,22.000000,0,0,7.7250,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
910,3,1,30.229054,0,0,7.7750,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
911,3,1,0.170000,1,2,20.5750,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
912,3,0,30.229054,0,0,8.0500,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
913,3,1,30.229054,0,0,7.7333,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [266]:
def exec_kfold2(df, clf, k):
    acc_list = []
    kfold = KFold(n_splits=k)
    for i, (train_index, test_index) in enumerate(kfold.split(df[data])):
        X_train2,X_test2 = df[data].values[train_index], df[data].values[test_index]
        y_train2,y_test2 = df['survived'].values[train_index],df['survived'].values[test_index]
        
        clf.fit(X_train2, y_train2)
        pred2 = clf.predict(X_test2)
        acc = accuracy_score(y_test2, pred2)
        acc_list.append(acc)
        print(i , ': 교차검증 정확도>> {0:.2f}'.format(acc))
    print('평균 정확도>> {0:.2f}'.format(np.mean(acc_list)))
    return np.mean(acc_list)

In [267]:
exec_kfold2(titanic_df4, rf_clf, 10)

0 : 교차검증 정확도>> 0.82
1 : 교차검증 정확도>> 0.75
2 : 교차검증 정확도>> 0.76
3 : 교차검증 정확도>> 0.73
4 : 교차검증 정확도>> 0.80
5 : 교차검증 정확도>> 0.73
6 : 교차검증 정확도>> 0.78
7 : 교차검증 정확도>> 0.80
8 : 교차검증 정확도>> 0.80
9 : 교차검증 정확도>> 0.77
평균 정확도>> 0.77


0.7737816531294792

In [268]:
acc_total = []
model_list = ['decision tree', 'random forest', 'svm']
for i, clf in enumerate(clf_list):
    print(model_list[i], '----검증시작----')
    acc_result = exec_kfold2(titanic_df4, clf, 10)
    acc_total.append(acc_result)

decision tree ----검증시작----
0 : 교차검증 정확도>> 0.83
1 : 교차검증 정확도>> 0.77
2 : 교차검증 정확도>> 0.77
3 : 교차검증 정확도>> 0.73
4 : 교차검증 정확도>> 0.84
5 : 교차검증 정확도>> 0.71
6 : 교차검증 정확도>> 0.78
7 : 교차검증 정확도>> 0.79
8 : 교차검증 정확도>> 0.80
9 : 교차검증 정확도>> 0.73
평균 정확도>> 0.77
random forest ----검증시작----
0 : 교차검증 정확도>> 0.82
1 : 교차검증 정확도>> 0.75
2 : 교차검증 정확도>> 0.76
3 : 교차검증 정확도>> 0.73
4 : 교차검증 정확도>> 0.80
5 : 교차검증 정확도>> 0.73
6 : 교차검증 정확도>> 0.78
7 : 교차검증 정확도>> 0.80
8 : 교차검증 정확도>> 0.80
9 : 교차검증 정확도>> 0.77
평균 정확도>> 0.77
svm ----검증시작----
0 : 교차검증 정확도>> 0.74
1 : 교차검증 정확도>> 0.73
2 : 교차검증 정확도>> 0.64
3 : 교차검증 정확도>> 0.62
4 : 교차검증 정확도>> 0.63
5 : 교차검증 정확도>> 0.59
6 : 교차검증 정확도>> 0.77
7 : 교차검증 정확도>> 0.69
8 : 교차검증 정확도>> 0.67
9 : 교차검증 정확도>> 0.67
평균 정확도>> 0.68


In [269]:
for i in range(3):
    print(model_list[i], " 교차검증 평균: ", acc_total[i])

decision tree  교차검증 평균:  0.7747969421882466
random forest  교차검증 평균:  0.7737816531294792
svm  교차검증 평균:  0.675430004777831


In [270]:
# 최대 평균값 ==
max_value = np.max(acc_total)
# 최대 평균값 인덱스 ==
max_index = np.argmax(acc_total)
# 최대 평균값 모델명 ==
max_model = model_list[max_index]

print('최고 교차 검증 평균>> ', max_value)
print('최고 교차 검증 인덱스>> ', max_index)
print('최고 교차 검증 모델링>> ', max_model)

최고 교차 검증 평균>>  0.7747969421882466
최고 교차 검증 인덱스>>  0
최고 교차 검증 모델링>>  decision tree
