### 타이타닉 생존자 예측

In [30]:
import numpy as np
import pandas as pd
import seaborn as sns

In [31]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


1. 데이터 전처리

In [32]:
df['class'].value_counts()

Third     491
First     216
Second    184
Name: class, dtype: int64

- Feature selection

In [33]:
df = df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'who', 'deck']]
df.isna().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
who           0
deck        688
dtype: int64

In [34]:
man_mean = df.age[df.who == 'man'].mean().round()
woman_mean = df.age[df.who == 'woman'].mean().round()
child_mean = df.age[df.who == 'child'].mean().round()
man_mean, woman_mean, child_mean

(33.0, 32.0, 6.0)

In [35]:
df.age.fillna(man_mean, inplace=True)

In [36]:
df.age.isna().sum()

0

In [37]:
# embarked 결측치를 최빈값으로
df.embarked.value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [38]:
df.embarked.fillna('S', inplace=True)
df.embarked.isna().sum()

0

In [39]:
# deck column 삭제, who 컬럼도 age와 중복되므로 삭제
df.drop(columns=['deck', 'who'], axis=1, inplace=True)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [40]:
df.isna().sum().sum()

0

- 카테고리형 데이터를 숫자로 변환

In [42]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df.sex = le.fit_transform(df.sex)
df.embarked = le.fit_transform(df.embarked)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


2. 훈련/테스트 데이터로 분리

In [43]:
X = df.iloc[:, 1:].values
y = df.survived.values

In [48]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=2023
)

In [50]:
X_train.shape, X_test.shape, y_train.shape

((712, 7), (179, 7), (712,))

3. Random Forest로 학습

In [52]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [53]:
rfc.fit(X_train, y_train)

In [54]:
rfc.score(X_test, y_test)

0.7877094972067039

4. GridSearchCV로 수행

In [55]:
params = {
    'max_depth': [2,5,8],
    'min_samples_split':[2,3,4]
}

In [56]:
from sklearn.model_selection import GridSearchCV
grid_rf = GridSearchCV(rfc, params, scoring='accuracy', cv=5)
grid_rf.fit(X_train, y_train)

In [58]:
grid_rf.best_params_

{'max_depth': 8, 'min_samples_split': 2}

In [57]:
grid_rf.score(X_test, y_test)

0.8044692737430168

In [60]:
params = {
    'max_depth': [6,7,8,9,10],
    'min_samples_split':[2,3,4]
}
grid_rf = GridSearchCV(rfc, params, scoring='accuracy', cv=5)
grid_rf.fit(X_train, y_train)

In [61]:
grid_rf.best_params_

{'max_depth': 7, 'min_samples_split': 4}

In [64]:
grid_rf.best_score_

0.8271939328277356

In [65]:
best_rfc = grid_rf.best_estimator_
best_rfc.score(X_test, y_test)

0.8268156424581006

4. 테스트 데이터 하나에 대해서 적용

In [66]:
test_data, pred = X_test[10], y_test[10]
test_data, pred

(array([ 2.    ,  1.    , 32.5   ,  1.    ,  0.    , 30.0708,  0.    ]), 0)

In [68]:
result = best_rfc.predict(test_data.reshape(1,-1))[0]
result

0

5. Logistic regression 모델

- 표준화

In [70]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler().fit_transform(X)

array([[ 0.82737724,  0.73769513, -0.63966511, ..., -0.47367361,
        -0.50244517,  0.58595414],
       [-1.56610693, -1.35557354,  0.58532919, ..., -0.47367361,
         0.78684529, -1.9423032 ],
       [ 0.82737724, -1.35557354, -0.33341653, ..., -0.47367361,
        -0.48885426,  0.58595414],
       ...,
       [ 0.82737724, -1.35557354,  0.20251847, ...,  2.00893337,
        -0.17626324,  0.58595414],
       [-1.56610693,  0.73769513, -0.33341653, ..., -0.47367361,
        -0.04438104, -1.9423032 ],
       [ 0.82737724,  0.73769513,  0.12595633, ..., -0.47367361,
        -0.49237783, -0.67817453]])

In [73]:
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(
    std, y, stratify=y, random_state=2023, test_size=0.2
)
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.7486033519553073

- 정규화

In [74]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler().fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    mms, y, stratify=y, random_state=2023, test_size=0.2
)
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.770949720670391

6. 엉터리 분류기
    - 여성이면 생존이라 예측, 그 외는 사망

In [75]:
df.pivot_table('survived', 'sex')

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
0,0.742038
1,0.188908


In [76]:
X[:3]

array([[ 3.    ,  1.    , 22.    ,  1.    ,  0.    ,  7.25  ,  2.    ],
       [ 1.    ,  0.    , 38.    ,  1.    ,  0.    , 71.2833,  0.    ],
       [ 3.    ,  0.    , 26.    ,  0.    ,  0.    ,  7.925 ,  2.    ]])

In [77]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=2023
)
X_train[:3]

array([[ 1.  ,  0.  , 35.  ,  1.  ,  0.  , 53.1 ,  2.  ],
       [ 3.  ,  1.  , 33.  ,  0.  ,  0.  , 24.15,  1.  ],
       [ 2.  ,  1.  , 21.  ,  1.  ,  0.  , 11.5 ,  2.  ]])

In [78]:
from sklearn.base import BaseEstimator

# BaseEstimator를 상속해서 MyClassifier 클래스를 만들고
# fit(), predict() method를 재정의(override)
class MyClassifier(BaseEstimator):
    def fit(self, X, y):
        pass
    def predict(self, X):
        pred = np.zeros(X.shape[0], int)
        for i in range(X.shape[0]):
            if X[i, 1] == 0:        # 여성이면
                pred[i] = 1         # 생존으로 지정
        return pred

In [79]:
my_clf = MyClassifier()
my_clf.fit(X_train, y_train)
pred_my = my_clf.predict(X_test)

In [81]:
y_test[:5], pred_my[:5]

(array([0, 1, 0, 0, 0], dtype=int64), array([0, 1, 0, 0, 0]))

In [82]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred_my)

0.7653631284916201

7. 오차 행렬(Confusion matrix)

In [83]:
pred = best_rfc.predict(X_test)

In [84]:
from sklearn.metrics import confusion_matrix

In [85]:
# Best RFC로 예측한 값의 오차 행렬
confusion_matrix(y_test, pred)

array([[99, 11],
       [20, 49]], dtype=int64)

In [86]:
# 엉터리 분류기로 예측한 값의 오차 행렬
confusion_matrix(y_test, pred_my)

array([[88, 22],
       [20, 49]], dtype=int64)

In [91]:
from sklearn.metrics import precision_score, recall_score
# 정확도(accuracy) 비교
accuracy_score(y_test, pred), accuracy_score(y_test, pred_my)

(0.8268156424581006, 0.7653631284916201)

In [92]:
# 정밀도(precision) 비교
precision_score(y_test, pred), accuracy_score(y_test, pred_my)

(0.8166666666666667, 0.7653631284916201)

In [95]:
# 재현율(recall) 비교
recall_score(y_test, pred), recall_score(y_test, pred_my)

(0.7101449275362319, 0.7101449275362319)

In [97]:
# F1 score 비교
from sklearn.metrics import f1_score
f1_score(y_test, pred), f1_score(y_test, pred_my)

(0.7596899224806202, 0.7)

In [99]:
# AUC(Area Under ROC curve) 비교
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, pred), roc_auc_score(y_test, pred_my)

(0.8050724637681159, 0.755072463768116)