### 타이타닉 생존자 예측

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


1. 데이터 전처리

- Feature selection

In [4]:
df = df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'who', 'deck']]
df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,deck
886,0,2,male,27.0,0,0,13.0,S,man,
887,1,1,female,19.0,0,0,30.0,S,woman,B
888,0,3,female,,1,2,23.45,S,woman,
889,1,1,male,26.0,0,0,30.0,C,man,C
890,0,3,male,32.0,0,0,7.75,Q,man,


- 결측치 처리

In [5]:
df.isna().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
who           0
deck        688
dtype: int64

In [6]:
# age는 평균으로 대체
# 단, who column이 결측치가 없으므로 어른은 어른나이 평균, 아동은 아동나이 평균으로 대체
adult = df[df.who.isin(['man', 'woman'])]
adult_man = df[df.who == 'man']
adult_woman = df[df.who == 'woman']
amm = adult_man.age.mean().round(1)
awm = adult_woman.age.mean().round(1)
print(amm, awm)

33.2 32.0


In [7]:
df[df.who == 'man'].age.fillna(amm, inplace=True)
df[df.who == 'woman'].age.fillna(awm, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df.who == 'man'].age.fillna(amm, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df.who == 'woman'].age.fillna(awm, inplace=True)


In [8]:
df.age.fillna(adult.age.mean().round(1), inplace=True)

In [9]:
df.isna().sum()

survived      0
pclass        0
sex           0
age           0
sibsp         0
parch         0
fare          0
embarked      2
who           0
deck        688
dtype: int64

In [10]:
# embarked 결측치를 최빈값으로 대체
df.embarked.value_counts()

embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [11]:
df.embarked.fillna('S', inplace=True)
df.embarked.isna().sum()

0

In [12]:
# deck 컬럼 삭제(결측치가 너무 많아서), who 컬럼도 삭제(age와 중복)
df.drop(columns=['deck', 'who'], inplace=True)

In [13]:
df.isna().sum()

survived    0
pclass      0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
dtype: int64

- 카테고리형 데이터를 숫자로 변환

In [14]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [15]:
df.sex = le.fit_transform(df.sex)
df.embarked = le.fit_transform(df.embarked)
df.tail(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
888,0,3,0,32.8,1,2,23.45,2
889,1,1,1,26.0,0,0,30.0,0
890,0,3,1,32.0,0,0,7.75,1


##### 2. 훈련/테스트 데이터로 분리

In [16]:
X = df.iloc[:, 1:].values
y = df.survived.values

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=2023
)

##### 3. Random Forest로 학습

In [18]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(random_state=2023)
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2023,
 'verbose': 0,
 'warm_start': False}

In [19]:
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)

0.7821229050279329

##### 4. GridSearchCV로 수행

In [20]:
params = { 'max_depth':[6, 7, 8, 9, 10], 'min_samples_split':[2,3,4] }

In [21]:
from sklearn.model_selection import GridSearchCV

grid_rf = GridSearchCV(rfc, params, scoring='accuracy', cv=5 )
grid_rf.fit(X_train, y_train)

In [22]:
grid_rf.best_params_

{'max_depth': 9, 'min_samples_split': 3}

In [23]:
grid_rf.best_score_

0.8299911356249383

In [24]:
best_rfc = grid_rf.best_estimator_
best_rfc.score(X_test, y_test)

0.8100558659217877

4. 테스트 데이터 하나에 대해서 적용

In [25]:
test_data, pred = X_test[10], y_test[10]
test_data, pred

(array([ 2.    ,  1.    , 32.5   ,  1.    ,  0.    , 30.0708,  0.    ]), 0)

In [26]:
result = best_rfc.predict(test_data.reshape(1, -1))[0]
result

0

##### 5. Logistic Regression 모델

- 표준화

In [27]:
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
X_std[:5]

array([[ 0.82737724,  0.73769513, -0.63700389,  0.43279337, -0.47367361,
        -0.50244517,  0.58595414],
       [-1.56610693, -1.35557354,  0.58872284,  0.43279337, -0.47367361,
         0.78684529, -1.9423032 ],
       [ 0.82737724, -1.35557354, -0.3305722 , -0.4745452 , -0.47367361,
        -0.48885426,  0.58595414],
       [-1.56610693, -1.35557354,  0.35889908,  0.43279337, -0.47367361,
         0.42073024,  0.58595414],
       [ 0.82737724,  0.73769513,  0.35889908, -0.4745452 , -0.47367361,
        -0.48633742,  0.58595414]])

In [28]:
X_train, X_test, y_train, y_test = train_test_split(
    X_std, y, stratify=y, test_size=0.2, random_state=2023
)

In [29]:
from sklearn.linear_model import LogisticRegression

xr = LogisticRegression(random_state=2023).fit(X_train, y_train)


In [30]:
xr.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 2023,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [31]:
xr.score(X_test, y_test)

0.7486033519553073

- 정규화

In [32]:
from sklearn.preprocessing import MinMaxScaler
X_mn = MinMaxScaler().fit_transform(X)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(
    X_mn, y, stratify=y, test_size=0.2, random_state=2023
)

xr = LogisticRegression(random_state=2023).fit(X_train, y_train)
xr.score(X_test, y_test)

0.770949720670391

##### 6. 엉터리 분류기
- 여성이면 생존이라 예측, 그 외는 사망

In [34]:
df.pivot_table('survived', 'sex')

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
0,0.742038
1,0.188908


In [35]:
X[:3]

array([[ 3.    ,  1.    , 22.    ,  1.    ,  0.    ,  7.25  ,  2.    ],
       [ 1.    ,  0.    , 38.    ,  1.    ,  0.    , 71.2833,  0.    ],
       [ 3.    ,  0.    , 26.    ,  0.    ,  0.    ,  7.925 ,  2.    ]])

In [36]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=2023
)
X_train[:3]

array([[ 1.  ,  0.  , 35.  ,  1.  ,  0.  , 53.1 ,  2.  ],
       [ 3.  ,  1.  , 32.8 ,  0.  ,  0.  , 24.15,  1.  ],
       [ 2.  ,  1.  , 21.  ,  1.  ,  0.  , 11.5 ,  2.  ]])

In [37]:
from sklearn.base import BaseEstimator

# BaseEstimator를 상속해서 MyClassifier 클래스를 만들고
# fit(), predict() method를 재정의(override)
class MyCassifier(BaseEstimator):
    def fit(self, X, y):
        pass
    def predict(self, X):
        pred = np.zeros(X.shape[0], int)
        for i in range(X.shape[0]):
            if X[i, 1] == 0.:       # 여성이면
                pred[i] = 1         # 생존으로 지정
        return pred 
    

In [38]:
my_clf = MyCassifier()
my_clf.fit(X_train, y_train)
pred_my = my_clf.predict(X_test)

In [39]:
y_test[:5], pred_my[:5]

(array([0, 1, 0, 0, 0], dtype=int64), array([0, 1, 0, 0, 0]))

In [40]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred_my)

0.7653631284916201

##### 7. 오차 행렬(Confusion matrix)

In [41]:
pred = best_rfc.predict(X_test)

In [44]:
from sklearn.metrics import confusion_matrix

In [45]:
# Best RFC로 예측한 값의 오차 행렬
confusion_matrix(y_test, pred)

array([[96, 14],
       [20, 49]], dtype=int64)

In [46]:
# 엉터리 분류기로 예측한 값의 오차 행렬
confusion_matrix(y_test, pred_my)

array([[88, 22],
       [20, 49]], dtype=int64)

In [50]:
# 정확도(accuracy) 비교
accuracy_score(y_test, pred), accuracy_score(y_test, pred_my)

(0.8100558659217877, 0.7653631284916201)

In [49]:
from sklearn.metrics import precision_score, recall_score

In [52]:
# 정밀도(precision) 비교 
precision_score(y_test, pred), precision_score(y_test, pred_my)

(0.7777777777777778, 0.6901408450704225)

In [53]:
# 재현율(recall) 비교
recall_score(y_test, pred), recall_score(y_test, pred_my)

(0.7101449275362319, 0.7101449275362319)

In [55]:
# F1 score 비교
from sklearn.metrics import f1_score
f1_score(y_test, pred), f1_score(y_test, pred_my)

(0.7424242424242424, 0.7)

In [56]:
# AUC(Area under ROC curve) 비교
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, pred), roc_auc_score(y_test, pred_my)

(0.7914361001317525, 0.755072463768116)