In [73]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [74]:
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.metrics import precision_score,recall_score,accuracy_score,confusion_matrix

### 0. 데이터 로드

In [75]:
train=pd.read_csv("datasets/train.csv")
test_X=pd.read_csv("datasets/test.csv")
test_y=pd.read_csv("datasets/gender_submission.csv")["Survived"]

### 1. 데이터 관찰

In [76]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [77]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### 2. 데이터 전처리

In [78]:
# 정답레이블과 입력 데이터로 나누기
train_X=train.drop("Survived",axis=1)
train_y=train["Survived"].copy()

In [79]:
# 필요 없는 열을 제거하는 변환기
class DataFrameRemover(BaseEstimator,TransformerMixin):
    def __init__(self,attribute_list):
        self.attribute_list=attribute_list
    def fit(self,X):
        return self
    def transform(self,X):
        return X.drop(self.attribute_list,axis=1) # Pandas Dataframe으로 반환

In [80]:
# 범주형 데이터를 one-hot-encoding 하는 데 쓰는 변환기
class MyOneHotEncoder(BaseEstimator,TransformerMixin):
    def __init__(self,attribute_list):
        self.attribute_list=attribute_list
        self.one_hot_list=[]
    def fit(self,X):
        for attribute in self.attribute_list:
            one_hot_data=pd.get_dummies(X[attribute],prefix=attribute)
            self.one_hot_list.append(one_hot_data)
        return self
    def transform(self,X):
        new_X=X.drop(self.attribute_list,axis=1)
        new_X=pd.concat([new_X]+self.one_hot_list,axis=1)
        
        return new_X # pandas Dataframe으로 반환

In [81]:
# 전체 파이프라인 
preprocessing_pipeline=Pipeline([
    ('data_remover',DataFrameRemover(["PassengerId","Name","Ticket","Cabin"])), # 불필요한 특성 제거
    ('one_hot_encoder',MyOneHotEncoder(["Sex","Embarked"])), # 범주형 데이터 인코딩
    ('imputer',SimpleImputer(strategy="median")), # 누락된 행 전처리
    ('scaler',MinMaxScaler()) # 특성 스케일링
])

In [82]:
preprocessed_train_X=preprocessing_pipeline.fit_transform(train_X)

In [83]:
# 전처리 결과
pd.DataFrame(preprocessed_train_X) 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.271174,0.125,0.000000,0.014151,0.0,1.0,0.0,0.0,1.0
1,0.0,0.472229,0.125,0.000000,0.139136,1.0,0.0,1.0,0.0,0.0
2,1.0,0.321438,0.000,0.000000,0.015469,1.0,0.0,0.0,0.0,1.0
3,0.0,0.434531,0.125,0.000000,0.103644,1.0,0.0,0.0,0.0,1.0
4,1.0,0.434531,0.000,0.000000,0.015713,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
886,0.5,0.334004,0.000,0.000000,0.025374,0.0,1.0,0.0,0.0,1.0
887,0.0,0.233476,0.000,0.000000,0.058556,1.0,0.0,0.0,0.0,1.0
888,1.0,0.346569,0.125,0.333333,0.045771,1.0,0.0,0.0,0.0,1.0
889,0.0,0.321438,0.000,0.000000,0.058556,0.0,1.0,1.0,0.0,0.0


### 3. 모델 훈련, 평가

In [84]:
sgd_clf=SGDClassifier(max_iter=100,random_state=42)
val_scores=cross_val_score(sgd_clf,preprocessed_train_X,train_y,scoring="accuracy",cv=5)

In [85]:
# validation score
val_scores

array([0.73184358, 0.68715084, 0.79213483, 0.80898876, 0.81355932])

In [86]:
# training score
sgd_clf.fit(preprocessed_train_X,train_y)
pred_train_y=sgd_clf.predict(preprocessed_train_X)
train_score=accuracy_score(train_y,pred_train_y)

In [87]:
train_score

0.7946127946127947

In [88]:
# 오차행렬 : 행은 실제 클래스, 열은 예측한 클래스
confusion_matrix(train_y,pred_train_y)

array([[472,  77],
       [106, 236]], dtype=int64)

### 4. 모델 튜닝

In [89]:
param_grid=\
    {'loss':['hinge','log','modified_huber','squared_hinge','perceptron'],
     'penalty':['l1','l2','elasticnet'],
     'early_stopping':[True,False],
     'class_weight':['balanced',None]
    }
grid_search=GridSearchCV(sgd_clf,param_grid,cv=5,scoring="accuracy",return_train_score=True, n_jobs=-1)
grid_search.fit(preprocessed_train_X,train_y)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight=None, early_stopping=False,
                                     epsilon=0.1, eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='hinge', max_iter=100,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2', power_t=0.5, random_state=42,
                                     shuffle=True, tol=0.001,
                                     validation_fraction=0.1, verbose=0,
                                     warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'class_weight': ['balanced', None],
                         'early_stopping': [True, False],
                         'loss': ['hinge', 'log', 'modified_huber',
                 

In [90]:
grid_search.best_params_

{'class_weight': None,
 'early_stopping': False,
 'loss': 'hinge',
 'penalty': 'l1'}

In [91]:
sgd_clf=SGDClassifier(loss='hinge',penalty='l1',max_iter=100,random_state=42)
sgd_clf.fit(preprocessed_train_X,train_y)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=100, n_iter_no_change=5, n_jobs=None, penalty='l1',
              power_t=0.5, random_state=42, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [100]:
# 튜닝된 모델로 훈련 데이터셋 다시 예측
pred_train_y=sgd_clf.predict(preprocessed_train_X)
train_score=accuracy_score(train_y,pred_train_y)
train_score

0.8114478114478114

### 5. 테스트 데이터로 평가

In [92]:
preprocessed_test_X=preprocessing_pipeline.fit_transform(test_X)

In [93]:
pd.DataFrame(preprocessed_test_X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1.0,0.452723,0.0,0.0,0.015282,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,1.0,0.617566,0.125,0.0,0.013663,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.5,0.815377,0.0,0.0,0.018909,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,1.0,0.353818,0.0,0.0,0.016908,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,1.0,0.287881,0.125,0.111111,0.023984,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0


In [94]:
preprocessing_pipeline=Pipeline([
    ('data_remover',DataFrameRemover(["PassengerId","Name","Ticket","Cabin"])),
    ('one_hot_encoder',MyOneHotEncoder(["Sex","Embarked"])),
    ('imputer',SimpleImputer(strategy="median")),
    ('scaler',MinMaxScaler())
])

In [95]:
preprocessed_test_X=preprocessing_pipeline.fit_transform(test_X)

In [96]:
pd.DataFrame(preprocessed_test_X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.452723,0.0,0.0,0.015282,0.0,1.0,0.0,1.0,0.0
1,1.0,0.617566,0.125,0.0,0.013663,1.0,0.0,0.0,0.0,1.0
2,0.5,0.815377,0.0,0.0,0.018909,0.0,1.0,0.0,1.0,0.0
3,1.0,0.353818,0.0,0.0,0.016908,0.0,1.0,0.0,0.0,1.0
4,1.0,0.287881,0.125,0.111111,0.023984,1.0,0.0,0.0,0.0,1.0


In [97]:
pred_test_y=sgd_clf.predict(preprocessed_test_X)

In [98]:
confusion_matrix(test_y,pred_test_y)

array([[262,   4],
       [ 18, 134]], dtype=int64)

In [99]:
# test scores
print(precision_score(test_y,pred_test_y)) # 높은 정밀도 : 1로 예측된 것 중 실제 1이 많음
print(recall_score(test_y,pred_test_y)) # 낮은 재현율 : 실제 1 중에서 0으로 예측된 것이 많음? -> 즉 0으로 예측하는 경향이 있는 모델?
print(accuracy_score(test_y,pred_test_y)) ### ??? 테스트 데이터셋에서 정확도가 높은 이유는...????

0.9710144927536232
0.881578947368421
0.9473684210526315


### 6. 분석

In [101]:
# 실제로 죽은 사람 549, 실제로 산 사람 332명
confusion_matrix(train_y,pred_train_y)

array([[497,  52],
       [116, 226]], dtype=int64)

In [102]:
# 실제로 죽은 사람 264명, 실제로 산 사람 152명
confusion_matrix(test_y,pred_test_y)

array([[262,   4],
       [ 18, 134]], dtype=int64)

훈련과 시험 데이터셋에서 산사람과 죽은 사람의 비율은 비슷한데..?