In [275]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [276]:
titanic_train=pd.read_csv("datasets/train.csv")
titanic_test_X=pd.read_csv("datasets/test.csv")
titanic_test_y=pd.read_csv("datasets/gender_submission.csv")

# 1. 데이터 훑어보기

In [277]:
titanic_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


- Survived: 타깃입니다. 0은 생존하지 못한 것이고 1은 생존을 의미합니다.
- Pclass: 승객 등급. 1, 2, 3등석.
- Name, Sex, Age: 이름 그대로 의미입니다.
- SibSp: 함께 탑승한 형제, 배우자의 수.
- Parch: 함께 탑승한 자녀, 부모의 수.
- Ticket: 티켓 아이디
- Fare: 티켓 요금 (파운드)
- Cabin: 객실 번호
- Embarked: 승객이 탑승한 곳. C(Cherbourg), Q(Queenstown), S(Southampton)

In [278]:
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [279]:
titanic_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


# 2. 데이터 전처리

In [280]:
# 정답레이블과 입력 데이터로 나누기
train_X=titanic_train.drop("Survived",axis=1)
train_y=titanic_train["Survived"].copy()

In [281]:
def split_X_y(dataset):
    X=dataset.drop("Survived",axis=1)
    y=dataset["Survived"].copy
    return X,y

### 1) 불필요한 열 제거

In [282]:
# 불필요한 정보 제거
temp_X=train_X.drop(["PassengerId","Name","Ticket","Cabin"],axis=1)

In [283]:
temp_X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


In [284]:
# 변환기
from sklearn.base import BaseEstimator,TransformerMixin
class DataFrameRemover(BaseEstimator,TransformerMixin):
    def __init__(self,attribute_list):
        self.attribute_list=attribute_list
    def fit(self,X):
        return self
    def transform(self,X):
        return X.drop(self.attribute_list,axis=1) # Pandas Dataframe으로 반환

In [285]:
# 변환 성공
new_temp_X=train_X.copy()

tf=DataFrameRemover(["PassengerId","Name","Ticket","Cabin"])
tf.fit(new_temp_X)
new_temp_X=tf.transform(new_temp_X)
new_temp_X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


### 2) 범주형 데이터 One-hot-encoding

In [286]:
# 범주형 데이터 One-hot-encoding
temp_X_Sex=pd.get_dummies(train_X["Sex"])
temp_X_Embarked=pd.get_dummies(temp_X["Embarked"],prefix="Embarked")

In [287]:
temp_X_Sex

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
886,0,1
887,1,0
888,1,0
889,0,1


In [288]:
temp_X_Embarked

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
886,0,0,1
887,0,0,1
888,0,0,1
889,1,0,0


In [289]:
temp_X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


In [290]:
temp_X=temp_X.drop(["Sex","Embarked"],axis=1)
temp_X=pd.concat([temp_X,temp_X_Sex,temp_X_Embarked],axis=1)
temp_X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,female,male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.2500,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,26.0,0,0,7.9250,1,0,0,0,1
3,1,35.0,1,0,53.1000,1,0,0,0,1
4,3,35.0,0,0,8.0500,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,0,1,0,0,1
887,1,19.0,0,0,30.0000,1,0,0,0,1
888,3,,1,2,23.4500,1,0,0,0,1
889,1,26.0,0,0,30.0000,0,1,1,0,0


In [291]:
#변환기

class MyOneHotEncoder(BaseEstimator,TransformerMixin):
    def __init__(self,attribute_list):
        self.attribute_list=attribute_list
        self.one_hot_list=[]
    def fit(self,X):
        #temp_X_Embarked=pd.get_dummies(temp_X["Embarked"],prefix="Embarked")
        for attribute in self.attribute_list:
            one_hot_data=pd.get_dummies(X[attribute],prefix=attribute)
            self.one_hot_list.append(one_hot_data)
        return self
    def transform(self,X):
        #temp_X=temp_X.drop(["Sex","Embarked"],axis=1)
        #temp_X=pd.concat([temp_X,temp_X_Sex,temp_X_Embarked],axis=1)
        new_X=X.drop(self.attribute_list,axis=1)
        new_X=pd.concat([new_X]+self.one_hot_list,axis=1)
        
        return new_X # pandas Dataframe으로 반환

In [292]:
# 변환 성공
ohe=MyOneHotEncoder(["Sex","Embarked"])
ohe.fit(new_temp_X)
ohe.transform(new_temp_X)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.2500,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,26.0,0,0,7.9250,1,0,0,0,1
3,1,35.0,1,0,53.1000,1,0,0,0,1
4,3,35.0,0,0,8.0500,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,0,1,0,0,1
887,1,19.0,0,0,30.0000,1,0,0,0,1
888,3,,1,2,23.4500,1,0,0,0,1
889,1,26.0,0,0,30.0000,0,1,1,0,0


### 3) 누락된 값에 대한 전처리

In [293]:
# 누락된 값 채워주기
from sklearn.impute import SimpleImputer

imputer=SimpleImputer(strategy="median")
imputer.fit(temp_X)
temp_X=imputer.transform(temp_X) # numpy 행렬 반환

In [294]:
pd.DataFrame(temp_X).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
0    891 non-null float64
1    891 non-null float64
2    891 non-null float64
3    891 non-null float64
4    891 non-null float64
5    891 non-null float64
6    891 non-null float64
7    891 non-null float64
8    891 non-null float64
9    891 non-null float64
dtypes: float64(10)
memory usage: 69.7 KB


In [295]:
# 특성 스케일링
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
#scaler.fit(pd.DataFrame(temp_X))
#temp_X=scaler.transform(pd.DataFrame(temp_X))
scaler.fit(temp_X)
temp_X=scaler.transform(temp_X) # numpy 행렬 반환

In [296]:
pd.DataFrame(temp_X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.271174,0.125,0.0,0.014151,0.0,1.0,0.0,0.0,1.0
1,0.0,0.472229,0.125,0.0,0.139136,1.0,0.0,1.0,0.0,0.0
2,1.0,0.321438,0.0,0.0,0.015469,1.0,0.0,0.0,0.0,1.0
3,0.0,0.434531,0.125,0.0,0.103644,1.0,0.0,0.0,0.0,1.0
4,1.0,0.434531,0.0,0.0,0.015713,0.0,1.0,0.0,0.0,1.0


### 변환 파이프라인

In [297]:
from sklearn.pipeline import Pipeline

full_pipeline=Pipeline([
    ('data_remover',DataFrameRemover(["PassengerId","Name","Ticket","Cabin"])),
    ('one_hot_encoder',MyOneHotEncoder(["Sex","Embarked"])),
    ('imputer',SimpleImputer(strategy="median")),
    ('scaler',MinMaxScaler())
])

In [298]:
preprocessed_X=full_pipeline.fit_transform(train_X)

In [299]:
pd.DataFrame(preprocessed_X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.271174,0.125,0.0,0.014151,0.0,1.0,0.0,0.0,1.0
1,0.0,0.472229,0.125,0.0,0.139136,1.0,0.0,1.0,0.0,0.0
2,1.0,0.321438,0.0,0.0,0.015469,1.0,0.0,0.0,0.0,1.0
3,0.0,0.434531,0.125,0.0,0.103644,1.0,0.0,0.0,0.0,1.0
4,1.0,0.434531,0.0,0.0,0.015713,0.0,1.0,0.0,0.0,1.0


# 모델 선택, 훈련, 평가

In [300]:
from sklearn.linear_model import SGDClassifier
sgd_clf=SGDClassifier(max_iter=100,random_state=42)
sgd_clf.fit(preprocessed_X,train_y)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=100, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=42, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [301]:
from sklearn.model_selection import cross_val_score
scores=cross_val_score(sgd_clf,preprocessed_X,train_y,scoring="accuracy",cv=5)

In [302]:
scores 

array([0.73184358, 0.68715084, 0.79213483, 0.80898876, 0.81355932])

In [303]:
from sklearn.model_selection import cross_val_predict
predictions=cross_val_predict(sgd_clf,preprocessed_X,train_y,cv=5)

In [304]:
predictions ###??? 결과가 이러한 이유좀....?

array([0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,

In [305]:
sgd_clf.predict(preprocessed_X)

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,

In [306]:
from sklearn.model_selection import GridSearchCV

param_grid=\
    {'loss':['hinge','log','modified_huber','squared_hinge','perceptron'],
     'penalty':['l1','l2','elasticnet'],
     'early_stopping':[True,False],
     'class_weight':['balanced',None]
    }
gird_search=GridSearchCV(sgd_clf,param_grid,cv=5,scoring="accuracy",return_train_score=True, n_jobs=-1)
gird_search.fit(preprocessed_X,train_y)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight=None, early_stopping=False,
                                     epsilon=0.1, eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='hinge', max_iter=100,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2', power_t=0.5, random_state=42,
                                     shuffle=True, tol=0.001,
                                     validation_fraction=0.1, verbose=0,
                                     warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'class_weight': ['balanced', None],
                         'early_stopping': [True, False],
                         'loss': ['hinge', 'log', 'modified_huber',
                 

In [307]:
random_search.best_params_

{'penalty': 'elasticnet',
 'loss': 'log',
 'early_stopping': False,
 'class_weight': None}

In [308]:
sgd_clf=SGDClassifier(loss='log',penalty='elasticnet',max_iter=100,random_state=42)
sgd_clf.fit(preprocessed_X,train_y)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=100,
              n_iter_no_change=5, n_jobs=None, penalty='elasticnet',
              power_t=0.5, random_state=42, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [309]:
scores=cross_val_score(sgd_clf,preprocessed_X,train_y,scoring="accuracy",cv=5)
scores

array([0.77653631, 0.77094972, 0.79213483, 0.78089888, 0.72316384])

# 모델 평가 (testing)

In [310]:
test_X,test_y=titanic_test_X,titanic_test_y["Survived"]
preprocessed_test_X=full_pipeline.fit_transform(test_X)

In [311]:
pd.DataFrame(preprocessed_test_X).head() # 결과가 이렇게 나오는 이유좀..?

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1.0,0.452723,0.0,0.0,0.015282,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,1.0,0.617566,0.125,0.0,0.013663,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.5,0.815377,0.0,0.0,0.018909,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,1.0,0.353818,0.0,0.0,0.016908,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,1.0,0.287881,0.125,0.111111,0.023984,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0


In [312]:
full_pipeline=Pipeline([
    ('data_remover',DataFrameRemover(["PassengerId","Name","Ticket","Cabin"])),
    ('one_hot_encoder',MyOneHotEncoder(["Sex","Embarked"])),
    ('imputer',SimpleImputer(strategy="median")),
    ('scaler',MinMaxScaler())
])

In [313]:
preprocessed_test_X=full_pipeline.fit_transform(test_X)
pd.DataFrame(preprocessed_test_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.452723,0.000,0.000000,0.015282,0.0,1.0,0.0,1.0,0.0
1,1.0,0.617566,0.125,0.000000,0.013663,1.0,0.0,0.0,0.0,1.0
2,0.5,0.815377,0.000,0.000000,0.018909,0.0,1.0,0.0,1.0,0.0
3,1.0,0.353818,0.000,0.000000,0.016908,0.0,1.0,0.0,0.0,1.0
4,1.0,0.287881,0.125,0.111111,0.023984,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
413,1.0,0.353818,0.000,0.000000,0.015713,0.0,1.0,0.0,0.0,1.0
414,0.0,0.512066,0.000,0.000000,0.212559,1.0,0.0,1.0,0.0,0.0
415,1.0,0.505473,0.000,0.000000,0.014151,0.0,1.0,0.0,0.0,1.0
416,1.0,0.353818,0.000,0.000000,0.015713,0.0,1.0,0.0,0.0,1.0


In [314]:
from sklearn.metrics import precision_score,recall_score,accuracy_score

sgd_clf=SGDClassifier(loss='log',penalty='elasticnet',max_iter=100,random_state=42)
sgd_clf.fit(preprocessed_X,train_y)
pred_y=sgd_clf.predict(preprocessed_test_X)

In [315]:
print(precision_score(pred_y,test_y))
print(recall_score(pred_y,test_y))
print(accuracy_score(pred_y,test_y))

0.8947368421052632
0.9927007299270073
0.9593301435406698


결정 임계값 조절 필요한듯...