In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
def encode_features(df):
    """Label-encode the categorical columns of ``df``.

    ``Cabin`` is first truncated to its leading character; ``Cabin``, ``Sex``
    and ``Embarked`` are then each replaced by the codes produced by a fresh
    ``LabelEncoder``.  The columns are overwritten on ``df`` itself and the
    same frame is returned.
    """
    # Only the first character of the cabin string is kept before encoding.
    df['Cabin'] = df['Cabin'].str[:1]

    for column in ('Cabin', 'Sex', 'Embarked'):
        values = df[column]
        # One independent encoder per column.
        encoder = LabelEncoder()
        encoder.fit(values)
        df[column] = encoder.transform(values)

    return df


# Null-handling function
def fillna(df):
    """Fill missing values in the Age, Cabin, Embarked and Fare columns.

    * ``Age``      -> mean of the non-missing ages
    * ``Cabin``    -> placeholder string ``'N'``
    * ``Embarked`` -> placeholder string ``'N'``
    * ``Fare``     -> ``0``

    Each column is reassigned on ``df`` itself, so the caller's frame is
    updated and that same object is returned.

    The columns are reassigned instead of calling
    ``df[col].fillna(..., inplace=True)``: that form fills an intermediate
    Series obtained by chained indexing, which emits a chained-assignment
    warning in recent pandas releases and leaves ``df`` unchanged when
    Copy-on-Write is active.
    """
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    df['Fare'] = df['Fare'].fillna(0)
    return df

# Function that removes features not needed downstream
def drop_feature(df):
    """Drop the ``PassengerId``, ``Name`` and ``Ticket`` columns from ``df``.

    The columns are removed in place and the same frame is returned.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(columns=unused_columns, inplace=True)
    return df

def transform_features(df):
    """Run the preprocessing steps on ``df`` and return the result.

    The steps are applied in this order: ``fillna``, ``drop_feature``,
    ``encode_features``.  Each step receives the frame returned by the
    previous one.
    """
    for step in (fillna, drop_feature, encode_features):
        df = step(df)
    return df

# Load the training CSV, keep the 'Survived' column as the label, and run the
# remaining columns through the preprocessing pipeline.
df = pd.read_csv('titanic_train.csv')
y_df = df['Survived']
x_df = transform_features(df.drop(columns='Survived'))

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_df,
    y_df,
    test_size=0.2,
    random_state=11,
)

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Seed the tree-based estimators so a "Restart Kernel -> Run All" reproduces
# the same scores; 11 is the seed already used for the train/test split.
dt_clf=DecisionTreeClassifier(random_state=11)
rf_clf=RandomForestClassifier(random_state=11)
lr_clf=LogisticRegression()

# DecisionTreeClassifier: fit on the training split, score on the hold-out split
dt_clf.fit(x_train, y_train)
pred=dt_clf.predict(x_test)
print(f'#DecisionTreeClassifier accuracy_score(y_test, pred):{accuracy_score(y_test, pred)}')

# RandomForestClassifier: same procedure
rf_clf.fit(x_train,y_train)
pred=rf_clf.predict(x_test)
print(f'#RandomForestClassifier accuracy_score(y_test, pred):{accuracy_score(y_test, pred)}')

# LogisticRegression: same procedure
lr_clf.fit(x_train, y_train)
pred=lr_clf.predict(x_test)
print(f'#LogisticRegression accuracy_score(y_test, pred) : {accuracy_score(y_test, pred)}')


In [7]:
# Preview the first three rows of the transformed feature frame.
x_df.head(3)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,1,22.0,1,0,7.25,7,3
1,1,0,38.0,1,0,71.2833,2,0
2,3,0,26.0,0,0,7.925,7,3


In [8]:
# Preview the first three values of the 'Survived' label column.
y_df.head(3)

0    0
1    1
2    1
Name: Survived, dtype: int64

In [9]:
#KFold
from sklearn.model_selection import KFold

def exec_kfold(clf, folds=5):
    """Fit and score ``clf`` on every KFold split, printing the accuracies.

    Reads the notebook-level ``x_df`` (features) and ``y_df`` (labels).  For
    each split the classifier is refit on the training rows and scored on the
    held-out rows; the per-fold accuracy and finally the mean accuracy are
    printed.  Nothing is returned.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict``
    folds : int, default 5
        Forwarded to ``KFold`` as ``n_splits``.
    """
    splitter = KFold(n_splits=folds)
    # Work on the underlying arrays so rows can be selected by position.
    features, labels = x_df.values, y_df.values
    fold_scores = []

    for fold_no, (fit_idx, eval_idx) in enumerate(splitter.split(x_df)):
        clf.fit(features[fit_idx], labels[fit_idx])
        fold_pred = clf.predict(features[eval_idx])
        fold_accuracy = accuracy_score(labels[eval_idx], fold_pred)
        fold_scores.append(fold_accuracy)
        print(f'iter_count:{fold_no}, kfold accuracy : {fold_accuracy:.4f}')

    print(f'kfold mean_score : {np.mean(fold_scores):.4f}')

exec_kfold(dt_clf, folds=5)

iter_count:0, accuracy : 0.7486
iter_count:1, accuracy : 0.7584
iter_count:2, accuracy : 0.7865
iter_count:3, accuracy : 0.7584
iter_count:4, accuracy : 0.8371
mean_score : 0.7778


In [11]:
# cross_val_score: with an integer cv and a classifier, folds come from StratifiedKFold
from sklearn.model_selection import cross_val_score

# Score on the full feature/label frames (x_df, y_df) built earlier in the
# notebook; the bare names `x` and `y` are never defined and would raise
# NameError on a fresh kernel.
scores = cross_val_score(dt_clf, x_df, y_df, cv=5)
for accuracy in scores:
    print(f'cross_val_score accuracy : {accuracy:.4f}')

print(f'mean_score : {np.mean(scores):.4f}')

cross_val_score accuracy : 0.7542
cross_val_score accuracy : 0.7877
cross_val_score accuracy : 0.7978
cross_val_score accuracy : 0.7865
cross_val_score accuracy : 0.8249
mean_score : 0.7902


In [14]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameter values; the grid search tries every combination.
p = {
    'max_depth': [2, 3, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 5, 8],
}

grid_dclf = GridSearchCV(
    dt_clf,
    param_grid=p,
    scoring='accuracy',
    cv=5,
)
grid_dclf.fit(x_train, y_train)

# Out of the search results, report the best parameter set and its score.
print(f'grid_dclf.best_params_ : \n{grid_dclf.best_params_}')
print(f'grid_dclf.best_score_ : {grid_dclf.best_score_}')

# Score the best estimator found by the search on the held-out test split.
best_dclf = grid_dclf.best_estimator_
dpred = best_dclf.predict(x_test)
print(f'best_estimator_ accuracy_score(y_test, dpred) : {accuracy_score(y_test, dpred):.4f}')


grid_dclf.best_params_ : 
{'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 5}
grid_dclf.best_score_ : 0.7991573033707865
best_estimator_ accuracy_score(y_test, dpred) : 0.8715


