In [208]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [209]:
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")

In [210]:
train.shape
train.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [211]:
train["Age"].fillna(train["Age"].mean(), inplace=True)
train["Cabin"].fillna("N", inplace=True)
train["Embarked"].fillna("N", inplace=True)

In [212]:
train["Cabin"] = train["Cabin"].str[:1]

In [213]:
from sklearn import preprocessing

def endcode_features(dataDF):
    features = ["Cabin", "Sex", "Embarked"]
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(dataDF[feature])
        dataDF[feature] = le.transform(dataDF[feature])
    return dataDF

train = endcode_features(train)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,7,3
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,2,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,7,3
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,2,3
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,7,3


In [214]:
def fillna(df):
    df["Age"].fillna(train["Age"].mean(), inplace=True)
    df["Cabin"].fillna("N", inplace=True)
    df["Embarked"].fillna("N", inplace=True)
    df["Fare"].fillna(0, inplace=True)
    return df

In [215]:
def drop_features(df):
    df.drop(["PassengerId", "Name", "Ticket"], axis=1, inplace=True)
    return df

In [216]:
def format_features(df):
    df["Cabin"] = df["Cabin"].str[:1]
    features = ["Cabin", "Sex", "Embarked"]
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

In [217]:
def transform_features(df):
    df = fillna(df)
    df = drop_features(df)
    df = format_features(df)
    return df    

In [218]:
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")

In [219]:
x = train.drop(columns=["Survived"], axis=1)
y = train["Survived"]

x = transform_features(x)
test_passengerId = test["PassengerId"]
test = transform_features(test)

In [220]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=11)

In [221]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
import xgboost as xgb
import lightgbm as lgb

from sklearn.metrics import accuracy_score

In [222]:
dt = DecisionTreeClassifier(random_state=11)
rf = RandomForestClassifier(random_state=11)
lr = LogisticRegression()
lgb_model = lgb.LGBMClassifier(objective="binary", random_state=11)
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=11)

In [223]:
xgb_model.fit(x_train, y_train)
xgb_model_pred = xgb_model.predict(x_test)
print("XGBoost 정확도: {0: .4f}".format(accuracy_score(y_test, xgb_model_pred)))

XGBoost 정확도:  0.8771


In [224]:
lgb_model.fit(x_train, y_train)
lgb_model_pred = lgb_model.predict(x_test)
print("LighGBM 정확도: {0: .4f}".format(accuracy_score(y_test, lgb_model_pred)))

LighGBM 정확도:  0.8603


In [225]:
dt.fit(x_train, y_train)
dt_pred = dt.predict(x_test)
print("DecisionTree 정확도: {0: .4f}".format(accuracy_score(y_test, dt_pred)))

DecisionTree 정확도:  0.7877


In [226]:
rf.fit(x_train, y_train)
rf_pred = rf.predict(x_test)
print("RandomForest정확도: {0: .4f}".format(accuracy_score(y_test, rf_pred)))

RandomForest정확도:  0.8324




In [227]:
lr.fit(x_train, y_train)
lr_pred = lr.predict(x_test)
print("LogisticRegression 정확도: {0: .4f}".format(accuracy_score(y_test, lr_pred)))

LogisticRegression 정확도:  0.8659




In [228]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
scoring = 'accuracy'

In [229]:
from sklearn.model_selection import GridSearchCV

parameters = {"max_depth": [2, 3, 5, 10],
             "min_samples_split": [2, 3, 5, 10, 15, 20, 25], "min_samples_leaf": [1, 5, 8]}

grid = GridSearchCV(lgb_model, param_grid=parameters, scoring="accuracy", cv=5)
grid.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=-1,
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=100,
                                      n_jobs=-1, num_leaves=31,
                                      objective='binary', random_state=11,
                                      reg_alpha=0.0, reg_lambda=0.0,
                                      silent=True, subsample=1.0,
                                      subsample_for_bin=200000,
                                      subsample_freq=0),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [2, 3,

In [230]:
print("최적 하이퍼 파라미터", grid.best_params_)
print("최고 정확도: {0: .4f}".format(grid.best_score_))
best_df = grid.best_estimator_

최적 하이퍼 파라미터 {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
최고 정확도:  0.8202


In [231]:
predictions = best_df.predict(x_test)
accuracy = accuracy_score(y_test, predictions)
print("정확도 {0: .4f}".format(accuracy))

정확도  0.8827


In [232]:
result = best_df.predict(test)

In [233]:
submission = pd.DataFrame({
    "PassengerId": test_passengerId,
    "Survived": result
})

In [235]:
submission.to_csv("0328_book_xgboost_{0: .4f}.csv".format(88.27), index=False)