# Gradient Boosting Model

## 변수 처리 방법

* Passenger ID : 제거
* Pclass : Categorical 변수에 포함되는 Ordinal 변수이며, 계급이 많지 않으므로 카테고리 인코딩 후 사용
* Sex : Label - Encode
* Age : 결측값을 Mean으로 대체
* Sibsp, Parch : 처리 없음
* Ticket : Group변수 생성 이후 삭제 
* Fare : 동일 Ticket값을 가진 인원수로 나누어 1인요금 산출 후 삭제
* Cabin : 앞 알파벳만 이용해 CabinCode생성 후 삭제
* Embarked : one-hot encoding (NaN값은 2개뿐이므로 삭제)

## 생성 변수

* Group : 동행인 여부를 나타냄. 동일 티켓으로 승선한 다른 사람이 있거나, (Sibsp+Parch > 0)이면 1 아니면 0
* CabinCode : Cabin의 앞 알파벳 정보

In [216]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
pd.set_option('display.max_columns', None)


### Import Data

In [226]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")


80.0

### Preprocessing

In [227]:
def zscore(series):
    result = (series-series.mean())/series.std()
    return result

def minmax(series):
    result = (series-series.min())/(series.max()-series.min())
    return result

def preprocessor(df, normalizer = zscore):
    # Missing Value handling (Age, Embarked)
    print("BEFORE\n")
    print(df.isnull().sum())

    df.Age.fillna(df.Age.mean(),inplace=True)
    df = df[df.Embarked.notna()]

    print('\nAFTER\n')
    print(df.isnull().sum())

    # Make new variable(Group)
    df['FamilySize'] = df.SibSp + df.Parch + 1 # include self
    df['DupCnt'] = df.groupby(['Ticket']).Ticket.transform('count')
    df['Group'] = ~((df.FamilySize==1)&(df.DupCnt==1))
    df['GroupCnt'] = df[['FamilySize','DupCnt']].max(axis=1)
    df['FareInd'] = df.Fare/df.GroupCnt

    df['CabinCode'] = df[df.Cabin.notna()].Cabin.map(lambda x:x[0]) # extract cabin code

    # Categorical Feature Encding
    df['SexMale'] = df.Sex.map(lambda x : False if x == 'Female' else True) # female = 0 , male = 1
    df = pd.get_dummies(df, columns = ['Pclass', 'Embarked', 'CabinCode'])

    # log transformation (FareInd)
    df['FareInd'] = np.log1p(df['FareInd'])
    
    # Normalization (All of Numeric variables)
    df['Age'] = normalizer(df['Age'])
    df['SibSp'] = normalizer(df['SibSp'])
    df['Parch'] = normalizer(df['Parch'])
    df['FareInd'] = normalizer(df['FareInd'])
    
    # Drop useless columns
    df = df.drop(columns = ['PassengerId','Sex', 'Name','Ticket','Fare','Cabin', 'FamilySize', 'DupCnt', 'GroupCnt'])
    return df

In [228]:
train = preprocessor(train)
test = preprocessor(test)
train.head()

BEFORE

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

AFTER

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64
BEFORE

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

AFTER

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

Unnamed: 0,Survived,Age,SibSp,Parch,Group,FareInd,SexMale,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,CabinCode_A,CabinCode_B,CabinCode_C,CabinCode_D,CabinCode_E,CabinCode_F,CabinCode_G,CabinCode_T
0,0,-0.590163,0.431108,-0.474059,True,-1.266273,True,0,0,1,0,0,1,0,0,0,0,0,0,0,0
1,1,0.643609,0.431108,-0.474059,True,1.495543,True,1,0,0,1,0,0,0,0,1,0,0,0,0,0
2,1,-0.28172,-0.474932,-0.474059,False,-0.389066,True,0,0,1,0,0,1,0,0,0,0,0,0,0,0
3,1,0.412277,0.431108,-0.474059,True,1.114996,True,1,0,0,0,0,1,0,0,1,0,0,0,0,0
4,0,0.412277,-0.474932,-0.474059,False,-0.370507,True,0,0,1,0,0,1,0,0,0,0,0,0,0,0


In [229]:
train_x = train.drop(columns = ['Survived'])
train_y = train['Survived']

test_x = test
test_y = pd.read_csv('gender_submission.csv').Survived

In [230]:
test.describe()

Unnamed: 0,Age,SibSp,Parch,FareInd,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,CabinCode_A,CabinCode_B,CabinCode_C,CabinCode_D,CabinCode_E,CabinCode_F,CabinCode_G
count,418.0,418.0,418.0,417.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,3.8246920000000005e-17,-1.381139e-17,-5.5245550000000004e-17,9.904148000000001e-17,0.255981,0.222488,0.521531,0.244019,0.110048,0.645933,0.016746,0.043062,0.083732,0.0311,0.021531,0.019139,0.002392
std,1.0,1.0,1.0,1.0,0.436934,0.416416,0.500135,0.430019,0.313324,0.478803,0.128474,0.20324,0.277317,0.173797,0.14532,0.137177,0.048912
min,-2.382564,-0.4988722,-0.3997686,-3.363215,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.5756121,-0.4988722,-0.3997686,-0.5585289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,-0.4988722,-0.3997686,-0.4113413,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.4335268,0.6162539,-0.3997686,0.6982772,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3.61924,8.422137,8.770534,3.859113,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [231]:
gb_model = GradientBoostingClassifier(random_state=0) 
params = {"n_estimators":[5,50,250,500],
    "max_depth":[1,3,5,7,9],
    "learning_rate":[0.01,0.1,1,10,100]}
gs = GridSearchCV(gb_model,cv=5, param_grid=params)
gs.fit(train_x,train_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [5, 50, 250, 500], 'max_depth': [1, 3, 5, 7, 9], 'learning_rate': [0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [232]:
gs.best_score_

0.734533183352081

In [233]:
gs.best_params_

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}

In [234]:
gs.best_estimator_

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              n_iter_no_change=None, presort='auto', random_state=0,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)