In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the three competition CSV files from the working directory, in the
# order train / test / sample submission.
train, test, gender_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "gender_submission.csv")
)

## Model: Gradient Boost

### 변수 처리 방법
- PassengerId : 제거
- Name: 모델링에서 사용 X
- SibSp : integer 로 처리
- Parch: integer 로 처리
- Ticket : 해당 컬럼 자체는 사용하지 않고 추후 group 컬럼으로 unique count 해서 따로 만듦
- Pclass : integer 처리(1,2,3 그대로 냅둠)
- Age : 연속형 변수. null 값은 mean 으로 처리
- Cabin : 앞 알파벳만 따와서 categorical 로 사용
- Fare : 단체 티켓 가격 합쳐진 것은 1인당 가격으로 수정해서 사용. 연속형 변수
- Embarked : one-hot encoding / null 값은 drop

### 생성 변수
- group : 가족 규모(SibSp + Parch + 1)가 2명 이상이거나, 해당 ticket 번호를 가진 사람이 2명 이상일 경우 -> True / 아니면 False


### scaling
- X

In [3]:
# Drop the rows whose Embarked value is missing.
# .copy() makes the result an independent frame, so the column assignment
# below does not operate on a slice of the raw data (SettingWithCopyWarning).
train = train[train['Embarked'].notnull()].copy()

# Set the target variable aside before it is removed from the features.
target = train["Survived"]

# Remove the unused columns and the target from the feature frame.
# `columns=` is used because passing axis positionally (drop(labels, 1))
# is rejected by pandas >= 2.0.
train = train.drop(columns=['PassengerId', 'Name', 'Survived'])

# Fill missing Age values with the mean age. The result is assigned back
# instead of calling fillna(inplace=True) on the selected column: that chained
# in-place form does not update `train` once Copy-on-Write is enabled.
train['Age'] = train['Age'].fillna(train['Age'].mean())

In [5]:
# Build the group features, the per-person fare and the cabin letter.
# The arguments are evaluated in order, so later ones can use the columns
# created by earlier ones; the column order matches one-by-one assignment.
train = train.assign(
    # Party size from the family columns: siblings/spouses + parents/children + self.
    Family_size=lambda df: df['SibSp'] + df['Parch'] + 1,
    # Number of rows sharing the same ticket number.
    dup_count=lambda df: df.groupby(['Ticket'])['Ticket'].transform('count'),
    # True unless the passenger is alone by both measures.
    Group=lambda df: (df['Family_size'] != 1) | (df['dup_count'] != 1),
    # The fare is divided by the larger of family size and ticket-sharing count.
    Group_count=lambda df: df[['Family_size', 'dup_count']].max(axis=1),
    Fare_ind=lambda df: df['Fare'] / df['Group_count'],
    # First character of the cabin code; rows without a cabin stay NaN
    # through index alignment.
    Cabin_alpha=lambda df: df['Cabin'].dropna().map(lambda cabin: cabin[0]),
)

In [6]:
# Remove the helper columns and the raw columns that were replaced by derived
# features. `columns=` is used because passing axis positionally
# (drop(labels, 1)) is rejected by pandas >= 2.0.
train = train.drop(
    columns=['Family_size', 'dup_count', 'Group_count', 'Fare', 'Ticket', 'Cabin']
)
# Rows without a cabin letter get the placeholder category "N". Assigned back
# rather than fillna(inplace=True) on the selected column, which does not
# update `train` once Copy-on-Write is enabled.
train['Cabin_alpha'] = train['Cabin_alpha'].fillna("N")

In [7]:
# One-hot encode the categorical columns. Sex and Group are binary, so one
# dummy from each pair (Sex_female, Group_False) is dropped as redundant.
# `columns=` is used because passing axis positionally (drop(labels, 1))
# is rejected by pandas >= 2.0.
train = pd.get_dummies(
    train, columns=['Sex', 'Embarked', 'Group', 'Cabin_alpha']
).drop(columns=['Sex_female', 'Group_False'])
# Replace the per-person fare with log(1 + fare).
train['Fare_ind'] = np.log1p(train['Fare_ind'])
# Renumber the rows 0..n-1 and discard the old index in a single step.
train = train.reset_index(drop=True)

In [10]:
# Exhaustive grid search with 5-fold cross-validation over the
# gradient-boosting hyper-parameters listed below.
gb_model = GradientBoostingClassifier(random_state=0)
params = dict(
    n_estimators=[5, 50, 250, 500],
    max_depth=[1, 3, 5, 7, 9],
    learning_rate=[0.01, 0.1, 1, 10, 100],
)
gs = GridSearchCV(estimator=gb_model, param_grid=params, cv=5)
# Kept as the final expression so the fitted search object is displayed.
gs.fit(train, target)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [5, 50, 250, 500], 'max_depth': [1, 3, 5, 7, 9], 'learning_rate': [0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [11]:
# Mean cross-validated score of the best parameter combination found by the grid search.
gs.best_score_

0.8278965129358831