# Random Forest Model
## 변수 처리 방법
- Passenger ID : 제거
- Pclass : Ordinal 변수이므로 그대로 채용.
- Sex : Label-Encode
- Age : 그대로 (Mean으로 결측값 Impute)
- Sibsp, Parch : 그대로
- Ticket : 제거
- Fare : Ticket 중복 수와 가족 수(SibSp+Parch+1) 중 큰 값(Group_count)으로 나누어 1인당 요금(Fare_ind)으로 변환하여 사용하고, 원래 Fare 변수는 제거
- Cabin : 첫 글자만 추출한 Cabin_alpha 변수로 변환 후 Label-Encode (결측값은 'N'으로 대체)
- Embarked : Label-Encode, 결측값은 'C'로 대체 (결측 승객들과 같은 Pclass 승객들의 Fare 평균을 비교하여 'C'인 것으로 유추)

## 생성변수
- Group : Ticket 변수가 unique하지 않거나 Sibsp+parch > 0 일때 1, 아닐때 0을 가지는 단체승객여부 변수.

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn import preprocessing
import seaborn as sns

In [29]:
# Load the training and test CSV files from the current working directory.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [57]:
# Import Dataset
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")


def _cabin_deck(cabin):
    """Return the first character of each cabin string, using 'N' for missing cabins."""
    return cabin.str[0].fillna("N")


def _add_group_features(frame):
    """Add the group-travel features to *frame* in place.

    Creates Family_size (SibSp + Parch + 1), dup_count (number of rows in
    *frame* sharing the same Ticket), Group (True unless the passenger is
    alone on both measures), Group_count (the larger of the two sizes) and
    Fare_ind (Fare divided by Group_count).  Counts are computed within
    *frame* only, so train and test are counted independently.
    """
    frame['Family_size'] = frame['SibSp'] + frame['Parch'] + 1
    frame['dup_count'] = frame.groupby(['Ticket'])['Ticket'].transform('count')
    frame['Group'] = ~((frame['Family_size'] == 1) & (frame['dup_count'] == 1))
    frame['Group_count'] = frame[['Family_size', 'dup_count']].max(axis=1)
    frame['Fare_ind'] = frame['Fare'] / frame['Group_count']


# Missing-value handling (Age, Embarked).
# Values are assigned back instead of using `inplace=True` on a column
# selection: that form is chained assignment and does not update the frame
# when pandas Copy-on-Write is active.
age_mean = df_train['Age'].mean()  # the test set is imputed with the training mean
df_train['Age'] = df_train['Age'].fillna(age_mean)
df_test['Age'] = df_test['Age'].fillna(age_mean)
df_train['Embarked'] = df_train['Embarked'].fillna('C')
df_test['Embarked'] = df_test['Embarked'].fillna('C')

# Cabin: keep only the first letter; missing cabins become the category "N".
# The test frame gets the same column so both frames share one feature set.
df_train['Cabin_alpha'] = _cabin_deck(df_train['Cabin'])
df_test['Cabin_alpha'] = _cabin_deck(df_test['Cabin'])

# Group features: the fare is divided by the larger of family size and
# shared-ticket count to obtain a per-person fare.
_add_group_features(df_train)

# Fare has to be imputed in the test set before the per-person fare is derived.
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].mean())
_add_group_features(df_test)

# Categorical Feature Encoding
# Encoders are fitted on the training data only and then applied to both
# frames, so a given code means the same category in train and test.
le = preprocessing.LabelEncoder()
le2 = preprocessing.LabelEncoder()
le.fit(df_train['Embarked'])
le2.fit(df_train['Cabin_alpha'])
df_train['Embarked'] = le.transform(df_train['Embarked'])
df_train['Cabin_alpha'] = le2.transform(df_train['Cabin_alpha'])
df_test['Embarked'] = le.transform(df_test['Embarked'])
# LabelEncoder.transform raises on labels it has not seen; cabin letters that
# never occur in the training data are treated as unknown ("N").
# NOTE(review): assumes "N" is among the fitted classes, i.e. the training
# data has at least one missing Cabin - confirm.
unseen_cabin = ~df_test['Cabin_alpha'].isin(le2.classes_)
df_test.loc[unseen_cabin, 'Cabin_alpha'] = 'N'
df_test['Cabin_alpha'] = le2.transform(df_test['Cabin_alpha'])
df_train['Sex'] = (df_train['Sex'] == 'male').astype('int64')
df_test['Sex'] = (df_test['Sex'] == 'male').astype('int64')

In [51]:
# Pairwise correlation matrix of the numeric and boolean columns.
# The text columns (Name, Ticket, Cabin) are excluded explicitly: pandas >= 2.0
# no longer drops non-numeric columns silently in DataFrame.corr() and raises instead.
df_train.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_alpha,Family_size,dup_count,Group,Group_count,Fare_ind
PassengerId,1.0,-0.005007,-0.035144,0.042939,0.033207,-0.057527,-0.001652,0.012658,0.013078,-0.03308,-0.040143,0.003355,-0.027972,-0.017532,0.000257
Survived,-0.005007,1.0,-0.338481,-0.543351,-0.069809,-0.035322,0.081629,0.257307,-0.174199,-0.295113,0.016639,0.038247,0.241717,0.061099,0.237628
Pclass,-0.035144,-0.338481,1.0,0.1319,-0.331339,0.083081,0.018443,-0.5495,0.170334,0.742093,0.065997,-0.002633,-0.218834,0.018783,-0.661215
Sex,0.042939,-0.543351,0.1319,1.0,0.084153,-0.114631,-0.245489,-0.182333,0.115513,0.118635,-0.200988,-0.154748,-0.31886,-0.19149,-0.103677
Age,0.033207,-0.069809,-0.331339,0.084153,1.0,-0.232625,-0.179191,0.091566,-0.035479,-0.249098,-0.248512,-0.221281,-0.163989,-0.231035,0.224387
SibSp,-0.057527,-0.035322,0.083081,-0.114631,-0.232625,1.0,0.414838,0.159651,0.070653,0.041058,0.890712,0.661622,0.492459,0.809574,-0.083311
Parch,-0.001652,0.081629,0.018443,-0.245489,-0.179191,0.414838,1.0,0.216225,0.042325,-0.031553,0.783111,0.593076,0.491554,0.701479,-0.039441
Fare,0.012658,0.257307,-0.5495,-0.182333,0.091566,0.159651,0.216225,1.0,-0.229304,-0.525742,0.217138,0.345541,0.404686,0.334278,0.842068
Embarked,0.013078,-0.174199,0.170334,0.115513,-0.035479,0.070653,0.042325,-0.229304,1.0,0.21225,0.069434,0.048359,-0.102461,0.051963,-0.265487
Cabin_alpha,-0.03308,-0.295113,0.742093,0.118635,-0.249098,0.041058,-0.031553,-0.525742,0.21225,1.0,0.012298,-0.034495,-0.192767,-0.015821,-0.598094


In [53]:
# Summary statistics of the training feature matrix.
# NOTE: train_X is assigned in a later cell of this notebook, so that cell must have been run first.
train_X.describe()

Unnamed: 0,Pclass,Sex,Age,Embarked,Cabin_alpha,Group_count,Fare_ind
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,2.308642,0.647587,29.699118,1.531987,5.946128,2.103255,15.463672
std,0.836071,0.47799,13.002015,0.794531,2.062347,1.68886,18.187769
min,1.0,0.0,0.42,0.0,0.0,1.0,0.0
25%,2.0,0.0,22.0,1.0,7.0,1.0,7.2396
50%,3.0,1.0,29.699118,2.0,7.0,1.0,8.05
75%,3.0,1.0,35.0,2.0,7.0,3.0,14.775
max,3.0,1.0,80.0,2.0,8.0,11.0,221.7792


In [58]:
# Remove the columns that are not used by the model.
excluded_columns = ['PassengerId', 'Survived', 'Name', 'Cabin', 'Ticket', 'Fare',
                    'Family_size', 'dup_count', 'SibSp', 'Parch']
train_X = df_train.drop(excluded_columns, axis=1)

# The test matrix must have exactly the training columns, in the same order,
# otherwise the fitted model cannot predict on it.  It is therefore derived
# from train_X.columns instead of from a second, hand-maintained drop list.
missing_in_test = [col for col in train_X.columns if col not in df_test.columns]
if missing_in_test:
    import warnings
    # reindex() below creates all-NaN columns for these; surface that loudly.
    warnings.warn("df_test is missing feature columns: %s" % missing_in_test)
test_X = df_test.reindex(columns=train_X.columns)
train_y = df_train.Survived

In [59]:
# Randomized hyperparameter search for a random forest with 5-fold cross-validation.
# 100 candidates are sampled from the 216 combinations of the grid below.
rf = RandomForestClassifier(random_state=0)
parameters = dict(
    max_depth=[80, 90, 100, 110],
    max_features=[2, 3],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[8, 10, 12],
    n_estimators=[100, 200, 300],
)
clf = RandomizedSearchCV(
    estimator=rf,
    param_distributions=parameters,
    n_iter=100,
    cv=5,
    random_state=0,
)
clf.fit(train_X, train_y)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [60]:
# Parameter combination that achieved the best cross-validated score in the search.
clf.best_params_

{'n_estimators': 300,
 'min_samples_split': 8,
 'min_samples_leaf': 3,
 'max_features': 3,
 'max_depth': 90}

In [61]:
# Mean cross-validated score of the best parameter combination.
clf.best_score_

0.835016835016835

In [62]:
# Display the training feature matrix that was passed to the search.
train_X

Unnamed: 0,Pclass,Sex,Age,Embarked,Cabin_alpha,Group,Group_count,Fare_ind
0,3,1,22.000000,2,7,True,2,3.625000
1,1,0,38.000000,0,2,True,2,35.641650
2,3,0,26.000000,2,7,False,1,7.925000
3,1,0,35.000000,2,2,True,2,26.550000
4,3,1,35.000000,2,7,False,1,8.050000
5,3,1,29.699118,1,7,False,1,8.458300
6,1,1,54.000000,2,4,False,1,51.862500
7,3,1,2.000000,2,7,True,5,4.215000
8,3,0,27.000000,2,7,True,3,3.711100
9,2,0,14.000000,0,7,True,2,15.035400
