In [3]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm

In [4]:
# Load the Titanic train/test sets and the sample submission,
# each indexed by its PassengerId column.
csv_options = {'index_col': 'PassengerId'}
train = pd.read_csv('../data/titanic/train.csv', **csv_options)
test = pd.read_csv('../data/titanic/test.csv', **csv_options)
submission = pd.read_csv('../data/titanic/gender_submission.csv', **csv_options)

# Sanity check: (rows, columns) of every frame on one line.
print(*(frame.shape for frame in (train, test, submission)))

(891, 11) (418, 10) (418, 1)


---
### Cabin
---

In [5]:
# Keep only the first character of each cabin code (e.g. 'C85' -> 'C'),
# dropping the numeric part. Both frames are updated in place.
train_test_data = [train, test]
for frame in train_test_data:
    frame['Cabin'] = frame['Cabin'].str.slice(stop=1)

In [6]:
# Encode the cabin letter as a number for the classifiers.
# Feature scaling: preprocessing of the raw data that normalises the size and
# range of the features, which is why fractional steps are used here.
# Caution: when feature ranges differ a lot, distant values can end up being
# weighted as more important.
cabin_mapping = {
    'A': 0,
    'B': 0.4,
    'C': 0.8,
    'D': 1.2,
    'E': 1.6,
    'F': 2,
    'G': 2.4,
    'T': 2.8,
}

# Letters that are not keys of cabin_mapping (and missing values) become NaN.
for frame in train_test_data:
    frame['Cabin'] = frame['Cabin'].map(cabin_mapping)

In [7]:
# Missing Cabin values are closely tied to the passenger class (1st/2nd/3rd),
# so each gap is filled with the median encoded cabin value of its Pclass group.
#
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on train['Cabin']: that form is a chained in-place
# update on a column selection, which pandas 2.x deprecates and which leaves
# the frame unchanged once Copy-on-Write is active.
train['Cabin'] = train['Cabin'].fillna(
    train.groupby('Pclass')['Cabin'].transform('median')
)
test['Cabin'] = test['Cabin'].fillna(
    test.groupby('Pclass')['Cabin'].transform('median')
)

# Remaining null counts per column, for both frames.
train.isnull().sum(), test.isnull().sum()

(Survived      0
 Pclass        0
 Name          0
 Sex           0
 Age         177
 SibSp         0
 Parch         0
 Ticket        0
 Fare          0
 Cabin         0
 Embarked      2
 dtype: int64,
 Pclass       0
 Name         0
 Sex          0
 Age         86
 SibSp        0
 Parch        0
 Ticket       0
 Fare         1
 Cabin        0
 Embarked     0
 dtype: int64)

---
### 성별
---

In [8]:
# Encode Sex in both frames: 'male' -> 0, 'female' -> 1.
sex_codes = {'male': 0, 'female': 1}
for frame in (train, test):
    for label, code in sex_codes.items():
        frame.loc[frame['Sex'] == label, 'Sex'] = code

---
### Pclass
---

In [9]:
# One boolean indicator column per passenger class.
# The columns are created in the order 3, 2, 1 so the feature layout
# (and therefore the column order seen by the models) stays the same.
for frame in (train, test):
    for pclass in (3, 2, 1):
        frame['Pclass_{}'.format(pclass)] = frame['Pclass'].eq(pclass)

In [10]:
# Remove the raw Pclass column from both frames.
train = train.drop('Pclass', axis=1)
test = test.drop('Pclass', axis=1)

---
### Fare
---

In [11]:
# Replace missing Fare values in the test set with 0.
test['Fare'] = test['Fare'].fillna(0)

---
### Age
---

---
### FamilySize
---

In [12]:
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger).
for frame in (train, test):
    frame['FamilySize'] = frame['SibSp'].add(frame['Parch']).add(1)

---
### Single
---

In [13]:
# Bucket FamilySize into three boolean flags:
#   Single  -> exactly 1
#   Nuclear -> 2 to 4 (both ends included)
#   Big     -> 5 or more
for frame in (train, test):
    family_size = frame['FamilySize']
    frame['Single'] = family_size.eq(1)
    frame['Nuclear'] = family_size.between(2, 4)
    frame['Big'] = family_size.ge(5)

In [14]:
# Columns removed from both frames before modelling.
unused_columns = ['Single', 'Big', 'SibSp', 'Parch', 'FamilySize', 'Cabin', 'Age']
train = train.drop(unused_columns, axis=1)
test = test.drop(unused_columns, axis=1)

In [15]:
# One boolean indicator column per Embarked value (C, S, Q, in that order).
# Rows with a missing Embarked value are False in all three columns.
for frame in (train, test):
    for port in ('C', 'S', 'Q'):
        frame['Embarked' + port] = frame['Embarked'].eq(port)

# The raw Embarked column is removed from both frames.
train = train.drop('Embarked', axis=1)
test = test.drop('Embarked', axis=1)

In [16]:
def _extract_title(names):
    """Return the title part of names shaped like 'Surname, Title. Given names'.

    The text after the first ', ' is cut at the first literal '.'.
    A single-character pattern is used on purpose: pandas treats a
    multi-character pattern such as '. ' as a regular expression, where '.'
    matches any character, so a title containing a space
    (e.g. 'the Countess') would be cut at its first space instead.

    Parameters
    ----------
    names : pandas.Series of str

    Returns
    -------
    pandas.Series
        The title for each name, with surrounding whitespace removed.
    """
    after_comma = names.str.split(', ').str[1]
    return after_comma.str.split('.', n=1).str[0].str.strip()


# Replace the full name with just the title in both frames.
train['Name'] = _extract_title(train['Name'])
test['Name'] = _extract_title(test['Name'])

In [17]:
# Boolean flag: True where the Name column equals 'Master'.
for frame in (train, test):
    frame['Master'] = frame['Name'].eq('Master')

# Name and Ticket are removed from both frames.
train = train.drop(['Name', 'Ticket'], axis=1)
test = test.drop(['Name', 'Ticket'], axis=1)

In [18]:
# Preview the first five rows of the engineered training frame.
train.head(n=5)

Unnamed: 0_level_0,Survived,Sex,Fare,Pclass_3,Pclass_2,Pclass_1,Nuclear,EmbarkedC,EmbarkedS,EmbarkedQ,Master
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,0,7.25,True,False,False,True,False,True,False,False
2,1,1,71.2833,False,False,True,True,True,False,False,False
3,1,1,7.925,True,False,False,False,False,True,False,False
4,1,1,53.1,False,False,True,True,False,True,False,False
5,0,0,8.05,True,False,False,False,False,True,False,False


In [19]:
# Split the training frame into the label column and the feature matrix.
target = train['Survived']
train_data = train.drop(columns=['Survived'])

# Shapes of the features and the labels.
train_data.shape, target.shape

((891, 10), (891,))

---
### 모델링
---

In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import numpy as np
import xgboost as xgb 
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

In [21]:
# for depth in range(1,100):
# rf_clf = RandomForestClassifier()
# rf_clf.fit(train_data , target)
# predictions = rf_clf.predict(test)

In [22]:
# Baseline decision tree with default hyper-parameters.
# random_state is pinned so that the fitted tree, its predictions and any
# score computed from them are reproducible from run to run; the value itself
# is arbitrary. Without it, ties between equally good splits are broken
# randomly and the result can change on every execution.
model = DecisionTreeClassifier(random_state=1)
model.fit(train_data, target)
predictions = model.predict(test)

In [24]:
from sklearn.metrics import accuracy_score

# Reference labels that the predictions are scored against.
# They are indexed by PassengerId and reordered to follow test.index, so each
# prediction is compared with the label of the same passenger rather than with
# whatever happens to sit on the same row of the CSV. A PassengerId that is
# missing from the file raises a KeyError instead of silently misaligning.
submission1 = pd.read_csv('../data/submission_a.csv', index_col='PassengerId')
submission1 = submission1.loc[test.index]

# accuracy_score expects (y_true, y_pred) in that order.
accuracy = accuracy_score(submission1, predictions)
accuracy

0.7822966507177034

In [None]:
# Single-configuration decision tree run, kept in loop form: r is only printed
# here, but it can be passed to e.g. min_samples_split or max_features to
# sweep one hyper-parameter.
acc_list = []

for r in range(1):
    tree_params = dict(max_depth=11, min_samples_leaf=1, random_state=1)
    clf = DecisionTreeClassifier(**tree_params)
    clf.fit(train_data, target)   # train
    Y_pred = clf.predict(test)    # predict on the test data

    accuracy = accuracy_score(Y_pred, submission1)
    print(r, accuracy)
    acc_list.append(accuracy)

# (position, value) of the highest accuracy seen.
best_accuracy = max(acc_list)
acc_list.index(best_accuracy), best_accuracy

0 0.7990430622009569


(0, 0.7990430622009569)

In [52]:
# Single-configuration random forest run, kept in loop form: r is only printed
# here, but it can be passed to e.g. min_samples_split to sweep one
# hyper-parameter.
acc_list = []

for r in range(1):
    forest_params = dict(
        n_estimators=370,
        max_depth=4,
        min_samples_leaf=5,
        random_state=352,
        n_jobs=-1,
    )
    clf = RandomForestClassifier(**forest_params)
    clf.fit(train_data, target)   # train
    Y_pred = clf.predict(test)    # predict on the test data

    accuracy = accuracy_score(Y_pred, submission1)
    print(r, accuracy)
    acc_list.append(accuracy)

# (position, value) of the highest accuracy seen.
best_accuracy = max(acc_list)
acc_list.index(best_accuracy), best_accuracy

0 0.80622009569378


(0, 0.80622009569378)