In [28]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [29]:
# Read the three competition CSVs (training set, test set, sample submission) in one pass.
train, test, sub = map(
    pd.read_csv,
    ['./dataset/train.csv', './dataset/test.csv', './dataset/gender_submission.csv'],
)

# (rows, columns) of each frame, in load order.
tuple(frame.shape for frame in (train, test, sub))

((891, 12), (418, 11), (418, 2))

In [30]:
# Preview the first rows of the training frame as loaded from CSV.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [31]:
# Column dtypes and non-null counts for the training frame (shows which columns have gaps).
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [32]:
# Column dtypes and non-null counts for the test frame (shows which columns have gaps).
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [33]:
# Impute missing values and integer-encode the categorical features.
#
# All statistics and encoders are learned from `train` only and then applied
# to `test`, so both frames go through exactly the same transformation and a
# given category always maps to the same integer code in both.
#
# Results are assigned back to the column instead of using
# `df[col].fillna(..., inplace=True)`: that form is chained assignment, which
# warns on recent pandas and does not modify the frame under Copy-on-Write.

# Age: fill gaps with the training mean.
age_mean = train['Age'].mean()
train['Age'] = train['Age'].fillna(age_mean)
test['Age'] = test['Age'].fillna(age_mean)

# Embarked: fill gaps with the most frequent training port.
# (Passing the string 'ffill' as the fill value does not forward-fill; it
# inserts the literal text 'ffill' as an extra category.)
embarked_mode = train['Embarked'].mode()[0]
train['Embarked'] = train['Embarked'].fillna(embarked_mode)
test['Embarked'] = test['Embarked'].fillna(embarked_mode)

# Sex / Embarked: one encoder per column, fitted on train and reused for test.
# transform() raises on a category unseen in train rather than silently
# assigning it a mismatched code.
for col in ['Sex', 'Embarked']:
    encoder = LabelEncoder().fit(train[col])
    train[col] = encoder.transform(train[col])
    test[col] = encoder.transform(test[col])

In [34]:
# Remaining missing values per column: train first, then test, separated by a blank line.
print(train.isna().sum(), test.isna().sum(), sep='\n\n')

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [35]:
# Training frame again after the imputation/encoding step: Sex and Embarked are now integer codes.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,2


In [36]:
# Test frame after the imputation/encoding step: Sex and Embarked are now integer codes.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,,2
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,1
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,2
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,2


In [37]:
# Feature columns fed to the model.
sel = ['Pclass', 'SibSp', 'Parch', 'Age', 'Sex', 'Embarked']

# Design matrix / target from the training frame; X_sub is the matrix to predict for submission.
X, y = train[sel], train['Survived']
X_sub = test[sel]

# Hold-out split (default test fraction), stratified on the target so both
# parts keep the same survival ratio; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((668, 6), (668,), (223, 6), (223,))

In [38]:
# Sweep max_depth over 1..10 and record three accuracies per depth:
# training split, hold-out split, and mean 5-fold cross-validation.
depth_list = [*range(1, 11)]
train_score, test_score, cv_score = [], [], []

# One shuffled, seeded splitter shared by every depth so the CV numbers are comparable.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

for d in depth_list:
    model = RandomForestClassifier(random_state=0, n_jobs=-1, max_depth=d).fit(X_train, y_train)

    train_score += [model.score(X_train, y_train)]
    test_score += [model.score(X_test, y_test)]

    # Note: cross-validation is run on the full X / y, not only on the training split.
    fold_scores = cross_val_score(model, X, y, cv=kfold)
    cv_score += [fold_scores.mean()]

In [39]:
# Collect the sweep results into one table, one row per depth.
data = dict(depth=depth_list, train=train_score, test=test_score, cv_score=cv_score)
df = pd.DataFrame(data)
df

Unnamed: 0,depth,train,test,cv_score
0,1,0.797904,0.789238,0.787854
1,2,0.80988,0.793722,0.791225
2,3,0.830838,0.798206,0.810313
3,4,0.841317,0.802691,0.815937
4,5,0.851796,0.802691,0.815937
5,6,0.865269,0.811659,0.818185
6,7,0.890719,0.816143,0.818178
7,8,0.908683,0.816143,0.810313
8,9,0.917665,0.820628,0.815931
9,10,0.928144,0.820628,0.815925


In [40]:
# Keep the row(s) whose cross-validation score equals the best one observed.
best_cv = df['cv_score'].max()
max_cv = df[df['cv_score'].eq(best_cv)]
max_cv

Unnamed: 0,depth,train,test,cv_score
5,6,0.865269,0.811659,0.818185


In [41]:
# Depth of the first best-CV row, read from the 'depth' column by name.
depth = max_cv['depth'].iloc[0]
depth

6

In [42]:
# Refit a forest at the selected depth on the training split; the fitted estimator is the cell output.
model = RandomForestClassifier(random_state=0, n_jobs=-1, max_depth=depth).fit(X_train, y_train)
model

RandomForestClassifier(max_depth=6, n_jobs=-1, random_state=0)

In [43]:
# Predict survival for the submission features and write them into the sample-submission frame.
pred = model.predict(X_sub)
sub = sub.assign(Survived=pred)

# index=False: keep the row index out of the CSV.
sub.to_csv('submission.csv', index=False)