In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [2]:
df_train = pd.read_csv('train.csv', header=0)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB


In [4]:
df_train['Embarked'].value_counts()

S    644
C    168
Q     77
dtype: int64

In [5]:
df_train[df_train['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38,0,0,113572,80,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62,0,0,113572,80,B28,


## fill 'Embarked' NaN values with the most common value from the rest of the data

In [6]:
df_train['Embarked'].mode().values[0]

'S'

In [7]:
df_train['Embarked'].fillna(value=df_train['Embarked'].mode().values[0], inplace=True)
df_train[df_train['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


### make `Sex` and `Embarked` numerical values...

In [8]:
df_train['Sex'].value_counts()

male      577
female    314
dtype: int64

In [9]:
df_train['Gender'] = df_train['Sex'].map({'male': 1, 'female': 0})
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Gender
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S,1


In [10]:
df_train['Gender'].value_counts()

1    577
0    314
dtype: int64

In [11]:
df_train['Embarked'].value_counts()

S    646
C    168
Q     77
dtype: int64

In [12]:
df_train['Embarked_numeric'] = df_train['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})
df_train['Embarked_numeric'].value_counts()

2    646
0    168
1     77
dtype: int64

### fill missing `age` values with median `age`

In [13]:
df_train[df_train['Age'].isnull()].size

2478

In [14]:
df_train[df_train['Age'].notnull()].size

9996

In [15]:
median_age = df_train[df_train['Age'].notnull()]['Age'].median()
median_age

28.0

In [16]:
# set (row, column) for all Age.isnull() values to median_age
df_train.fillna(median_age, inplace=True)
# df_train.loc[df_train['Age'].isnull(), 'Age'] = median_age
df_train[df_train['Age'].isnull()].size

0

In [17]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 14 columns):
PassengerId         891 non-null int64
Survived            891 non-null int64
Pclass              891 non-null int64
Name                891 non-null object
Sex                 891 non-null object
Age                 891 non-null float64
SibSp               891 non-null int64
Parch               891 non-null int64
Ticket              891 non-null object
Fare                891 non-null float64
Cabin               891 non-null object
Embarked            891 non-null object
Gender              891 non-null int64
Embarked_numeric    891 non-null int64
dtypes: float64(2), int64(7), object(5)
memory usage: 104.4+ KB


### drop columns that are still incomplete or won't be used...

In [18]:
df_train.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId', 'Embarked'], axis=1, inplace=True)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived            891 non-null int64
Pclass              891 non-null int64
Age                 891 non-null float64
SibSp               891 non-null int64
Parch               891 non-null int64
Fare                891 non-null float64
Gender              891 non-null int64
Embarked_numeric    891 non-null int64
dtypes: float64(2), int64(6)
memory usage: 62.6 KB


## munging test data in same way

In [19]:
df_test = pd.read_csv('test.csv', header=0)

# map Sex male/female to 1/0
df_test['Gender'] = df_test['Sex'].map({'male': 1, 'female': 0})

# fillna values with most common Embarked value
df_test['Embarked'].fillna(value=df_test['Embarked'].mode().values[0], inplace=True)

# map Embarked C/Q/S to 0/1/2
df_test['Embarked_numeric'] = df_test['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

# fill missing age values 
df_test['Age'].fillna(df_test['Age'].median(), inplace=True)
df_test = df_test[pd.notnull(df_test)]

# fill missing fare values with median fare of passenger class
pclass_median_fare_dict = {}
for pclass in df_test['Pclass'].unique():
    pclass_median_fare_dict[pclass] = df_test[df_test['Pclass'] == pclass]['Fare'].median()

print(pclass_median_fare_dict)

for i in np.where(df_test['Fare'].isnull())[0]:
    pclass = df_test.loc[i, 'Pclass']
    df_test.loc[i, ['Fare']] = pclass_median_fare_dict[pclass]

test_passenger_ids = df_test['PassengerId'].values

df_test.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId', 'Embarked'], axis=1, inplace=True)
df_test.info()

{1: 60.0, 2: 15.75, 3: 7.8958}
<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass              418 non-null int64
Age                 418 non-null float64
SibSp               418 non-null int64
Parch               418 non-null int64
Fare                418 non-null float64
Gender              418 non-null int64
Embarked_numeric    418 non-null int64
dtypes: float64(2), int64(5)
memory usage: 26.1 KB


In [20]:
df_test.iloc[152]

Pclass               3.0000
Age                 60.5000
SibSp                0.0000
Parch                0.0000
Fare                 7.8958
Gender               1.0000
Embarked_numeric     2.0000
Name: 152, dtype: float64

In [21]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived            891 non-null int64
Pclass              891 non-null int64
Age                 891 non-null float64
SibSp               891 non-null int64
Parch               891 non-null int64
Fare                891 non-null float64
Gender              891 non-null int64
Embarked_numeric    891 non-null int64
dtypes: float64(2), int64(6)
memory usage: 62.6 KB


In [22]:
df_test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Gender,Embarked_numeric
0,3,34.5,0,0,7.8292,1,1
1,3,47.0,1,0,7.0,0,2
2,2,62.0,0,0,9.6875,1,1
3,3,27.0,0,0,8.6625,1,2
4,3,22.0,1,1,12.2875,0,2


In [23]:
df_test.tail()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Gender,Embarked_numeric
413,3,27.0,0,0,8.05,1,2
414,1,39.0,0,0,108.9,0,0
415,3,38.5,0,0,7.25,1,2
416,3,27.0,0,0,8.05,1,2
417,3,27.0,1,1,22.3583,1,0


In [24]:
df_train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Gender,Embarked_numeric
0,0,3,22,1,0,7.25,1,2
1,1,1,38,1,0,71.2833,0,0
2,1,3,26,0,0,7.925,0,2
3,1,1,35,1,0,53.1,0,2
4,0,3,35,0,0,8.05,1,2


In [25]:
df_train.values

array([[  0.    ,   3.    ,  22.    , ...,   7.25  ,   1.    ,   2.    ],
       [  1.    ,   1.    ,  38.    , ...,  71.2833,   0.    ,   0.    ],
       [  1.    ,   3.    ,  26.    , ...,   7.925 ,   0.    ,   2.    ],
       ..., 
       [  0.    ,   3.    ,  28.    , ...,  23.45  ,   0.    ,   2.    ],
       [  1.    ,   1.    ,  26.    , ...,  30.    ,   1.    ,   0.    ],
       [  0.    ,   3.    ,  32.    , ...,   7.75  ,   1.    ,   1.    ]])

In [26]:
train_data = df_train.values
test_data = df_test.values

In [27]:
train_data

array([[  0.    ,   3.    ,  22.    , ...,   7.25  ,   1.    ,   2.    ],
       [  1.    ,   1.    ,  38.    , ...,  71.2833,   0.    ,   0.    ],
       [  1.    ,   3.    ,  26.    , ...,   7.925 ,   0.    ,   2.    ],
       ..., 
       [  0.    ,   3.    ,  28.    , ...,  23.45  ,   0.    ,   2.    ],
       [  1.    ,   1.    ,  26.    , ...,  30.    ,   1.    ,   0.    ],
       [  0.    ,   3.    ,  32.    , ...,   7.75  ,   1.    ,   1.    ]])

In [28]:
train_data[0::,1::]

array([[  3.    ,  22.    ,   1.    , ...,   7.25  ,   1.    ,   2.    ],
       [  1.    ,  38.    ,   1.    , ...,  71.2833,   0.    ,   0.    ],
       [  3.    ,  26.    ,   0.    , ...,   7.925 ,   0.    ,   2.    ],
       ..., 
       [  3.    ,  28.    ,   1.    , ...,  23.45  ,   0.    ,   2.    ],
       [  1.    ,  26.    ,   0.    , ...,  30.    ,   1.    ,   0.    ],
       [  3.    ,  32.    ,   0.    , ...,   7.75  ,   1.    ,   1.    ]])

In [29]:
train_data[0::,0]

array([ 0.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  0.,
        0.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,  1.,  1.,  0.,  1.,
        0.,  0.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,
        1.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        1.,  1.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,
        1.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
        1.,  1.,  0.,  1.,  1.,  0.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  1.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  1.,
        0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0

In [30]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=1000)
forest = forest.fit(train_data[0::,1::], train_data[0::,0])

In [31]:
output = forest.predict(test_data).astype(int)
output

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0,

In [32]:
test_passenger_ids.shape

(418,)

In [33]:
output.shape

(418,)

In [34]:
with open('randomforest_sklearn.csv', 'w') as f:
    f.write('PassengerId,Survived\n')
    for pid, status in zip(test_passenger_ids, output):
        f.write(str(pid)+','+str(status)+'\n')

In [35]:
!head genderbasedmodel.csv
!echo
!head genderclassmodel.csv
!echo
!head randomforest_sklearn.csv

PassengerID,Survived
892,0
893,1
894,0
895,0
896,1
897,0
898,1
899,0
900,1

PassengerId,Survived
892,0
893,1
894,0
895,0
896,1
897,0
898,1
899,0
900,1

PassengerId,Survived
892,0
893,0
894,0
895,1
896,0
897,0
898,0
899,0
900,1
