# Imports

In [1]:
import pandas as pd
import numpy as np

raw_train = pd.read_csv("train.csv")
raw_test = pd.read_csv("test.csv")

raw_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Exploratory Data Analysis

In [32]:
print('Pclass')
print('Range: ' + str(raw_test['Pclass'].min()) + '-' + str(raw_test['Pclass'].max()))
print('Missing Values: ' + str(raw_test['Pclass'].size - raw_test['Pclass'].count()))
print()
print('Name')
print('Missing Values: ' + str(raw_test['Embarked'].size - raw_test['Embarked'].count()))
print()
print('Sex')
print('Missing Values: ' + str(raw_test['Sex'].size - raw_test['Sex'].count()))
print()
print('Age')
print('Range: ' + str(raw_test['Age'].min()) + '-' + str(raw_test['Age'].max()))
print('Missing Values: ' + str(raw_test['Age'].size - raw_test['Age'].count()))
print()
print('SibSp')
print('Range: ' + str(raw_test['SibSp'].min()) + '-' + str(raw_test['SibSp'].max()))
print('Missing Values: ' + str(raw_test['SibSp'].size - raw_test['SibSp'].count()))
print()
print('Parch')
print('Range: ' + str(raw_test['Parch'].min()) + '-' + str(raw_test['Parch'].max()))
print('Missing Values: ' + str(raw_test['Parch'].size - raw_test['Parch'].count()))
print()
print('Fare')
print('Range: ' + str(raw_test['Fare'].min()) + '-' + str(raw_test['Fare'].max()))
print('Missing Values: ' + str(raw_test['Fare'].size - raw_test['Fare'].count()))
print()
print('Embarked')
print('Missing Values: ' + str(raw_test['Embarked'].size - raw_test['Embarked'].count()))

Pclass
Range: 1-3
Missing Values: 0

Name
Missing Values: 0

Sex
Missing Values: 0

Age
Range: 0.17-76.0
Missing Values: 86

SibSp
Range: 0-8
Missing Values: 0

Parch
Range: 0-9
Missing Values: 0

Fare
Range: 0.0-512.3292
Missing Values: 1

Embarked
Missing Values: 0


In [3]:
age = raw_train['Age'].floordiv(10)
age.index = age
print(age.mean())
age.groupby(level=0).count()

2.5252100840336134


Unnamed: 0_level_0,Age
Age,Unnamed: 1_level_1
0.0,62
1.0,102
2.0,220
3.0,167
4.0,89
5.0,48
6.0,19
7.0,6
8.0,1


In [33]:
fare = raw_train['Fare'].floordiv(50)
fare.index = fare
print(fare.mean())
fare.groupby(level=0).count()

0.3221099887766554


Unnamed: 0_level_0,Fare
Fare,Unnamed: 1_level_1
0.0,730
1.0,108
2.0,24
3.0,9
4.0,11
5.0,6
10.0,3


# Preprocessing

In [19]:
def transform(data,age_bin = 10,fare_bin = 50):
  titles = ["Mr", "Miss", "Mrs", "Master"]
  title = data['Name'].str.extract(r"([A-Za-z]+)\.", expand=False)
  title.replace(["Ms", "Mlle", "Mme"], "Miss", inplace=True)
  title.replace(["Lady"], "Mrs", inplace=True)
  title.replace(["Sir", "Rev"], "Mr", inplace=True)
  title[~title.isin(titles)] = "Others"
  title = title.map({'Mr':0,'Miss':1,'Mrs':2,'Master':3,'Others':4})
  sex = data['Sex'].map({'male':0,'female':1,np.nan:1})
  age = data['Age'].floordiv(age_bin,fill_value=data['Age'].mean())
  family = data['SibSp'] + data['Parch']
  alone = family == 0
  alone = alone.map({False:0,True:1})
  fare = data['Fare'].floordiv(fare_bin,fill_value=0)
  embarked = data['Embarked'].map({'S':0,'C':1,'Q':2})
  cleaned = pd.DataFrame({'Pclass':data['Pclass'],'Title':title,'Sex':sex,'Age':age,'Alone':alone,'Fare':fare,'Embarked':embarked})
  return cleaned

transform(raw_train)

Unnamed: 0,Pclass,Title,Sex,Age,Alone,Fare,Embarked
0,3,0,0,2.0,0,0.0,0.0
1,1,2,1,3.0,0,1.0,1.0
2,3,1,1,2.0,1,0.0,0.0
3,1,2,1,3.0,0,1.0,0.0
4,3,0,0,3.0,1,0.0,0.0
...,...,...,...,...,...,...,...
886,2,0,0,2.0,1,0.0,0.0
887,1,1,1,1.0,1,0.0,0.0
888,3,1,1,2.0,0,0.0,0.0
889,1,0,0,2.0,1,0.0,1.0


# Training

In [5]:
def entropy(data):
  p1 = data['Survived'].mean();
  p0 = 1 - p1
  if p0 == 0 or p1 == 0: return 0
  return -1 * (p0*np.log2(p0) + p1*np.log2(p1))

entropy(raw_train)

np.float64(0.9607079018756469)

In [20]:
def train(data,labels,max_depth,min_improv=0,random=False):
  if(max_depth > 0):
    H = entropy(data)
    if(H > 0):
      if not random:
        min = 2
        for l in labels:
          groupby = data.groupby(by=l)
          e = 0
          for value, group in groupby:
            e += entropy(group) * len(group)/len(data)
          if e < min:
            min = e
            label = l
            g = groupby
        if min < H - min_improv:
          labels.remove(label)
          next = dict();
          for value, group in g:
            next.update({value:train(group,labels,max_depth-1)})
          return {'label':label,'entropy':H,'next':next}
      else:
        i = np.random.randint(len(labels))
        l = labels[i]
        groupby = data.groupby(by=l)
        labels.remove(l)
        next = dict();
        for value, group in groupby:
          next.update({value:train(group,labels,max_depth-1)})
        return {'label':l,'entropy':H,'next':next}
  return {'label':'End','entropy':entropy(data),'class': 0 if data['Survived'].mean() < .5 else 1}

In [21]:
data = transform(raw_train)
data['Survived'] = raw_train['Survived']
labels = ['Pclass','Title','Sex','Age','Alone','Fare','Embarked']
model = train(data,labels,len(labels))
model

{'label': 'Title',
 'entropy': np.float64(0.9607079018756469),
 'next': {0: {'label': 'Pclass',
   'entropy': np.float64(0.6258424755265808),
   'next': {1: {'label': 'Age',
     'entropy': np.float64(0.9357110512652843),
     'next': {1.0: {'label': 'Fare',
       'entropy': np.float64(0.8112781244591328),
       'next': {1.0: {'label': 'End', 'entropy': 0, 'class': 0},
        2.0: {'label': 'End', 'entropy': np.float64(1.0), 'class': 1},
        5.0: {'label': 'End', 'entropy': 0, 'class': 0}}},
      2.0: {'label': 'Alone',
       'entropy': np.float64(0.9494520153879484),
       'next': {0: {'label': 'Embarked',
         'entropy': np.float64(0.9910760598382222),
         'next': {0.0: {'label': 'End',
           'entropy': np.float64(0.9182958340544896),
           'class': 0},
          1.0: {'label': 'End', 'entropy': np.float64(1.0), 'class': 1}}},
        1: {'label': 'End',
         'entropy': np.float64(0.9293636260137187),
         'class': 0}}},
      3.0: {'label': 'Sex'

In [22]:
def trainForest(data,max_depth, n_trees, frac=.5, random=True):
  models = []
  for i in range(n_trees):
    d = data.sample(frac=frac,replace=True)
    labels = ['Pclass','Title','Sex','Age','Alone','Fare','Embarked']
    models += [train(d,labels,max_depth,random=random)]
  return models

In [23]:
data = transform(raw_train)
data['Survived'] = raw_train['Survived']
labels = ['Pclass','Title','Sex','Age','Alone','Fare','Embarked']
models = trainForest(data,len(labels),10)
models

[{'label': 'Embarked',
  'entropy': np.float64(0.9766650017367726),
  'next': {0.0: {'label': 'Title',
    'entropy': np.float64(0.9461282363800092),
    'next': {0: {'label': 'Age',
      'entropy': np.float64(0.7029786560827124),
      'next': {1.0: {'label': 'End', 'entropy': 0, 'class': 0},
       2.0: {'label': 'Pclass',
        'entropy': np.float64(0.6321302781219706),
        'next': {1: {'label': 'Fare',
          'entropy': np.float64(0.8904916402194913),
          'next': {0.0: {'label': 'End',
            'entropy': np.float64(0.9910760598382222),
            'class': 0},
           1.0: {'label': 'End', 'entropy': 0, 'class': 0},
           4.0: {'label': 'End', 'entropy': 0, 'class': 0}}},
         2: {'label': 'End', 'entropy': 0, 'class': 0},
         3: {'label': 'Alone',
          'entropy': np.float64(0.6912898694057888),
          'next': {0: {'label': 'End', 'entropy': 0, 'class': 0},
           1: {'label': 'End',
            'entropy': np.float64(0.75537541256142

# Testing

In [26]:
def testForest(data,models):
  correct = 0
  for i in data.index:
    vote = 0
    for model in models:
      layer = model
      while not layer['label'] == 'End':
        value = data[layer['label']][i]
        if value in layer['next']:
          layer = layer['next'][value]
        else:
          keys = list(layer['next'].keys())
          before = keys[0]
          for k in keys:
            after = k
            if(after > value): break
            before = k
          if after - value < value - before:
            layer = layer['next'][after]
          else:
            layer = layer['next'][before]
      vote += layer['class']
    vote = (0 if vote < 5 else 1)
    if vote == data['Survived'][i]: correct += 1
  return correct / len(data)

In [34]:
testForest(data,models)

0.8013468013468014

# Evaluation

In [28]:
data = transform(raw_train)
data['Survived'] = raw_train['Survived']
training = data.sample(frac=.8)
validation = data.drop(training.index)
for i in range(1,11):
  labels = ['Pclass','Title','Sex','Age','Alone','Fare','Embarked']
  models = trainForest(training,len(labels),10,i*.1)
  print(str(i) + ': ' + str(testForest(validation,models)))

1: 0.8033707865168539
2: 0.797752808988764
3: 0.8033707865168539
4: 0.7696629213483146
5: 0.8033707865168539
6: 0.7808988764044944
7: 0.7921348314606742
8: 0.7471910112359551
9: 0.7865168539325843
10: 0.7865168539325843


In [51]:
data = transform(raw_train)
data['Survived'] = raw_train['Survived']
training = data.sample(frac=.8)
validation = data.drop(training.index)
for i in range(1,30):
  labels = ['Pclass','Title','Sex','Age','Alone','Fare','Embarked']
  models = trainForest(training,len(labels),i,)
  print(str(i) + ': ' + str(testForest(validation,models)))

1: 0.5842696629213483
2: 0.5842696629213483
3: 0.5842696629213483
4: 0.5842696629213483
5: 0.6179775280898876
6: 0.6966292134831461
7: 0.797752808988764
8: 0.8089887640449438
9: 0.8033707865168539
10: 0.8033707865168539
11: 0.797752808988764
12: 0.8089887640449438
13: 0.7865168539325843
14: 0.8089887640449438
15: 0.8089887640449438
16: 0.7640449438202247
17: 0.7752808988764045
18: 0.7640449438202247
19: 0.7640449438202247
20: 0.7808988764044944
21: 0.7752808988764045
22: 0.7696629213483146
23: 0.7584269662921348
24: 0.7696629213483146
25: 0.7584269662921348
26: 0.7696629213483146
27: 0.7584269662921348
28: 0.7640449438202247
29: 0.7808988764044944


In [52]:
data = transform(raw_train)
data['Survived'] = raw_train['Survived']
training = data.sample(frac=.8)
validation = data.drop(training.index)
for i in range(1,11):
  labels = ['Pclass','Title','Sex','Age','Alone','Fare','Embarked']
  models = trainForest(training,len(labels),10,i*.1,random=False)
  print(str(i) + ': ' + str(testForest(validation,models)))

1: 0.7921348314606742
2: 0.7865168539325843
3: 0.7921348314606742
4: 0.7921348314606742
5: 0.7808988764044944
6: 0.8033707865168539
7: 0.7865168539325843
8: 0.7921348314606742
9: 0.8033707865168539
10: 0.7921348314606742


In [56]:
data = transform(raw_train)
data['Survived'] = raw_train['Survived']
training = data.sample(frac=.8)
validation = data.drop(training.index)
for i in range(1,30):
  labels = ['Pclass','Title','Sex','Age','Alone','Fare','Embarked']
  models = trainForest(training,len(labels),i,.8,random=False)
  print(str(i) + ': ' + str(testForest(validation,models)))

1: 0.6235955056179775
2: 0.6235955056179775
3: 0.6235955056179775
4: 0.6235955056179775
5: 0.8146067415730337
6: 0.8146067415730337
7: 0.8033707865168539
8: 0.8202247191011236
9: 0.797752808988764
10: 0.7865168539325843
11: 0.7808988764044944
12: 0.7865168539325843
13: 0.7921348314606742
14: 0.8089887640449438
15: 0.7808988764044944
16: 0.7808988764044944
17: 0.7865168539325843
18: 0.7808988764044944
19: 0.7808988764044944
20: 0.7808988764044944
21: 0.7808988764044944
22: 0.7808988764044944
23: 0.7921348314606742
24: 0.7696629213483146
25: 0.7808988764044944
26: 0.7808988764044944
27: 0.7808988764044944
28: 0.7808988764044944
29: 0.7808988764044944


In [59]:
data = transform(raw_train)
data['Survived'] = raw_train['Survived']
training = data.sample(frac=.8)
validation = data.drop(training.index)
labels = ['Pclass','Title','Sex','Age','Alone','Fare','Embarked']
for i in range(len(labels)):
  labels = ['Pclass','Title','Sex','Age','Alone','Fare','Embarked']
  models = trainForest(training,i+1,10,.8,random=False)
  print(str(i+1) + ': ' + str(testForest(validation,models)))

1: 0.8370786516853933
2: 0.8370786516853933
3: 0.8370786516853933
4: 0.8202247191011236
5: 0.8146067415730337
6: 0.8202247191011236
7: 0.8146067415730337


In [60]:
for a in range(1,100,1):
  data = transform(raw_train,age_bin=a)
  data['Survived'] = raw_train['Survived']
  training = data.sample(frac=.8)
  validation = data.drop(training.index)
  labels = ['Pclass','Title','Sex','Age','Alone','Fare','Embarked']
  models = trainForest(training,len(labels),10,.8,random=False)
  print(str(a) + ': ' + str(testForest(validation,models)))

1: 0.7584269662921348
2: 0.7640449438202247
3: 0.7865168539325843
4: 0.7471910112359551
5: 0.7865168539325843
6: 0.7584269662921348
7: 0.7808988764044944
8: 0.7808988764044944
9: 0.7584269662921348
10: 0.8146067415730337
11: 0.797752808988764
12: 0.7247191011235955
13: 0.797752808988764
14: 0.7696629213483146
15: 0.8202247191011236
16: 0.7584269662921348
17: 0.797752808988764
18: 0.7865168539325843
19: 0.8370786516853933
20: 0.8258426966292135
21: 0.797752808988764
22: 0.7752808988764045
23: 0.7696629213483146
24: 0.7303370786516854
25: 0.7528089887640449
26: 0.7247191011235955
27: 0.8370786516853933
28: 0.8146067415730337
29: 0.8089887640449438
30: 0.7247191011235955
31: 0.8089887640449438
32: 0.7303370786516854
33: 0.8146067415730337
34: 0.8651685393258427
35: 0.8258426966292135
36: 0.7247191011235955
37: 0.8089887640449438
38: 0.7303370786516854
39: 0.7471910112359551
40: 0.8202247191011236
41: 0.7415730337078652
42: 0.7696629213483146
43: 0.7921348314606742
44: 0.7415730337078652
4

In [61]:
for f in range(5,500,5):
  data = transform(raw_train,age_bin=10,fare_bin=f)
  data['Survived'] = raw_train['Survived']
  training = data.sample(frac=.8)
  validation = data.drop(training.index)
  labels = ['Pclass','Title','Sex','Age','Alone','Fare','Embarked']
  models = trainForest(training,len(labels),10,.8,random=False)
  print(str(f) + ': ' + str(testForest(validation,models)))

5: 0.7078651685393258
10: 0.8426966292134831
15: 0.7752808988764045
20: 0.7640449438202247
25: 0.8033707865168539
30: 0.7865168539325843
35: 0.7865168539325843
40: 0.8033707865168539
45: 0.7752808988764045
50: 0.7752808988764045
55: 0.7752808988764045
60: 0.797752808988764
65: 0.7640449438202247
70: 0.8033707865168539
75: 0.8089887640449438
80: 0.7865168539325843
85: 0.7808988764044944
90: 0.7359550561797753
95: 0.7528089887640449
100: 0.7752808988764045
105: 0.7865168539325843
110: 0.7528089887640449
115: 0.7528089887640449
120: 0.7696629213483146
125: 0.7415730337078652
130: 0.7752808988764045
135: 0.7865168539325843
140: 0.7528089887640449
145: 0.7808988764044944
150: 0.8146067415730337
155: 0.7865168539325843
160: 0.7921348314606742
165: 0.7808988764044944
170: 0.7640449438202247
175: 0.8426966292134831
180: 0.7865168539325843
185: 0.7921348314606742
190: 0.7415730337078652
195: 0.7921348314606742
200: 0.7528089887640449
205: 0.8426966292134831
210: 0.797752808988764
215: 0.7752808

# Prediction

In [46]:
def predictForest(data,models):
  out = pd.DataFrame(columns=['PassengerId','Survived'])
  out['PassengerId'] = data['PassengerId']
  for i in data.index:
    vote = 0
    for model in models:
      layer = model
      while not layer['label'] == 'End':
        value = data[layer['label']][i]
        if value in layer['next']:
          layer = layer['next'][value]
        else:
          keys = list(layer['next'].keys())
          before = keys[0]
          for k in keys:
            after = k
            if(after > value): break
            before = k
          if after - value < value - before:
            layer = layer['next'][after]
          else:
            layer = layer['next'][before]
      vote += layer['class']
    vote = 0 if vote < 5 else 1
    out.loc[i,'Survived'] = vote
  return out

In [62]:
data = transform(raw_train,age_bin=10,fare_bin=100)
data['Survived'] = raw_train['Survived']
labels = ['Pclass','Title','Sex','Age','Alone','Fare','Embarked']
models = trainForest(data,len(labels),10,.8,random=False)

In [63]:
data = transform(raw_test,age_bin=5,fare_bin=20)
data['PassengerId'] = raw_test['PassengerId']
out = predictForest(data,models)
out.to_csv('RandomForest.csv', index = False)
out

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
