# Imports

In [19]:
import pandas as pd
import numpy as np

raw_train = pd.read_csv("train.csv")
raw_test = pd.read_csv("test.csv")

raw_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Exploratory Data Analysis

In [18]:
print('Pclass')
print('Range: ' + str(raw_test['Pclass'].min()) + '-' + str(raw_test['Pclass'].max()))
print('Missing Values: ' + str(raw_test['Pclass'].size - raw_test['Pclass'].count()))
print()
print('Sex')
print('Missing Values: ' + str(raw_test['Sex'].size - raw_test['Sex'].count()))
print()
print('Age')
print('Range: ' + str(raw_test['Age'].min()) + '-' + str(raw_test['Age'].max()))
print('Missing Values: ' + str(raw_test['Age'].size - raw_test['Age'].count()))
print()
print('SibSp')
print('Range: ' + str(raw_test['SibSp'].min()) + '-' + str(raw_test['SibSp'].max()))
print('Missing Values: ' + str(raw_test['SibSp'].size - raw_test['SibSp'].count()))
print()
print('Parch')
print('Range: ' + str(raw_test['Parch'].min()) + '-' + str(raw_test['Parch'].max()))
print('Missing Values: ' + str(raw_test['Parch'].size - raw_test['Parch'].count()))
print()
print('Fare')
print('Range: ' + str(raw_test['Fare'].min()) + '-' + str(raw_test['Fare'].max()))
print('Missing Values: ' + str(raw_test['Fare'].size - raw_test['Fare'].count()))
print()
print('Embarked')
print('Missing Values: ' + str(raw_test['Embarked'].size - raw_test['Embarked'].count()))

Pclass
Range: 1-3
Missing Values: 0

Sex
Missing Values: 0

Age
Range: 0.17-76.0
Missing Values: 86

SibSp
Range: 0-8
Missing Values: 0

Parch
Range: 0-9
Missing Values: 0

Fare
Range: 0.0-512.3292
Missing Values: 1

Embarked
Missing Values: 0


In [29]:
age = raw_train['Age'].floordiv(10)
age.index = age
print(age.mean())
age.groupby(level=0).count()

2.5252100840336134


Unnamed: 0_level_0,Age
Age,Unnamed: 1_level_1
0.0,62
1.0,102
2.0,220
3.0,167
4.0,89
5.0,48
6.0,19
7.0,6
8.0,1


In [54]:
fare = raw_train['Fare'].floordiv(50)
fare.index = fare
print(fare.mean())
fare.groupby(level=0).count()

0.3221099887766554


Unnamed: 0_level_0,Fare
Fare,Unnamed: 1_level_1
0.0,730
1.0,108
2.0,24
3.0,9
4.0,11
5.0,6
10.0,3


# Preprocessing

In [131]:
def transform(data,age_bin = 10,fare_bin = 50):
  sex = data['Sex'].map({'male':0,'female':1,np.nan:1})
  age = data['Age'].floordiv(age_bin,fill_value=data['Age'].mean())
  fare = data['Fare'].floordiv(fare_bin,fill_value=0)
  embarked = data['Embarked'].map({'S':0,'C':1,'Q':2})
  cleaned = pd.DataFrame({'Pclass':data['Pclass'],'Sex':sex,'Age':age,'SibSp':data['SibSp'],'Parch':data['Parch'],'Fare':fare,'Embarked':embarked})
  return cleaned

transform(raw_train)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,2.0,1,0,0.0,0.0
1,1,1,3.0,1,0,1.0,1.0
2,3,1,2.0,0,0,0.0,0.0
3,1,1,3.0,1,0,1.0,0.0
4,3,0,3.0,0,0,0.0,0.0
...,...,...,...,...,...,...,...
886,2,0,2.0,0,0,0.0,0.0
887,1,1,1.0,0,0,0.0,0.0
888,3,1,2.0,1,2,0.0,0.0
889,1,0,2.0,0,0,0.0,1.0


# Training

In [71]:
def entropy(data):
  p1 = data['Survived'].mean();
  p0 = 1 - p1
  if p0 == 0 or p1 == 0: return 0
  return -1 * (p0*np.log2(p0) + p1*np.log2(p1))

entropy(raw_train)

np.float64(0.9607079018756469)

In [174]:
def train(data,labels,max_depth,min_improv=0):
  if(max_depth > 0):
    H = entropy(data)
    if(H > 0):
      min = 2
      for l in labels:
        groupby = data.groupby(by=l)
        e = 0
        for value, group in groupby:
          e += entropy(group) * len(group)/len(data)
        if e < min:
          min = e
          label = l
          g = groupby
      if min < H - min_improv:
        labels.remove(label)
        next = dict();
        for value, group in g:
          next.update({value:train(group,labels,max_depth-1)})
        return {'label':label,'entropy':H,'next':next}
  return {'label':'End','entropy':entropy(data),'class': 0 if data['Survived'].mean() < .5 else 1}

In [175]:
data = transform(raw_train)
data['Survived'] = raw_train['Survived']
labels = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
model = train(data,labels,len(labels))
model

{'label': 'Sex',
 'entropy': np.float64(0.9607079018756469),
 'next': {0: {'label': 'Age',
   'entropy': np.float64(0.6991817891208407),
   'next': {0.0: {'label': 'SibSp',
     'entropy': np.float64(0.9744894033980523),
     'next': {0: {'label': 'End', 'entropy': 0, 'class': 1},
      1: {'label': 'End', 'entropy': 0, 'class': 1},
      2: {'label': 'End', 'entropy': 0, 'class': 1},
      3: {'label': 'End', 'entropy': 0, 'class': 0},
      4: {'label': 'Parch',
       'entropy': np.float64(0.4689955935892812),
       'next': {1: {'label': 'End', 'entropy': 0, 'class': 0},
        2: {'label': 'End',
         'entropy': np.float64(0.9182958340544896),
         'class': 0}}},
      5: {'label': 'End', 'entropy': 0, 'class': 0}}},
    1.0: {'label': 'Fare',
     'entropy': np.float64(0.5373760853377334),
     'next': {0.0: {'label': 'Embarked',
       'entropy': np.float64(0.46274905857817394),
       'next': {0.0: {'label': 'Pclass',
         'entropy': np.float64(0.4262286569981447),

# Testing

In [176]:
def test(data,model):
  correct = 0
  for i in data.index:
    layer = model
    while not layer['label'] == 'End':
      value = data[layer['label']][i]
      if value in layer['next']:
        layer = layer['next'][value]
      else:
        keys = list(layer['next'].keys())
        before = keys[0]
        for k in keys:
          after = k
          if(after > value): break
          before = k
        if after - value < value - before:
          layer = layer['next'][after]
        else:
          layer = layer['next'][before]
    if layer['class'] == data['Survived'][i]: correct += 1
  return correct / len(data)

In [177]:
test(data,model)

0.8092031425364759

# Evaluation

In [178]:
data = transform(raw_train)
data['Survived'] = raw_train['Survived']
training = data.sample(frac=.8)
validation = data.drop(training.index)
labels = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
for i in range(len(labels)):
  labels = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
  model = train(training,labels,i)
  print(str(i) + ': ' + str(test(validation,model)))

0: 0.6460674157303371
1: 0.7415730337078652
2: 0.7415730337078652
3: 0.7528089887640449
4: 0.7584269662921348
5: 0.7584269662921348
6: 0.7584269662921348


In [179]:
for a in range(1,100,1):
  data = transform(raw_train,age_bin=a)
  data['Survived'] = raw_train['Survived']
  training = data.sample(frac=.8)
  validation = data.drop(training.index)
  labels = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
  model = train(training,labels,len(labels))
  print(str(a) + ': ' + str(test(validation,model)))

1: 0.7471910112359551
2: 0.797752808988764
3: 0.8089887640449438
4: 0.7528089887640449
5: 0.8146067415730337
6: 0.7808988764044944
7: 0.7696629213483146
8: 0.7752808988764045
9: 0.797752808988764
10: 0.848314606741573
11: 0.7865168539325843
12: 0.8089887640449438
13: 0.8089887640449438
14: 0.8426966292134831
15: 0.8202247191011236
16: 0.8370786516853933
17: 0.7696629213483146
18: 0.797752808988764
19: 0.7415730337078652
20: 0.8033707865168539
21: 0.7865168539325843
22: 0.7528089887640449
23: 0.7528089887640449
24: 0.8202247191011236
25: 0.7752808988764045
26: 0.8202247191011236
27: 0.7528089887640449
28: 0.7471910112359551
29: 0.8033707865168539
30: 0.7808988764044944
31: 0.7528089887640449
32: 0.7752808988764045
33: 0.7584269662921348
34: 0.8033707865168539
35: 0.7471910112359551
36: 0.7921348314606742
37: 0.7696629213483146
38: 0.7808988764044944
39: 0.8146067415730337
40: 0.7921348314606742
41: 0.8314606741573034
42: 0.7471910112359551
43: 0.7808988764044944
44: 0.7696629213483146
4

In [149]:
for f in range(5,500,5):
  data = transform(raw_train,fare_bin=f)
  data['Survived'] = raw_train['Survived']
  training = data.sample(frac=.8)
  validation = data.drop(training.index)
  labels = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
  model = train(training,labels,len(labels))
  print(str(f) + ': ' + str(test(validation,model)))

5: 0.7359550561797753
10: 0.7528089887640449
15: 0.7752808988764045
20: 0.8033707865168539
25: 0.8202247191011236
30: 0.8258426966292135
35: 0.797752808988764
40: 0.7865168539325843
45: 0.7471910112359551
50: 0.8146067415730337
55: 0.8202247191011236
60: 0.7921348314606742
65: 0.8426966292134831
70: 0.7865168539325843
75: 0.7696629213483146
80: 0.848314606741573
85: 0.7528089887640449
90: 0.8089887640449438
95: 0.8539325842696629
100: 0.8370786516853933
105: 0.7752808988764045
110: 0.8089887640449438
115: 0.7359550561797753
120: 0.8314606741573034
125: 0.7696629213483146
130: 0.8146067415730337
135: 0.8314606741573034
140: 0.7865168539325843
145: 0.7640449438202247
150: 0.8089887640449438
155: 0.7921348314606742
160: 0.8202247191011236
165: 0.8202247191011236
170: 0.7921348314606742
175: 0.8146067415730337
180: 0.7303370786516854
185: 0.7808988764044944
190: 0.7528089887640449
195: 0.8089887640449438
200: 0.7921348314606742
205: 0.8370786516853933
210: 0.7640449438202247
215: 0.8033707

In [172]:
data = transform(raw_train)
data['Survived'] = raw_train['Survived']
training = data.sample(frac=.8)
validation = data.drop(training.index)
labels = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
for i in range(30):
  labels = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
  model = train(training,labels,len(labels),min_improv=i/100)
  print(str(i) + ': ' + str(test(validation,model)))

0: 0.7640449438202247
1: 0.7640449438202247
2: 0.7640449438202247
3: 0.7640449438202247
4: 0.7640449438202247
5: 0.7640449438202247
6: 0.7640449438202247
7: 0.7640449438202247
8: 0.7640449438202247
9: 0.7640449438202247
10: 0.7640449438202247
11: 0.7640449438202247
12: 0.7640449438202247
13: 0.7640449438202247
14: 0.7640449438202247
15: 0.7640449438202247
16: 0.7640449438202247
17: 0.7640449438202247
18: 0.7640449438202247
19: 0.7640449438202247
20: 0.7640449438202247
21: 0.7640449438202247
22: 0.7640449438202247
23: 0.5955056179775281
24: 0.5955056179775281
25: 0.5955056179775281
26: 0.5955056179775281
27: 0.5955056179775281
28: 0.5955056179775281
29: 0.5955056179775281


# Prediction

In [158]:
def predict(data,model):
  out = pd.DataFrame(columns=['PassengerId','Survived'])
  out['PassengerId'] = data['PassengerId']
  for i in data.index:
    layer = model
    while not layer['label'] == 'End':
      value = data[layer['label']][i]
      if value in layer['next']:
        layer = layer['next'][value]
      else:
        keys = list(layer['next'].keys())
        before = keys[0]
        for k in keys:
          after = k
          if(after > value): break
          before = k
        if after - value < value - before:
          layer = layer['next'][after]
        else:
          layer = layer['next'][before]
    out.loc[i,'Survived'] = layer['class']
  return out

In [168]:
data = transform(raw_train,age_bin=5,fare_bin=20)
data['Survived'] = raw_train['Survived']
training = data.sample(frac=.8)
validation = data.drop(training.index)
labels = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
model = train(training,labels,len(labels))
test(validation,model)

0.8426966292134831

In [169]:
data = transform(raw_test,age_bin=5,fare_bin=20)
data['PassengerId'] = raw_test['PassengerId']
out = predict(data,model)
out.to_csv('DecisionTree.csv', index = False)
out

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
