# Imports

In [73]:
import pandas as pd
import numpy as np
from functools import partial

# Read both CSV splits from the working directory in one pass; the bare
# expression at the end makes the notebook render the training frame.
raw_train, raw_test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

raw_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Exploratory Data Analysis

In [None]:
# Per-column summary of the test split. Each entry is (column name, whether a
# min-max "Range" line is printed for it); the two text columns skip the range.
summary_columns = [
  ('Pclass', True),
  ('Sex', False),
  ('Age', True),
  ('SibSp', True),
  ('Parch', True),
  ('Fare', True),
  ('Embarked', False),
]

for position, (name, show_range) in enumerate(summary_columns):
  if position:
    # Blank separator between stanzas, but none after the last one.
    print()
  column = raw_test[name]
  print(name)
  if show_range:
    print('Range: ' + str(column.min()) + '-' + str(column.max()))
  # Number of null entries in the column.
  print('Missing Values: ' + str(column.isna().sum()))

Pclass
Range: 1-3
Missing Values: 0

Sex
Missing Values: 0

Age
Range: 0.17-76.0
Missing Values: 86

SibSp
Range: 0-8
Missing Values: 0

Parch
Range: 0-9
Missing Values: 0

Fare
Range: 0.0-512.3292
Missing Values: 1

Embarked
Missing Values: 0


In [None]:
# Decade histogram of passenger age: bucket each age into a 10-year bin, label
# every row by its own bin, print the mean bin, then count rows per label.
age = raw_train['Age'] // 10
age = age.set_axis(age)
print(age.mean())
age.groupby(level=0).count()

2.5252100840336134


Unnamed: 0_level_0,Age
Age,Unnamed: 1_level_1
0.0,62
1.0,102
2.0,220
3.0,167
4.0,89
5.0,48
6.0,19
7.0,6
8.0,1


In [None]:
# Histogram of ticket fare in bins of width 50: bucket each fare, label every
# row by its own bin, print the mean bin, then count rows per label.
fare = raw_train['Fare'] // 50
fare = fare.set_axis(fare)
print(fare.mean())
fare.groupby(level=0).count()

0.3221099887766554


Unnamed: 0_level_0,Fare
Fare,Unnamed: 1_level_1
0.0,730
1.0,108
2.0,24
3.0,9
4.0,11
5.0,6
10.0,3


# Preprocessing

In [None]:
def transform(data,age_bin = 10,fare_bin = 100):
  """Build the categorical feature frame used by the classifier.

  Parameters
  ----------
  data : DataFrame with the columns 'Name', 'Pclass', 'Sex', 'Age', 'SibSp',
      'Parch', 'Fare' and 'Embarked'.
  age_bin : width of an age bucket ('Age' is floor-divided by it).
  fare_bin : width of a fare bucket ('Fare' is floor-divided by it).

  Returns
  -------
  A new DataFrame, indexed like `data`, with the columns
  'Pclass', 'Title', 'Sex', 'Age', 'Alone', 'Fare', 'Embarked'.
  """
  # Rare honorifics folded into one of the four retained titles.
  aliases = {
    "Ms": "Miss", "Mlle": "Miss", "Mme": "Miss",
    "Lady": "Mrs",
    "Sir": "Mr", "Rev": "Mr",
  }
  retained = ["Mr", "Miss", "Mrs", "Master"]

  # The title is the first run of letters immediately followed by a period.
  honorific = data['Name'].str.extract(r"([A-Za-z]+)\.", expand=False).replace(aliases)
  # Anything left over (including names with no match) becomes "Others".
  honorific = honorific.where(honorific.isin(retained), "Others")

  # A passenger is alone when they have no siblings/spouses and no
  # parents/children aboard.
  travels_alone = (data['SibSp'] + data['Parch']).eq(0)

  return pd.DataFrame({
    'Pclass': data['Pclass'],
    'Title': honorific,
    'Sex': data['Sex'],
    'Age': data['Age'] // age_bin,
    'Alone': travels_alone,
    'Fare': data['Fare'] // fare_bin,
    'Embarked': data['Embarked'],
  })

# Preview the engineered feature frame for the training split (default bins).
transform(raw_train)

Unnamed: 0,Pclass,Title,Sex,Age,Alone,Fare,Embarked
0,3,Mr,male,2.0,False,0.0,S
1,1,Mrs,female,3.0,False,0.0,C
2,3,Miss,female,2.0,True,0.0,S
3,1,Mrs,female,3.0,False,0.0,S
4,3,Mr,male,3.0,True,0.0,S
...,...,...,...,...,...,...,...
886,2,Mr,male,2.0,True,0.0,S
887,1,Miss,female,1.0,True,0.0,S
888,3,Miss,female,,False,0.0,S
889,1,Mr,male,2.0,True,0.0,C


# Training

In [None]:
def groups(data):
  """Map each column of `data` to the list of distinct values it takes.

  The values are the group keys produced by grouping the frame on that
  column, in the order the groupby yields them.

  Returns a dict keyed by column name, in column order.
  """
  return {
    column: [key for key, _ in data.groupby(by=column)]
    for column in data.columns
  }

In [None]:
# Distinct values each engineered feature takes in the training split.
groups(transform(raw_train))

{'Pclass': [1, 2, 3],
 'Title': ['Master', 'Miss', 'Mr', 'Mrs', 'Others'],
 'Sex': ['female', 'male'],
 'Age': [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
 'Alone': [False, True],
 'Fare': [0.0, 1.0, 2.0, 5.0],
 'Embarked': ['C', 'Q', 'S']}

In [None]:
# Distinct values each engineered feature takes in the test split.
groups(transform(raw_test))

{'Pclass': [1, 2, 3],
 'Title': ['Master', 'Miss', 'Mr', 'Mrs', 'Others'],
 'Sex': ['female', 'male'],
 'Age': [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
 'Alone': [False, True],
 'Fare': [0.0, 1.0, 2.0, 5.0],
 'Embarked': ['C', 'Q', 'S']}

In [84]:
def probs(data,features,alpha=0):
  """Relative frequency of every listed value, per feature.

  Parameters
  ----------
  data : DataFrame containing one column per key of `features`.
  features : dict mapping a column name to the values to tabulate for it.
  alpha : optional additive (Laplace) smoothing constant. With the default
      of 0 the result is the plain relative frequency, exactly as before;
      with alpha > 0 each value gets (count + alpha) / (n + alpha * k),
      where n is the number of non-null rows and k the number of listed
      values, so values unseen in `data` no longer get probability 0.

  Returns
  -------
  dict: feature -> {value: probability, ..., 'max': largest probability}.
  The 'max' entry shares the dict with the values and is what callers fall
  back on for missing or unknown values.
  """
  p = {}
  for feature in features.keys():
    column = data[feature]
    values = list(features[feature])
    # count() ignores nulls, so probabilities are relative to known rows only.
    denominator = column.count() + alpha * len(values)
    dic = {}
    for x in values:
      hits = (column == x).sum()
      # An empty or all-null column used to divide by zero (NaN/inf plus a
      # RuntimeWarning); report probability 0 instead.
      dic[x] = (hits + alpha) / denominator if denominator else 0.0
    # Computed before the 'max' key itself is inserted; 0 when nothing listed.
    dic['max'] = max(dic.values(), default=0)
    p[feature] = dic
  return p

In [85]:
# Sanity check: value frequencies of every engineered feature over the whole
# test split (no split by outcome here).
probs(transform(raw_test),groups(transform(raw_test)))

{'Pclass': {1: np.float64(0.25598086124401914),
  2: np.float64(0.22248803827751196),
  3: np.float64(0.5215311004784688),
  'max': np.float64(0.5215311004784688)},
 'Title': {'Master': np.float64(0.050239234449760764),
  'Miss': np.float64(0.18899521531100477),
  'Mr': np.float64(0.5789473684210527),
  'Mrs': np.float64(0.1722488038277512),
  'Others': np.float64(0.009569377990430622),
  'max': np.float64(0.5789473684210527)},
 'Sex': {'female': np.float64(0.36363636363636365),
  'male': np.float64(0.6363636363636364),
  'max': np.float64(0.6363636363636364)},
 'Age': {0.0: np.float64(0.060240963855421686),
  1.0: np.float64(0.12349397590361445),
  2.0: np.float64(0.37349397590361444),
  3.0: np.float64(0.19578313253012047),
  4.0: np.float64(0.13855421686746988),
  5.0: np.float64(0.06626506024096386),
  6.0: np.float64(0.0391566265060241),
  7.0: np.float64(0.0030120481927710845),
  'max': np.float64(0.37349397590361444)},
 'Alone': {False: np.float64(0.39473684210526316),
  True: n

In [86]:
def train(data):
  """Fit the per-outcome frequency tables from a labelled frame.

  `data` must carry a 'Survived' column (0 = died, 1 = lived) in addition to
  the columns `transform` needs.

  Returns a dict with three entries:
    'died'     - value frequencies among rows with Survived == 0
    'lived'    - value frequencies among rows with Survived == 1
    'features' - list of the feature names
  """
  labelled = transform(data).assign(Survived=data['Survived'])

  # The value vocabulary comes from all rows, so both outcome tables share
  # the same keys even when a value never occurs for one outcome.
  features = groups(labelled.drop(columns=['Survived']))

  tables = {
    outcome: probs(labelled[labelled['Survived'] == flag], features)
    for outcome, flag in (('died', 0), ('lived', 1))
  }
  return {**tables, 'features': list(features.keys())}

# Inspect the frequency tables fitted on the full training split.
train(raw_train)

{'died': {'Pclass': {1: np.float64(0.14571948998178508),
   2: np.float64(0.1766848816029144),
   3: np.float64(0.6775956284153005),
   'max': np.float64(0.6775956284153005)},
  'Title': {'Master': np.float64(0.030965391621129327),
   'Miss': np.float64(0.10018214936247723),
   'Mr': np.float64(0.8051001821493625),
   'Mrs': np.float64(0.04735883424408015),
   'Others': np.float64(0.01639344262295082),
   'max': np.float64(0.8051001821493625)},
  'Sex': {'female': np.float64(0.14754098360655737),
   'male': np.float64(0.8524590163934426),
   'max': np.float64(0.8524590163934426)},
  'Age': {0.0: np.float64(0.05660377358490566),
   1.0: np.float64(0.14386792452830188),
   2.0: np.float64(0.33726415094339623),
   3.0: np.float64(0.22169811320754718),
   4.0: np.float64(0.12971698113207547),
   5.0: np.float64(0.0660377358490566),
   6.0: np.float64(0.030660377358490566),
   7.0: np.float64(0.014150943396226415),
   8.0: np.float64(0.0),
   'max': np.float64(0.33726415094339623)},
  'Alon

In [107]:
def helper(probs,row):
  """Multiply together the table probability of each feature value in `row`.

  `probs` maps feature -> {value: probability, ..., 'max': probability}.
  When the row's value is null or absent from the feature's table, that
  table's 'max' entry is used as the factor instead.

  Returns the running product (1 when `probs` is empty).
  """
  likelihood = 1
  for feature, table in probs.items():
    value = row[feature]
    known = not pd.isna(value) and value in table
    likelihood *= table[value] if known else table['max']
  return likelihood

def predict(data,model):
  """Predict 0/1 survival for every row of `data`.

  Parameters
  ----------
  data : raw passenger frame accepted by `transform`.
  model : dict with 'lived' and 'died' frequency tables, as built by `train`.

  Returns
  -------
  Series of 1 (lived) / 0 (died), indexed like `data`. A row is labelled 1
  only when its 'lived' score is strictly greater than its 'died' score,
  so ties go to 0.

  NOTE(review): only the per-feature probability products are compared; no
  class prior is multiplied in, because the model dict carries none.
  """
  d = transform(data)

  # Score each row under both outcome tables, then pick the larger one.
  lived_score = d.apply(partial(helper, model['lived']), axis=1)
  died_score = d.apply(partial(helper, model['died']), axis=1)

  return (lived_score > died_score).map({True:1,False:0})

# Fit on the full training split, then predict that same split back
# (in-sample predictions, shown as a quick smoke test).
model = train(raw_train)
predict(raw_train,model)

Unnamed: 0,0
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,1
889,0


# Evaluation

In [112]:
def evaluate(data,frac=.8,random_state=None):
  """Hold-out error rate of the classifier on one random split.

  Parameters
  ----------
  data : labelled frame accepted by `train` (must include 'Survived').
  frac : fraction of rows sampled for training; the remaining rows are the
      validation set. Defaults to the original 0.8.
  random_state : forwarded to `DataFrame.sample`. Leave as None for a fresh
      random split on every call (the original behaviour); pass an int to
      make the split, and therefore the returned metric, reproducible.

  Returns
  -------
  Mean absolute difference between the predictions and 'Survived' on the
  validation rows, i.e. the fraction misclassified for 0/1 labels (lower is
  better).
  """
  training = data.sample(frac=frac, random_state=random_state)
  # Everything not drawn for training is held out.
  validation = data.drop(training.index)
  target = validation['Survived']

  model = train(training)
  return (predict(validation,model) - target).abs().mean()

# Error rate on the held-out rows of a single random train/validation split.
evaluate(raw_train)

np.float64(0.16853932584269662)

In [114]:
# Average the hold-out error over n independent random splits to smooth out
# the split-to-split variance of a single evaluation.
n = 100
x = sum(evaluate(raw_train) for _ in range(n))

x / n

np.float64(0.21196629213483145)

# Prediction

In [117]:
# Refit on the full training split, label the test split, and write the
# two-column file (PassengerId, Survived) without the index column.
model = train(raw_train)
out = raw_test[['PassengerId']].assign(Survived=predict(raw_test, model))
out.to_csv('Bayes.csv', index = False)
out

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
