In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import random

In [2]:
!kaggle competitions download -c titanic

titanic.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
import zipfile

zp = zipfile.ZipFile('titanic.zip') 
df = pd.read_csv(zp.open('train.csv'))

In [4]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df = df.drop(['PassengerId', 'Name', 'Parch', 'Ticket', 'Cabin'], axis = 1)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Fare        891 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(3), object(2)
memory usage: 48.9+ KB


In [7]:
df['SibSp'].value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

In [8]:
df.loc[df['SibSp'] >= 1, 'SibSp'] = 1

In [9]:
df['SibSp'].value_counts()

0    608
1    283
Name: SibSp, dtype: int64

In [10]:
df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [11]:
df['Embarked'] = df['Embarked'].fillna('S')

In [12]:
df = df.dropna(subset = ['Age'])
df['Age'] = df['Age'].astype(int)

df['Age'] = pd.cut(df['Age'], 4)
df['Fare'] = pd.qcut(df['Fare'], 5)

df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Embarked
0,0,3,male,"(20.0, 40.0]",1,"(-0.001, 7.913]",S
1,1,1,female,"(20.0, 40.0]",1,"(46.9, 512.329]",C
2,1,3,female,"(20.0, 40.0]",0,"(7.913, 13.0]",S
3,1,1,female,"(20.0, 40.0]",1,"(46.9, 512.329]",S
4,0,3,male,"(20.0, 40.0]",0,"(7.913, 13.0]",S


In [13]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for column in ['Sex', 'Age', 'Fare', 'Embarked']:
    le.fit(df[column])
    df[column] = le.transform(df[column])

In [14]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Embarked
0,0,3,1,1,1,0,2
1,1,1,0,1,1,4,0
2,1,3,0,1,0,1,2
3,1,1,0,1,1,4,2
4,0,3,1,1,0,1,2


In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.drop(['Survived'], axis = 1), df['Survived'], test_size=0.2)
y_train

769    0
777    1
833    0
599    1
362    0
      ..
505    0
608    1
822    0
305    1
80     0
Name: Survived, Length: 571, dtype: int64

In [16]:
class NBC:
    def __init__(self):
        pass
    
    def fit(self, X_train, y_train):
        self.X = X_train
        self.y = y_train
        
        self.columns = []
        self.members_of_col = []
        
        for col in self.X:
            self.columns.append(col)
            self.members_of_col.append(sorted(self.X[col].unique()))
        
        self.prob_0 = []
        self.prob_1 = []
        for i in range(len(self.columns)):
            prob_0_i, prob_1_i = [], []
            for j in range(len(self.members_of_col[i])):
                prob_0_i.append(len(self.X[self.X[self.columns[i]] == self.members_of_col[i][j]][self.y == 0]) 
                              / len(self.X[self.y == 0]))
                prob_1_i.append(len(self.X[self.X[self.columns[i]] == self.members_of_col[i][j]][self.y == 1]) 
                              / len(self.X[self.y == 1]))
            self.prob_0.append(prob_0_i)
            self.prob_1.append(prob_1_i)
        
    def predict(self, X_test):
        y_pred = []
        
        for i in range(len(X_test)):
            y_pred_i = [len(self.X[self.y == 0]) / len(self.X), len(self.X[self.y == 1]) / len(self.X)]
            
            for j in range(len(self.columns)):
                col_i_j = X_test.iloc[i][self.columns[j]]
                ind = self.members_of_col[j].index(col_i_j)
                
                y_pred_i[0] *= self.prob_0[j][ind]
                y_pred_i[1] *= self.prob_1[j][ind]
            
            if y_pred_i[0] <=  y_pred_i[1]:
                y_pred += [1]
            else:
                y_pred += [0]
        
        return y_pred

In [17]:
clf = NBC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)



In [18]:
import sklearn.naive_bayes

sklearn.naive_bayes.__all__

['BernoulliNB', 'GaussianNB', 'MultinomialNB', 'ComplementNB']

In [19]:
print('\t\t all\t\t\t survived = 0\t\t survived = 1')
print('my NB\t\t', len(y_test[y_pred == y_test]) / len(y_test), 
          '\t', len(y_test[y_pred == y_test][y_test == 0]) / len(y_test[y_test == 0]), 
          '\t', len(y_test[y_pred == y_test][y_test == 1]) / len(y_test[y_test == 1]))
print()

from sklearn.naive_bayes import *
for NB in [BernoulliNB(), GaussianNB(), MultinomialNB(), ComplementNB()]:
    clf2 = NB
    clf2.fit(X_train, y_train)
    y_pred2 = clf2.predict(X_test)
    print(NB.__class__.__name__, '\t', len(y_test[y_pred2 == y_test]) / len(y_test), 
          '\t', len(y_test[y_pred2 == y_test][y_test == 0]) / len(y_test[y_test == 0]), 
          '\t', len(y_test[y_pred2 == y_test][y_test == 1]) / len(y_test[y_test == 1]))

		 all			 survived = 0		 survived = 1
my NB		 0.8181818181818182 	 0.8918918918918919 	 0.7391304347826086

BernoulliNB 	 0.8321678321678322 	 0.9459459459459459 	 0.7101449275362319
GaussianNB 	 0.8251748251748252 	 0.9054054054054054 	 0.7391304347826086
MultinomialNB 	 0.7272727272727273 	 0.8918918918918919 	 0.5507246376811594
ComplementNB 	 0.7342657342657343 	 0.7702702702702703 	 0.6956521739130435
