In [39]:
#importing required libraries
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import zipfile as zip
import math
plt.rcParams['figure.figsize'] = (20.0, 10.0)

In [40]:
#read data
def loaddata(filename, key):
    zf = zip.ZipFile(filename) 
    traindata = pd.read_csv(zf.open('train.csv'))
    testdata = pd.read_csv(zf.open('test.csv'))
    if key == 0:
        return traindata
    else :
        return testdata

In [41]:
def preprocess(data):
    
    #drop unnecessary fields
    data = data.drop(['Name','Cabin','Ticket','SibSp','Parch'], axis=1)
    
    #convert the column of Sex into numeric value
    for i in range(data['Sex'].size):
        if data['Sex'][i] == 'female':
            data.loc[i, 'Sex'] = 0
        else:
            data.loc[i, 'Sex'] = 1
    data['Sex'] = pd.to_numeric(data['Sex'])
    
    #convert the column of Sex into numeric value
    for i in range(data['Embarked'].size):
        if data['Embarked'][i] == 'C':
            data.loc[i, 'Embarked'] = 0
        elif data['Embarked'][i] == 'Q':
            data.loc[i, 'Embarked'] = 1
        else:
            data.loc[i, 'Embarked'] = 2
    data['Embarked'] = pd.to_numeric(data['Embarked'])
    
    #remove null values
    data = data.dropna()
    return data

In [42]:
def accuracy(yp,y_test):
    err = 0
    for i in range(len(yp)):
        if yp[i][1] == y_test[i][1]:
            err += 0
        else:
            err += 1
    n = len(yp)
    acc = 1 - (err/n)
    return acc

In [51]:
def naivebayes(x,y):
    n = len(y)
    
    #splitting into test and train
    m = 2*n//3
    x_train = x.iloc[:m]
    x_test = x.iloc[m:]
    y_train = y.iloc[:m]
    y_test = y.iloc[m:]
    
    prob_s = 0
    prob_sm = 0
    prob_sf = 0
    prob_sp1 = 0
    prob_sp2 = 0
    prob_sp3 = 0
    prob_se0 = 0
    prob_se1 = 0
    prob_se2 = 0
    for i in x_train.index:
        if y_train['Survived'][i] == 1 :
            prob_s +=1
            if x_train['Sex'][i] == 0:
                prob_sf +=1
            elif x_train['Sex'][i] == 1:
                prob_sm +=1
            elif x_train['Pclass'][i] == 1:
                prob_sp1 +=1
            elif x_train['Pclass'][i] == 2:
                prob_sp2 +=1
            elif x_train['Pclass'][i] == 3:
                prob_sp3 +=1
            elif x_train['Embarked'][i] == 0:
                prob_se0 +=1
            elif x_train['Embarked'][i] == 1:
                prob_se1 +=1
            elif x_train['Embarked'][i] == 2:
                prob_se2 +=1
    n = len(x_train['Sex'])
    prob_s = prob_s/n
    prob_sp1 = prob_sp1/n
    prob_sp2 = prob_sp2/n
    prob_sp3 = prob_sp3/n
    prob_se0 = prob_se0/n
    prob_se1 = prob_se1/n
    prob_se2 = prob_se2/n
    prob_sf = prob_sf/n
    prob_sm = prob_sm/n
    
    
    prob_m = 0
    prob_f = 0
    prob_p1 = 0
    prob_p2 = 0
    prob_p3 = 0
    prob_e0 = 0
    prob_e1 = 0
    prob_e2 = 0
    for i in x_train.index:
        if x_train['Sex'][i] == 0:
            prob_f +=1
        elif x_train['Sex'][i] == 1:
            prob_m +=1
        elif x_train['Pclass'][i] == 1:
            prob_p1 +=1
        elif x_train['Pclass'][i] == 2:
            prob_p2 +=1
        elif x_train['Pclass'][i] == 3:
            prob_p3 +=1
        elif x_train['Embarked'][i] == 0:
            prob_e0 +=1
        elif x_train['Embarked'][i] == 1:
            prob_e1 +=1
        elif x_train['Embarked'][i] == 2:
            prob_e2 +=1
    prob_p1 = prob_p1/n
    prob_p2 = prob_p2/n
    prob_p3 = prob_p3/n
    prob_e0 = prob_e0/n
    prob_e1 = prob_e1/n
    prob_e2 = prob_e2/n
    prob_f = prob_f/n
    prob_m = prob_m/n
    
    yp = []
    p = 0
    ps = 0
    pe = 0
    pc = 0
    for i in x_test.index:
        temp = []
        temp.append(x_test['PassengerId'][i])
        if x_test['Sex'][i] == 0:
            ps = (prob_f*prob_sf)/prob_s
        elif x_test['Sex'][i] == 1:
            ps = (prob_m*prob_sm)/prob_s
        elif x_test['Pclass'][i] == 1:
            pc = (prob_p1*prob_sp1)/prob_s
        elif x_test['Pclass'][i] == 2:
            pc = (prob_p2*prob_sp2)/prob_s
        elif x_test['Pclass'][i] == 3:
            pc = (prob_p3*prob_sp3)/prob_s
        elif x_test['Embarked'][i] == 0:
            pe = (prob_e0*prob_se0)/prob_s
        elif x_test['Embarked'][i] == 1:
            pe = (prob_e1*prob_se1)/prob_s
        elif x_test['Embarked'][i] == 2:
            pe = (prob_e2*prob_se2)/prob_s
        p = ps*pc*pe
        if p>0.5:
            temp.append(1)
        else:
            temp.append(0)
        yp.append(temp)
    
    y_test = y_test.values.tolist()
    acc = accuracy(yp,y_test)
    print("Accuracy:",acc)
    #print(yp)

In [52]:
def main():
    filename = 'titanic.zip'
    data = loaddata(filename,0)
    traindata = preprocess(data)
    x = traindata.drop(['Survived','Fare','Age'], axis=1)
    y = traindata.drop(['Pclass','Sex','Age','Fare','Embarked'], axis=1)
    n = len(x)//2
    naivebayes(x,y)

In [53]:
main()

Accuracy: 0.6092436974789917
