In [222]:
#importing required libraries
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import zipfile as zip
import math
plt.rcParams['figure.figsize'] = (20.0, 10.0)

In [223]:
#read data
def loaddata(filename, key):
    zf = zip.ZipFile(filename) 
    traindata = pd.read_csv(zf.open('train.csv'))
    testdata = pd.read_csv(zf.open('test.csv'))
    if key == 0:
        return traindata
    else :
        return testdata

In [224]:
def preprocess(data):
    
    #drop unnecessary fields
    data = data.drop(['Name','Cabin','Ticket','SibSp','Parch'], axis=1)
    
    #convert the column of Sex into numeric value
    for i in range(data['Sex'].size):
        if data['Sex'][i] == 'female':
            data.loc[i, 'Sex'] = 0
        else:
            data.loc[i, 'Sex'] = 1
    data['Sex'] = pd.to_numeric(data['Sex'])
    
    #remove null values
    data = data.dropna()
    return data

In [225]:
def dis(x_train,x_test,k) :
    fin = []
    t = []
    for j in x_test.index:
        d = []
        for i in x_train.index:
            temp = []
            dist = math.sqrt((x_train['Sex'][i]-x_test['Sex'][j])**2 + (x_train['Age'][i]-x_test['Age'][j])**2 + (x_train['Fare'][i]-x_test['Fare'][j])**2)
            temp.append(dist)
            temp.append(x_train['PassengerId'][i])
            temp.append(x_test['PassengerId'][j])
            d.append(temp)
        d.sort()
        fin.append(d[:k])
    return fin

In [226]:
def predict(dist,c1,c2):
    yp = []
    for i in dist:
        count1 = 0
        count2 = 0
        temp = []
        t = 0
        for j in i:
            t = j[2]
            if j[1] in c1:
                count1 += 1
            else:
                count2 += 1
        if count1>count2:
            temp.append(t)
            temp.append(0)
            yp.append(temp)
        else:
            temp.append(t)
            temp.append(1)
            yp.append(temp)
    return yp

In [227]:
def accuracy(yp,y_test):
    err = 0
    for i in range(len(yp)):
        if yp[i][1] == y_test[i][1]:
            err += 0
        else:
            err += 1
    n = len(yp)
    acc = 1 - (err/n)
    return acc

In [228]:
def knn(x,y):
    n = len(y)
    
    #splitting into test and train
    m = 2*n//3
    x_train = x.iloc[:m]
    x_test = x.iloc[m:]
    y_train = y.iloc[:m]
    y_test = y.iloc[m:]
    
    #creating seperate classes
    c1 = []
    c2 = []
    for i in x_train.index:
        if y_train['Survived'][i] == 0 :
            c1.append(y_train['PassengerId'][i])
        else:
            c2.append(y_train['PassengerId'][i])
            
    y_test = y_test.values.tolist()
    k = 2
    kopt = 2
    maxac = 0
    for k in range(20):
        dist = dis(x_train,x_test,k)
        yp = predict(dist,c1,c2)
        acc = accuracy(yp,y_test)    
        if acc> maxac:
            kopt = k
            maxac = acc
    print("Optimal value of k is :",k)
    print("Efficiency:",maxac)

In [229]:
def main():
    filename = 'titanic.zip'
    data = loaddata(filename,0)
    traindata = preprocess(data)
    x = traindata.drop(['Survived'], axis=1)
    y = traindata.drop(['Pclass','Sex','Age','Fare','Embarked'], axis=1)
    n = len(x)//2
    knn(x,y)

In [230]:
main()

Optimal value of k is : 19
Efficiency: 0.7436974789915967
