# Classification - KNN

Dataset - The Titanic dataset: <br/>
<img src="images/ships-titanic-vehicles-best.jpg" Width="800"/>

In [12]:
# import file section:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math 
from random import randrange

In [13]:
#The xInstances contains all columns, besides the 'Survived' column, which is the category column
# ----- the xInstances is a dataframe, with the 'PassengerId' column as an index.
# ----- the xInstances dataframe contains the instance vectors (each row is an instance with values of the features) 
# The yCategories contains only two columns: 'PassengerId','Survived' (which is a column of the categories) 
# ----- the yCategories is a dataframe, with the 'PassengerId' column as an index.
# ----- the yCategories dataframe contains only the category column ('Survived'), besides the index 
# ----- (each value in the 'Survived' column corresponds to the feature vector in xInstances with the same 'PassengerId') 
# --------------------- 
def loadCsvFile(fileName):
    """Read a CSV file and split it into a feature dataframe and a category dataframe.

    Parameters
    ----------
    fileName : path of the CSV file, handed directly to ``pd.read_csv``.

    Returns
    -------
    (xInstances, yCategories)
        xInstances  - the first ten columns of the file, with 'PassengerId' as the index.
        yCategories - the 'Survived' column only, with 'PassengerId' as the index.
    """
    titanic = pd.read_csv(fileName)

    # Feature vectors: columns 0..9 of the file; 'PassengerId' becomes the index.
    featureFrame = pd.DataFrame(titanic.iloc[:, 0:10])
    xInstances = featureFrame.set_index('PassengerId')

    # Categories: only 'PassengerId' and 'Survived' are kept; 'PassengerId' becomes the index.
    categoryFrame = pd.DataFrame(titanic, columns=['PassengerId', 'Survived'])
    yCategories = categoryFrame.set_index('PassengerId')

    return xInstances, yCategories
  

In [14]:
# Load the dataset through loadCsvFile and sanity-check both returned dataframes.
xTitanicInstances, yTitanicCategories = loadCsvFile('titanic_dataset_fix.csv')
# validating that the data is loaded successfully:
assert xTitanicInstances is not None, 'titanic instance dataframe object is empty'
print ( 'titanic instance dataframe object is NOT empty')
# the category column must not leak into the feature vectors
assert 'Survived' not in xTitanicInstances.columns, 'instances should not include the category column'
# spot check: the 2nd and 3rd rows (positions 1:3) are expected to have ages 38 and 26
assert np.array_equal(xTitanicInstances['Age'].head().tolist()[1:3],[38, 26]), 'titanic instance dataframe object was NOT loaded successfully'
print ("\t titanic instance dataframe object was loaded successfuly")
assert yTitanicCategories is not None, 'titanic category dataframe object is empty'
assert 'Survived' in yTitanicCategories.columns, 'category dataframe is missing the category column'
print ( 'titanic category dataframe object is NOT empty')
# spot check: the first two categories are expected to be 0 and 1
assert np.array_equal(yTitanicCategories['Survived'].head().tolist()[:2],[0, 1]), 'titanic category dataframe object was NOT loaded successfully'
print ("\t titanic category dataframe object was loaded successfuly")
print ('------------------------\n')

  
# Show the first rows of both dataframes (only two of the features are displayed).
print ('instance (=feature vector) datafame (displaying the "age","gender" features):')
print (xTitanicInstances[['Age','Gender']].head())
print ('------------------------\n')
print ('class (=category) datafame:')
print (yTitanicCategories.head())
print ('------------------------\n')


titanic instance dataframe object is NOT empty
	 titanic instance dataframe object was loaded successfuly
titanic category dataframe object is NOT empty
	 titanic category dataframe object was loaded successfuly
------------------------

instance (=feature vector) datafame (displaying the "age","gender" features):
              Age  Gender
PassengerId              
1            22.0    male
2            38.0  female
3            26.0  female
4            35.0  female
5            35.0    male
------------------------

class (=category) datafame:
             Survived
PassengerId          
1                   0
2                   1
3                   1
4                   1
5                   0
------------------------



In [15]:
# input parameter: xInstances - the instance dataframe (=feature vectors) without the categories  
# input parameter: yCategories - the category (=class/tag) dataframe
# input parameter: testRatio - a number between 0 and 1, which represents the fraction of the dataset that will be used for the test set
# --- as explained, frequently used values are: 0.1, 0.2 or 0.3
# output: xTrainInstances, xTestInstances, yTrainCategories, yTestCategories

def trainTestSplit(xInstances,yCategories,testRatio):
    """Randomly split the instances and their categories into a train set and a test set.

    Parameters
    ----------
    xInstances  : dataframe of feature vectors (one row per instance).
    yCategories : dataframe of categories, row-aligned with xInstances.
    testRatio   : fraction of the rows (0 <= testRatio <= 1) that goes to the test set.

    Returns
    -------
    (xTrainInstances, xTestInstances, yTrainCategories, yTestCategories)
        The test set holds int(rowCount * testRatio) rows; the train set holds every
        remaining row, so no row is lost to rounding.

    Raises
    ------
    ValueError
        If the two dataframes differ in row count, or testRatio is outside [0, 1].
    """
    totalCount = xInstances.shape[0]
    if yCategories.shape[0] != totalCount:
        raise ValueError('xInstances and yCategories must have the same number of rows')
    if not 0 <= testRatio <= 1:
        raise ValueError('testRatio must be between 0 and 1')

    testCount = int(totalCount * testRatio)

    # Draw every row position exactly once, in random order. Rejection sampling with
    # randrange is kept on purpose: a seeded run consumes random numbers exactly as before.
    isUsed = np.zeros(totalCount, dtype=bool)
    drawnOrder = []
    if totalCount > 0:
        index = randrange(totalCount)
        for _ in range(totalCount):
            while isUsed[index]:
                index = randrange(totalCount)
            isUsed[index] = True
            drawnOrder.append(index)

    # The first draws form the test set; the train set is the remainder (not a second
    # truncated product, which could leave rows in neither set).
    testRows = drawnOrder[:testCount]
    trainRows = drawnOrder[testCount:]

    xTrainInstances = xInstances.iloc[trainRows]
    yTrainCategories = yCategories.iloc[trainRows]
    xTestInstances = xInstances.iloc[testRows]
    yTestCategories = yCategories.iloc[testRows]
    return (xTrainInstances,xTestInstances,yTrainCategories,yTestCategories)
    

In [16]:
# Check trainTestSplit on a small sample: the first 10 instances and their categories.
xTitanicInstances10 = xTitanicInstances.iloc[:10]
yTitanicCategories10= yTitanicCategories.iloc[:10]

# seed the `random` module so the split drawn below is reproducible
from random import seed
seed(1)
splitSize=0.3
xTrain,xTest,yTrain,yTest = trainTestSplit(xTitanicInstances10,yTitanicCategories10,splitSize)
# The expected sizes are float products (row count * ratio); the equality checks
# rely on those products being whole numbers for this sample size and ratio.
assert xTest is not None, 'xTest should not be None'
assert len(xTest)==len(xTitanicInstances10)*splitSize,'wrong split size'
assert yTest is not None, 'yTest should not be None'
assert len(yTest)==len(xTitanicInstances10)*splitSize,'wrong split size'
assert xTrain is not None, 'xTrain should not be None'
assert len(xTrain)==len(xTitanicInstances10)*(1-splitSize),'wrong split size'
assert yTrain is not None, 'yTrain should not be None'
assert len(yTrain)==len(xTitanicInstances10)*(1-splitSize),'wrong split size'

# display two of the features for the train rows, then for the test rows
print(xTrain[['Age','Gender']])
print('-----------------')
print(xTest[['Age','Gender']])
print('-----------------')



                   Age  Gender
PassengerId                   
5            35.000000    male
8             2.000000    male
7            54.000000    male
4            35.000000  female
1            22.000000    male
6            41.560785    male
9            27.000000  female
-----------------
              Age  Gender
PassengerId              
3            26.0  female
10           14.0  female
2            38.0  female
-----------------


In [17]:
# xSeriesTestVector and xSeriesTrainVector are actually Series objects
# ---- each of xSeriesTestVector and xSeriesTrainVector consists of a single row (taken from xTest and xTrain correspondingly)
# The xInstances contains all columns, besides the 'Survived' column, which is the category column

def euclideanDist(xSeriesTestVector,xSeriesTrainVector):
    """Return the Euclidean distance between a test vector and a train vector.

    Both arguments are Series objects. Values are paired by position (not by index
    label), over as many positions as the train vector has.
    """
    sumOfSquares = 0.0
    featureCount = len(xSeriesTrainVector)
    for position in range(featureCount):
        testValue = xSeriesTestVector.iloc[position]
        trainValue = xSeriesTrainVector.iloc[position]
        # accumulate the squared gap of this feature
        sumOfSquares = sumOfSquares + (testValue - trainValue) ** 2
    return np.sqrt(sumOfSquares)


In [18]:
# Check euclideanDist on three features of the 1st and the 8th instances.
inst0 = xTitanicInstances[['Pclass','SibSp','Parch']].iloc[0]
inst7 = xTitanicInstances[['Pclass','SibSp','Parch']].iloc[7]
print (inst0.values)
print (inst7.values)
dist = euclideanDist(inst0,inst7)
# the distance is truncated (not rounded) to two decimals before the comparison
assert float(int(dist*100))/100 == 2.23, 'unexpected euclidean distance'
print ('Euclidean Distance calculated successfuly: %f' %(dist))

[3 1 0]
[3 3 1]
Euclidean Distance calculated successfuly: 2.236068


In [19]:
# Change the following function to return the Manhattan Distance between two vectors from the test and train
# xSeriesTestVector and xSeriesTrainVector are actually Series objects
# ---- each of xSeriesTestVector and xSeriesTrainVector consists of a single row (taken from xTest and xTrain correspondingly)
# The xInstances contains all columns, besides the 'Survived' column, which is the category column

def manhattanDist(xSeriesTestVector,xSeriesTrainVector):
    """Return the Manhattan (L1) distance between a test vector and a train vector.

    Both arguments are Series objects. Values are paired by position, the same way
    euclideanDist pairs them. Subtracting the Series directly would align them on
    their index labels instead, and vectors whose labels differ (for example feature
    names on one side and 0..n-1 on the other) would produce NaN differences.

    Raises
    ------
    ValueError
        If the two vectors do not have the same number of values.
    """
    testValues = np.asarray(xSeriesTestVector)
    trainValues = np.asarray(xSeriesTrainVector)
    if testValues.shape != trainValues.shape:
        raise ValueError('both vectors must have the same number of features')

    # Plain left-to-right accumulation over Python scalars, so label-aligned inputs
    # give exactly the same number as before.
    total = 0
    for gap in (testValues - trainValues).tolist():
        total += abs(gap)
    return total


In [20]:
# Check manhattanDist on three features of the 1st and the 8th instances.
inst0 = xTitanicInstances[['Pclass','SibSp','Parch']].iloc[0]
inst7 = xTitanicInstances[['Pclass','SibSp','Parch']].iloc[7]
print (inst0.values)
print (inst7.values)
dist = manhattanDist(inst0,inst7)
# expected: |3-3| + |1-3| + |0-1| = 3 for the two vectors printed above
assert dist== 3, 'unexpected manhattan distance'
print ('Manhattan Distance calculated successfuly: %d' %(dist))

[3 1 0]
[3 3 1]
Manhattan Distance calculated successfuly: 3


In [21]:
# input parameter - xSeriesTestVector - is a Series object, from the test set
# input parameter - xTrainInstances - is a dataframe including all train instances
# input parameter - distanceMetric - the name of the distance function, not as a string
# ---- distanceMetric - options are: euclideanDist ; manhattanDist
# The returned value is a numpy array, containing distances between the test instance
#                 and all train instances (ordered by the train instance order) 

def calcDistances(xSeriesTestVector, xTrainInstances,distanceMetric):
    """Return the distances between one test vector and every train instance.

    Parameters
    ----------
    xSeriesTestVector : Series holding the test instance.
    xTrainInstances   : dataframe holding all train instances.
    distanceMetric    : the distance function itself (e.g. euclideanDist or manhattanDist).

    Returns
    -------
    numpy float array with one distance per train row, in train-row order.
    """
    distances = [
        distanceMetric(xSeriesTestVector, xTrainInstances.iloc[position])
        for position in range(len(xTrainInstances))
    ]
    return np.array(distances, dtype=float)




# input parameter - xSeriesTestVector - is a Series object, from the test set
# input parameter - xTrainInstances - a dataframe including all train instances
# input parameter - yTrainCategories - a dataframe including all train categories 
# input parameter - distanceMetric - the name of the distance function, not as a string
# ---- distanceMetric - options are: euclideanDist ; manhattanDist
# input parameter - k - the number of Nearest Neighbors (we select the majority out of k votes)
# The returned value is the predicted category (1 or 0): the majority vote among
#                 the k nearest train instances (a tie is resolved as 1)

def predict(xSeriesTestVector, xTrainInstances,yTrainCategories,distanceMetric,k):
    """Predict the category of one test instance by a majority vote of its k nearest train instances.

    Parameters
    ----------
    xSeriesTestVector : Series holding the test instance.
    xTrainInstances   : dataframe holding all train instances.
    yTrainCategories  : dataframe of train categories, row-aligned with xTrainInstances;
                        the last column is read as the category.
    distanceMetric    : the distance function itself (e.g. euclideanDist or manhattanDist).
    k                 : number of nearest neighbours that vote.

    Returns
    -------
    1 if at least half of the k nearest neighbours have category 1, otherwise 0.

    Raises
    ------
    ValueError
        If k is smaller than 1 or larger than the number of train instances.
    """
    trainCount = len(xTrainInstances)
    if k < 1 or k > trainCount:
        raise ValueError('k must be between 1 and the number of train instances (%d), got %r' % (trainCount, k))

    distances = calcDistances(xSeriesTestVector, xTrainInstances,distanceMetric)
    # default argsort, as before, so the neighbour order (including ties) is unchanged
    nearestRows = np.argsort(distances)[:k]

    votes = yTrainCategories.iloc[nearestRows, -1]
    positiveVotes = int((votes == 1).sum())
    negativeVotes = k - positiveVotes

    # a tie is resolved in favour of category 1
    if positiveVotes >= negativeVotes:
        category = 1
    else:
        category = 0
    return category

In [22]:
# Feature Scaling
## We will be using standardscaler to transform
# This cell compares k=3 predictions for one test instance, with and without scaling.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
xTitanicInstances_5Features = xTitanicInstances[['Pclass','SibSp','Parch','Fare','Age']]
xTitanicInstances_5Features_40 = xTitanicInstances_5Features.iloc[:40]
yTitanicCategories_40 = yTitanicCategories.iloc[:40]
# a single test instance: the 7th row from the end
xTitanicInstances_5Features_TestInst = xTitanicInstances_5Features.iloc[-7:-6]
## transforming "train_x"
# NOTE(review): xTrainScaled gets default integer column labels (0..4), while xTestScaled
# below is labelled by feature name, so the two only line up by position, not by label.
xTrainScaled = pd.DataFrame(sc.fit_transform(xTitanicInstances_5Features_40),index=xTitanicInstances_5Features_40.index)

## transforming "test_x"
xTestScaled = pd.Series(sc.transform(xTitanicInstances_5Features_TestInst)[0],index=xTitanicInstances_5Features_40.columns)

# measure distance without scaling:
manhattanDist_k3 = predict(xTitanicInstances_5Features_TestInst.iloc[-1],xTitanicInstances_5Features_40,yTitanicCategories_40,manhattanDist,3)
euclideanDist_k3 = predict(xTitanicInstances_5Features_TestInst.iloc[-1],xTitanicInstances_5Features_40,yTitanicCategories_40,euclideanDist,3)
assert manhattanDist_k3 == 0,"wrong value for knn, with k=3, dist='manhattan'"
assert euclideanDist_k3 == 1,"wrong value for knn, with k=3, dist='euclidean'"
print ('manhattanDist,k=3, inst=-%d, chosen cat: %d' %(7,manhattanDist_k3))
print ('euclideanDist,k=3, inst=-%d, chosen cat: %d' %(7,euclideanDist_k3))
# measure distance with scaling:
manhattanDist_scaled_k3 = predict(xTestScaled,xTrainScaled,yTitanicCategories_40,manhattanDist,3)
euclideanDist_scaled_k3 = predict(xTestScaled,xTrainScaled,yTitanicCategories_40,euclideanDist,3)
assert manhattanDist_scaled_k3 == 1,"wrong value for (scaled) knn, with k=3, dist='manhattan'"
assert euclideanDist_scaled_k3 == 1,"wrong value for (scaled) knn, with k=3, dist='euclidean'"
# each line reports the result of the metric named in its own label
print ('(scaled) manhattanDist,k=3, inst=-%d, chosen cat: %d' %(7,manhattanDist_scaled_k3))
print ('(scaled) euclideanDist,k=3, inst=-%d, chosen cat: %d' %(7,euclideanDist_scaled_k3))
print ('---------------------------------------------------')



manhattanDist,k=3, inst=-7, chosen cat: 0
euclideanDist,k=3, inst=-7, chosen cat: 1
(scaled) manhattanDist,k=3, inst=-7, chosen cat: 1
(scaled) euclideanDist,k=3, inst=-7, chosen cat: 1
---------------------------------------------------


In [23]:
def evaluate_accuracy(xTestInstances,xTrainInstances,yTrainCategories,yTestCategories,distanceMetric,k):
    """Return the fraction of test instances whose predicted category equals the true category.

    Parameters
    ----------
    xTestInstances   : dataframe holding the test instances.
    xTrainInstances  : dataframe holding the train instances.
    yTrainCategories : dataframe of train categories, row-aligned with xTrainInstances.
    yTestCategories  : dataframe of test categories; each test instance is looked up in it
                       by its index label, and the last column is read as the category.
    distanceMetric   : the distance function itself (e.g. euclideanDist or manhattanDist).
    k                : number of nearest neighbours used by predict.

    Returns
    -------
    float between 0 and 1: correct predictions divided by the number of test instances.

    Raises
    ------
    ValueError
        If xTestInstances has no rows (accuracy is undefined).
    """
    testCount = len(xTestInstances)
    if testCount == 0:
        raise ValueError('cannot evaluate accuracy on an empty test set')

    # the index labels are read once, instead of being rebuilt for every test row
    testLabels = list(xTestInstances.index.values)

    correctCount = 0
    for position, label in enumerate(testLabels):
        predictedCategory = predict(xTestInstances.iloc[position], xTrainInstances, yTrainCategories,distanceMetric,k)
        # assumes the index labels of yTestCategories are unique, so .loc yields one row
        actualCategory = yTestCategories.loc[label].iloc[-1]
        if predictedCategory == actualCategory:
            correctCount += 1
    return correctCount / testCount

In [24]:
# Measure the k=3 accuracy on a train/test sample, once unscaled (manhattan)
# and once scaled (euclidean).
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

xTitanicInstances_5Features = xTitanicInstances[['Pclass','SibSp','Parch','Fare','Age']]
# train sample: the first 100 instances
xTitanicInstances_5Features_100 = xTitanicInstances_5Features.iloc[:100]
yTitanicCategories_100 = yTitanicCategories.iloc[:100]
# test sample: despite the "_10" suffix, these hold the last 20 rows
xTitanicInstances_5Features_Test_10 = xTitanicInstances_5Features.iloc[-20:]
yTitanicCategories_Test_10 = yTitanicCategories.iloc[-20:]
## scale:
# the scaler is fitted on the train sample only and then applied to the test sample
xTrainScaled = pd.DataFrame(sc.fit_transform(xTitanicInstances_5Features_100),index=xTitanicInstances_5Features_100.index)
xTestScaled = pd.DataFrame(sc.transform(xTitanicInstances_5Features_Test_10),index=xTitanicInstances_5Features_Test_10.index)

acc_manhattan_3 = evaluate_accuracy(xTitanicInstances_5Features_Test_10,xTitanicInstances_5Features_100,yTitanicCategories_100,yTitanicCategories_Test_10,manhattanDist,3)
acc_scaled_euclidean_3 = evaluate_accuracy(xTestScaled,xTrainScaled,yTitanicCategories_100,yTitanicCategories_Test_10,euclideanDist,3)

# the expected accuracies are compared with exact float equality
assert acc_manhattan_3 == 0.45,"wrong value for accuracy of knn, with k=3, dist='manhattan'"
assert acc_scaled_euclidean_3 == 0.60,"wrong value for accuracy of (Scaled) knn, with k=3, dist='euclidean'"
print ('manhattanDist,k=3, acc: %f' %(acc_manhattan_3))
print ('(Scaled) euclideanDist,k=3, acc: %f' %(acc_scaled_euclidean_3))

manhattanDist,k=3, acc: 0.450000
(Scaled) euclideanDist,k=3, acc: 0.600000
