In [45]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
import sklearn.neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics
from sklearn.preprocessing import scale
from collections import Counter
from sklearn.metrics import mean_absolute_error 
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt 

In [35]:
headers = ["carat",	"cut","color","clarity","depth","table","price","x","y","z"]
data = pd.read_csv('diamonds.csv', na_values='?',    
         header=None,  names = headers) 
data = data.reset_index(drop=True)
data = data.iloc[1:]
# data.describe()

In [36]:
#encode
map_cut = {"Fair":1, "Good":2, "Very Good" : 3, "Premium" : 4, "Ideal" : 5}
map_color = {"J": 1,"I":2, "H":3, "G":4, "F":5, "E":6, "D":7}
map_clarity = {"I1":1, "SI2":2, "SI1":3, "VS2":4, "VS1":5, "VVS2":6, "VVS1":7, "IF":8}

data["map_cut"] = data.cut.map(map_cut)
data["map_color"] = data.color.map(map_color)
data["map_clarity"] = data.clarity.map(map_clarity)

data = data.drop(data.columns[[1,2,3]], axis=1) 
data = data.apply(pd.to_numeric) 
data["table"] = data['table'].astype(float) 

In [37]:
data = data[:1000]

In [38]:
#normalize
def normalize_data(col):
    max_val = col.max()
    min_val = col.min()
    for i in range(len(col)):
        col[i] = (col[i]-min_val)/(max_val-min_val)
    return col

col_to_normalize = ["carat","depth","table", "x","y", "z"]
for col in col_to_normalize:
    data[col] = normalize_data(data[col].values)

In [39]:
# Separate X and Y data
Y = data.price
X = data.drop('price', axis=1)

#Split test train data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,random_state=1)

# Changing the index of the records to sequential
X_train.index=range(len(X_train))
Y_train.index=range(len(X_train))
X_test.index=range(len(X_test))
Y_test.index=range(len(Y_test))

In [41]:
#eucledian
def eucledianDistNeighbours(X_train,Y_train,X_test,K):
    distance=[]
    for i in range(len(X_train)):
        eDistance=0
        for j in range(len(X_train.columns)):   
            eDistance+=round(pow((X_train.iloc[i,j]-X_test[j]),2))
        eDistance = np.sqrt(eDistance)
        distance.append((eDistance,i,Y_train.iloc[i]))
        distance=sorted(distance, key=lambda x: x[0])[0:K]
    return distance

# hamming
def hammingDistNeighbours(X_train,Y_train,X_test,K):
    distance=[]
    for i in range(len(X_train)):
        hDistance=0
        for j in range(len(X_train.columns)):   
            hDistance+=round(np.sqrt(pow((X_train.iloc[i,j]-X_test[j]),2)),2)
        distance.append((hDistance,i,Y_train.iloc[i]))
        distance=sorted(distance, key=lambda x: x[0])[0:K]
    return distance

# Manhattan 
def manhattanDistNeighbours(X_train,Y_train,X_test,K):
    distance=[]
    for i in range(len(X_train)):
        mDistance=0
        for j in range(len(X_train.columns)):   
            mDistance+=round(abs(X_train.iloc[i,j]-X_test[j]),2)
        distance.append((mDistance,i,Y_train.iloc[i]))
        distance=sorted(distance, key=lambda x: x[0])[0:K]
    return distance

# Predict the output of the numeric variables based on K nearest neighbours
# Output is the mean of the K nearest neighbours
def predict(X_train,Y_train,X_test,K, dist):
    neighbours=[]
    result=[]
    if(dist=="eucledian"):
        for i in range(len(X_test)):
            neighbours.append(eucledianDistNeighbours(X_train,Y_train,X_test.iloc[i,:],K))
    elif(dist=="manhattan"):
        for i in range(len(X_test)):
            neighbours.append(manhattanDistNeighbours(X_train,Y_train,X_test.iloc[i,:],K))
    elif(dist=="hamming"):
        for i in range(len(X_test)):
            neighbours.append(hammingDistNeighbours(X_train,Y_train,X_test.iloc[i,:],K))
    
    for i in neighbours:
        mean=0
        for j in i:
            mean+=j[-1]
        mean=mean/K
        result.append(mean)
    return result

In [42]:
# Final training
output=predict(X_train,Y_train,X_test,5, "eucledian")

print('MAE from the code: {:^0.2f}'.format(mean_absolute_error(Y_test,output) ,2))

MAE from the code: 70.45


In [None]:
# 1b) Do we need to normalise data? [If so Does it make any difference?].

In [46]:
# Experiment with different distance measures[Euclidean distance, Manhattan distance, Hamming Distance] to handle categorical attributes.
eucledian_output = predict(X_train,Y_train,X_test,5, "eucledian")
manhattan_output = predict(X_train,Y_train,X_test,5, "manhattan")
# hamming_output = predict(X_train,Y_train,X_test,5, "hamming")

def getAllErrors(actual, predicted):
    errors = []
    errors.append(mean_squared_error(actual, predicted, squared=False))
    errors.append(mean_absolute_error(actual, predicted))
    errors.append(r2_score(actual, predicted))
    return errors


eucledianError = getAllErrors(Y_test, eucledian_output)
manhattanError = getAllErrors(Y_test, manhattan_output)
# hammingError = getAllErrors(Y_test, hamming_output)

print (eucledianError)

    

[183.0168680750493, 70.45199999999998, 0.9471838167513922]


In [58]:
headers = ["Eucledian", "Manhattan"]
# headers = ["Eucledian", "Manhattan", "hamming"]
index = ["RMSE", "MAE", "R2-score"]
hammingError = []
list_of_tuples = list(zip(eucledianError, manhattanError))
# list_of_tuples = list(zip(eucledianError, manhattanError, hammingError))
df = pd.DataFrame(list_of_tuples, columns = headers, index=index)
print (df)


           Eucledian   Manhattan
RMSE      183.016868  183.016868
MAE        70.452000   70.452000
R2-score    0.947184    0.947184
