In [114]:
import numpy as np
import pandas as pd

In [115]:
# Explicit column -> dtype mapping for the CSV loads, so pandas does not have
# to infer types. 'id', 'zipcode' and 'date' are read as strings.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [116]:
# Read the full data set plus its train / test / validation splits, all with
# the same explicit column dtypes. Files are read in the order listed.
data, train_data, test_data, valid_data = (
    pd.read_csv(csv_name, dtype=dtype_dict)
    for csv_name in (
        "kc_house_data_small.csv",
        "kc_house_data_small_train.csv",
        "kc_house_data_small_test.csv",
        "kc_house_data_validation.csv",
    )
)

In [117]:
# Columns used as model inputs. The order here fixes the column order of the
# feature matrices built from these names.
feature_list = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
    'lat',
    'long',
    'sqft_living15',
    'sqft_lot15',
]

In [118]:
# Keep only the model input columns from each split (train, test, validation).
feature_train, feature_test, feature_valid = (
    split[feature_list] for split in (train_data, test_data, valid_data)
)

In [119]:
# Sanity check: (rows, columns) of the selected training features.
feature_train.shape

(5527, 17)

In [120]:
# Convert each feature set to a NumPy array; the same names are rebound to the
# converted arrays.
feature_train, feature_test, feature_valid = map(
    np.array, (feature_train, feature_test, feature_valid)
)

In [121]:
# Prepend a constant column of 1s at column index 0 of every feature matrix
# (presumably the intercept / constant feature).
#
# The matrices are rebuilt from the source DataFrames instead of from the
# current value of feature_*: the previous form `x = np.insert(x, ...)` added
# one more column of 1s every time this cell was re-run.
feature_train = np.insert(np.array(train_data[feature_list]), 0, 1, axis=1)
feature_test = np.insert(np.array(test_data[feature_list]), 0, 1, axis=1)
feature_valid = np.insert(np.array(valid_data[feature_list]), 0, 1, axis=1)

In [122]:
# Shape check after np.insert added one column at index 0.
feature_train.shape

(5527L, 18L)

In [123]:
# Training target as an (n_rows, 1) NumPy column vector.
# Series.reshape was deprecated and later removed from pandas, so reshape the
# underlying ndarray (.values) instead; the resulting array is the same.
output = train_data['price'].values.reshape(-1, 1)

In [124]:
def normalize_features(features):
    """Scale each column of ``features`` to unit Euclidean (L2) norm.

    Parameters
    ----------
    features : array-like
        Feature matrix; norms are taken along axis 0, i.e. one norm per
        column.

    Returns
    -------
    (normalized_features, norms) : tuple
        ``normalized_features`` is ``features`` divided column-wise by
        ``norms``. ``norms`` holds the per-column L2 norms, except that a
        norm of exactly 0 is reported as 1.0 so that the same array can be
        used to scale other data without dividing by zero.
    """
    norms = np.linalg.norm(features, axis=0)
    # An all-zero column has norm 0; dividing by it yields NaN (0/0), which
    # would then propagate into every distance computed from the result.
    # Dividing by 1 instead leaves such a column as all zeros.
    norms = np.where(norms == 0, 1.0, norms)
    normalized_features = features / norms
    return (normalized_features, norms)

In [128]:
# p: training features with every column scaled by its norm
# q: the per-column norms returned for the training features
# n: test features divided by those same training norms (q)
(p, q) = normalize_features(feature_train)
n = np.divide(feature_test, q)

In [129]:
# Euclidean distance between normalized test row 0 (n[0]) and normalized
# training row 9 (p[9]).
np.sqrt(np.sum((n[0]-p[9])**2))

0.059723593713980783

In [130]:
# Display the test features after division by the training-set column norms (q).
n

array([[ 0.01345102,  0.01551285,  0.01807473, ..., -0.01346922,
         0.01375926,  0.0016225 ],
       [ 0.01345102,  0.01551285,  0.00602491, ..., -0.0134657 ,
         0.01035159,  0.00174419],
       [ 0.01345102,  0.01163464,  0.01054359, ..., -0.01346141,
         0.00977293,  0.00252907],
       ..., 
       [ 0.01345102,  0.00775643,  0.01355605, ..., -0.01343598,
         0.00797265,  0.00030422],
       [ 0.01345102,  0.01551285,  0.02108718, ..., -0.01344908,
         0.01832425,  0.00231531],
       [ 0.01345102,  0.01163464,  0.01054359, ..., -0.01342817,
         0.00848702,  0.00458478]])

In [131]:
# Euclidean distance from normalized test row 0 to each of the first 10
# normalized training rows, keyed by training row index.
dic = dict()
for i in range(10):
    d = np.sqrt(((n[0] - p[i]) ** 2).sum())
    dic[i] = d

In [132]:
# Key of dic (a training row index) whose stored distance is smallest.
min(dic, key=dic.get)

8

In [133]:
# Euclidean distance from normalized test row 2 to every normalized training
# row, keyed by training row index.
dic = dict()
for i in range(feature_train.shape[0]):
    d = np.sqrt(((n[2] - p[i]) ** 2).sum())
    dic[i] = d

In [134]:
# Training row index with the smallest distance stored in dic.
min(dic, key=dic.get)

382

In [137]:
# Price of training row 382, the index reported by the nearest-row search.
output[382]

array([ 249000.])

In [141]:
def compute_distances(features_instances, features_query):
    """Euclidean distance from ``features_query`` to each row of ``features_instances``.

    The query is subtracted from the instances, the differences are squared
    and summed along axis 1, and the square root of each sum is returned
    (one distance per row).
    """
    offsets = features_instances - features_query
    squared_totals = (offsets ** 2).sum(axis=1)
    return np.sqrt(squared_totals)

In [142]:
# Distances from every normalized training row (p) to normalized test row 2,
# computed in one vectorized call.
compute_distances(p,n[2])

array([ 0.01954476,  0.06861035,  0.02165079, ...,  0.02433478,
        0.02622734,  0.02637942])

In [148]:
def nearest_neighbors(feature_train, features_query):
    """Return the row index of the instance in ``feature_train`` closest to the query.

    Distances come from ``compute_distances``; the index that sorts first in
    ascending distance order is returned.
    """
    ranking = np.argsort(compute_distances(feature_train, features_query))
    return ranking[0]

In [150]:
# Nearest neighbour of normalized test row 0 among the first 10 normalized
# training rows.
nearest_neighbors(p[0:10],n[0])

8

In [145]:
def k_nearest_neighbors(k, feature_train, features_query):
    """Return the row indices of the ``k`` instances closest to the query.

    Rows of ``feature_train`` are ranked by ``compute_distances`` in ascending
    order and the first ``k`` indices of that ranking are returned (closest
    first).
    """
    order = np.argsort(compute_distances(feature_train, features_query))
    return order[:k]

In [151]:
# Indices of the 4 normalized training rows nearest to normalized test row 2.
d=k_nearest_neighbors(4,p,n[2])

In [152]:
# Show the neighbour indices (ordered closest first).
d

array([ 382, 1149, 4087, 3142], dtype=int64)

In [153]:
# Rows of the price column selected by the neighbour index array d.
output[d]

array([[ 249000.],
       [ 477000.],
       [ 499950.],
       [ 430000.]])

In [154]:
# k-NN prediction: the mean price of the selected neighbours.
np.mean(output[d])

413987.5

In [156]:
# For each of the first 10 test rows, predict a price as the mean price of its
# 10 nearest training rows, then report the test row index with the lowest
# prediction.
dic = {}
for i in range(10):
    d = k_nearest_neighbors(10, p, n[i])
    price = np.mean(output[d])
    dic[i] = price
# print(...) with parentheses runs on both Python 2 and Python 3; the bare
# `print x` statement form is a SyntaxError on Python 3.
print(min(dic, key=dic.get))

6
