In [319]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [320]:
sales=pd.read_csv('data_set/kc_house_data_small.csv')
sales.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [321]:
# Split the data into training,testing and Validation set
train_data=pd.read_csv('data_set/kc_house_data_small_train.csv')
test_data=pd.read_csv('data_set/kc_house_data_small_test.csv')
validation_data=pd.read_csv('data_set/kc_house_data_validation.csv')
train_data.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [322]:
# Functoion to return the feature matrix along with the output_array
def get_numpy_data(data_set,features,output):
    data_set['constant']=1
    features=['constant']+features
    feature_matrix=np.array(data_set[features])
    output_array=np.array(data_set[output])
    return feature_matrix,output_array

In [323]:
# Given a feature matrix divide each column by its 2-norm
def normalize_feature(features):
    norms=np.linalg.norm(features,axis=0)
    normalized_features=features/norms
    return normalized_features,norms

In [324]:
features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'lat', 'long', 'sqft_living15', 'sqft_lot15']

In [325]:
train_feature_matrix,train_output=get_numpy_data(train_data,features,'price')
test_feature_matrix,test_output=get_numpy_data(test_data,features,'price')
validation_feature_matrix,validation_output=get_numpy_data(validation_data,features,'price')

In [326]:
train_feature_matrix

array([[ 1.00000e+00,  3.00000e+00,  1.00000e+00, ..., -1.22257e+02,
         1.34000e+03,  5.65000e+03],
       [ 1.00000e+00,  3.00000e+00,  2.25000e+00, ..., -1.22319e+02,
         1.69000e+03,  7.63900e+03],
       [ 1.00000e+00,  2.00000e+00,  1.00000e+00, ..., -1.22233e+02,
         2.72000e+03,  8.06200e+03],
       ...,
       [ 1.00000e+00,  3.00000e+00,  2.50000e+00, ..., -1.21881e+02,
         2.27000e+03,  5.73100e+03],
       [ 1.00000e+00,  4.00000e+00,  2.50000e+00, ..., -1.22167e+02,
         2.52000e+03,  6.02300e+03],
       [ 1.00000e+00,  3.00000e+00,  2.50000e+00, ..., -1.22346e+02,
         1.53000e+03,  1.50900e+03]])

### NOTE :
It is crucial to normalize features. Otherwise, for example, the ‘sqft_living’ feature (typically on the order of thousands) would exert a much larger influence on distance than the ‘bedrooms’ feature (typically on the order of ones). We divide each column of the training feature matrix by its 2-norm, so that the transformed column has unit norm.

In [327]:
train_feature_matrix,train_norms=normalize_feature(train_feature_matrix)
test_feature_matrix=test_feature_matrix/train_norms
validation_feature_matrix=validation_feature_matrix/train_norms

In [328]:
train_feature_matrix

array([[ 0.01345102,  0.01163464,  0.00602491, ..., -0.01345623,
         0.00861561,  0.00229178],
       [ 0.01345102,  0.01163464,  0.01355605, ..., -0.01346306,
         0.01086596,  0.00309857],
       [ 0.01345102,  0.00775643,  0.00602491, ..., -0.01345359,
         0.0174884 ,  0.00327015],
       ...,
       [ 0.01345102,  0.01163464,  0.01506227, ..., -0.01341485,
         0.0145951 ,  0.00232464],
       [ 0.01345102,  0.01551285,  0.01506227, ..., -0.01344633,
         0.01620249,  0.00244308],
       [ 0.01345102,  0.01163464,  0.01506227, ..., -0.01346603,
         0.00983723,  0.00061209]])

## 1.
To start, let's just explore computing the “distance” between two given houses. We will take our query house to be the first house of the test set and look at the distance between this house and the 10th house of the training set.

In [329]:
print(test_feature_matrix[0].ndim,train_feature_matrix[9])

1 [ 0.01345102  0.01163464  0.00602491  0.0083488   0.00050756  0.01279425
  0.          0.          0.01938684  0.01390535  0.0096309   0.
  0.01302544  0.          0.01346821 -0.01346251  0.01195898  0.00156612]


In [330]:
def euclidean_distance(xnn,xquery):
    sub_data=(xnn-xquery)**2
   # sub_data=sub_data.reshape(1,-1)
    sum_data=np.sum(sub_data)
    euclidean_dist=np.sqrt(sum_data)
    return euclidean_dist

### Quiz Question: What is the Euclidean distance between the query house and the 10th house of the training set?

In [331]:
euclid_dist=euclidean_distance(train_feature_matrix[9],test_feature_matrix[0])
euclid_dist

0.05972359371398078

## 2.
To visualize this nearest-neighbor search, let's first compute the distance from our query house (features_test[0]) to the first 10 houses of the training set (features_train[0:10]) and then search for the nearest neighbor within this small set of houses. Through restricting ourselves to a small set of houses to begin with, we can visually scan the list of 10 distances to verify that our code for finding the nearest neighbor is working.

In [332]:
euclid10_dist=euclidean_distance(train_feature_matrix[0],test_feature_matrix[0])
euclid10_dist

0.06027470916295592

In [333]:
def ten_nn_distance(data_set,query_data):
    euclid_distances=[]
    for i in range(0,10):
        euclid_distances.append(euclidean_distance(data_set[i],query_data[0]))
    return euclid_distances

### Quiz Question: Among the first 10 training houses, which house is the closest to the query house?

In [334]:
euclidean_distances=ten_nn_distance(train_feature_matrix,test_feature_matrix)
euclidean_distances
euclidean_distances.index(min(euclidean_distances))

8

In [335]:
print(train_feature_matrix[0:3] - test_feature_matrix[0])

[[ 0.00000000e+00 -3.87821276e-03 -1.20498190e-02 -1.05552733e-02
   2.08673616e-04 -8.52950206e-03  0.00000000e+00 -5.10236549e-02
   0.00000000e+00 -3.47633726e-03 -5.50336860e-03 -2.48168183e-02
  -1.63756198e-04  0.00000000e+00 -1.70254220e-05  1.29876855e-05
  -5.14364795e-03  6.69281453e-04]
 [ 0.00000000e+00 -3.87821276e-03 -4.51868214e-03 -2.26610387e-03
   7.19763456e-04  0.00000000e+00  0.00000000e+00 -5.10236549e-02
   0.00000000e+00 -3.47633726e-03  1.30705004e-03 -1.45830788e-02
  -1.91048898e-04  6.65082271e-02  4.23090220e-05  6.16364736e-06
  -2.89330197e-03  1.47606982e-03]
 [ 0.00000000e+00 -7.75642553e-03 -1.20498190e-02 -1.30002801e-02
   1.60518166e-03 -8.52950206e-03  0.00000000e+00 -5.10236549e-02
   0.00000000e+00 -5.21450589e-03 -8.32384500e-03 -2.48168183e-02
  -3.13866046e-04  0.00000000e+00  4.70885840e-05  1.56292487e-05
   3.72914476e-03  1.64764925e-03]]


In [336]:
subtr_data=(train_feature_matrix[0:9] - test_feature_matrix[0])**2
#subtr_data=subtr_data.reshape(1,-1)
sum_data=subtr_data.sum(axis=1)
np.sqrt(sum_data)

array([0.06027471, 0.08546881, 0.06149946, 0.05340274, 0.05844484,
       0.05987922, 0.0546314 , 0.05543108, 0.05238363])

In [337]:
diff=train_feature_matrix - test_feature_matrix[0] 
diff[-1].sum()

-0.09343399874654643

In [338]:
distances=np.sqrt(np.sum(diff**2,axis=1))

In [339]:
distances[100]

0.023708232416678195

In [340]:
np.sum(diff[15]**2)

0.0033070590284564453

In [341]:
def compute_distances(features_instances,features_query):
    diff=features_instances - features_query
    distances=np.sqrt(np.sum(diff**2,axis=1))
    return distances

### Quiz Question: Take the query house to be third house of the test set (features_test[2]). What is the index of the house in the training set that is closest to this query house?

In [342]:
distances=compute_distances(train_feature_matrix,test_feature_matrix[2])
distances.argmin()

382

### Quiz Question: What is the predicted value of the query house based on 1-nearest neighbor regression?

In [347]:
train_output[382]

249000

# Perform k-nearest neighbor regression

In [348]:
def k_nearest_neighbors(k,feature_train,features_query):
    distances=compute_distances(feature_train,features_query)
    element=1
    neighbors=[]
    
    distances_list=list(distances)
    indices=np.argsort(distances)
    return indices[0:k+1]

In [349]:
indx=k_nearest_neighbors(2,train_feature_matrix,test_feature_matrix[2])
indx

array([ 382, 1149, 4087], dtype=int64)

### Quiz Question: Take the query house to be third house of the test set (features_test[2]). What are the indices of the 4 training houses closest to the query house?

In [350]:
indx=k_nearest_neighbors(4,train_feature_matrix,test_feature_matrix[2])
indx

array([ 382, 1149, 4087, 3142, 2751], dtype=int64)

In [351]:
def predict_output_of_query(k,features_train,train_output,features_query):
    indx=k_nearest_neighbors(k,features_train,features_query)
    prices=[]
    for i in indx:
        prices.append(train_output[i])
    prediction=np.mean(prices)
    return prediction

### Quiz Question: Again taking the query house to be third house of the test set (features_test[2]), predict the value of the query house using k-nearest neighbors with k=4 and the simple averaging method described and implemented above.

In [352]:
price=predict_output_of_query(4,train_feature_matrix,train_output,test_feature_matrix[2])
price

381190.0

## 3. 
 Finally, write a function to predict the value of each and every house in a query set. (The query set can be any subset of the dataset, be it the test set or validation set.) The idea is to have a loop where we take each house in the query set as the query house and make a prediction for that specific house. 

In [353]:
def predict_output(k,features_train,train_output,features_query):
    predictions=[]
    for feature_query in features_query:
        predictions.append(predict_output_of_query(k,features_train,train_output,feature_query))
    return predictions

### Quiz Question: Make predictions for the first 10 houses in the test set, using k=10. What is the index of the house in this query set that has the lowest predicted value? What is the predicted value of this house?

In [354]:
predict_output(10,train_feature_matrix,train_output,test_feature_matrix[0:10])

[939818.1818181818,
 476690.9090909091,
 456904.54545454547,
 442000.0,
 757045.4545454546,
 666063.6363636364,
 357301.8181818182,
 502541.54545454547,
 465450.0,
 451940.9090909091]

In [355]:
def rss_value(predict,output):
    diff=(output-predict)**2
    rss=diff.sum()
    return rss

In [356]:
rss=[]
for k in range(1,16):
    predictions=predict_output(k,train_feature_matrix,train_output,validation_feature_matrix)
    rss.append(rss_value(predictions,validation_output))
rss.index(min(rss))

6

In [357]:
predictions=predict_output(12,train_feature_matrix,train_output,test_feature_matrix)

In [358]:
rss_value(predictions,test_output)

132766702659606.86