In [5]:
"""
k-Nearest Neighbours Regression
"""

# Imports
import numpy as np

# Functions
def get_data(data_frame, features, output):
    """
    Purpose: Extract features and prepare a feature matrix
             Set the first feature x0 = 1
    Input  : Original Dataframe, list of feature variables, output variable
             (pass None as the output variable when there is no output column)
    Output : Feature matrix array (constant column first), 1-D output array
             (an empty array when the output variable is None)
    Note   : The input Dataframe is left unmodified; the constant column is
             added to a copy only.
    """
    # assign() returns a new frame, so the caller's frame does not silently
    # gain a 'constant' column as a side effect of extracting features.
    augmented = data_frame.assign(constant=1.0)
    columns = ['constant'] + list(features)
    features_matrix = np.array(augmented[columns])
    if output is None:
        # No output requested: return an empty array rather than a plain
        # list, which has no reshape() and used to raise AttributeError.
        output_array = np.array([])
    else:
        output_array = np.array(augmented[output])
        # Flatten an (n, 1) selection (e.g. output given as ['price']) to (n,).
        output_array = output_array.reshape(len(output_array))
    return (features_matrix, output_array)

def normalize(features):
    """
    Purpose: Scale every column (feature) of a matrix by its 2-norm
    Input  : Unnormalized feature matrix, one feature per column
    Output : Tuple of (normalized feature matrix, per-column norms)
    """
    # The norms are returned as well so the same scaling can be re-applied
    # to other matrices with the same columns.
    column_norms = np.linalg.norm(features, axis=0)
    scaled = features / column_norms
    return (scaled, column_norms)

def compute_distances(features_instances, features_query):
    """
    Purpose: Compute the Euclidean distance from a single query point to
             every row of a feature matrix
    Input  : Normalized training features (one instance per row) and the
             normalized features of one query
    Output : Array with one distance per training instance
    """
    # Broadcasting subtracts the query from each training row.
    offsets = features_instances - features_query
    squared_lengths = np.sum(np.square(offsets), axis=1)
    return np.sqrt(squared_lengths)
    
def k_nearest_neighbors(k, features_train, features_query):
    """
    Purpose: Find the k training instances closest to one query
    Input  : Number of neighbours, normalized training features
             and the normalized features of one query
    Output : Row indices into the training features of the k nearest
             instances, ordered from closest to farthest
    """
    # argsort ranks training rows by increasing distance to the query.
    ranking = np.argsort(compute_distances(features_train, features_query))
    return ranking[:k]
    
def predict_output_of_query(k, features_train, output_train, features_query):
    """
    Purpose: Predict the output for one query as the average output of its
             k nearest training neighbours
    Input  : Number of neighbours, normalized training features,
             training output and the normalized features of one query
    Output : Predicted output for the query
    """
    neighbor_idx = k_nearest_neighbors(k, features_train, features_query)
    neighbor_outputs = output_train[neighbor_idx]
    return np.mean(neighbor_outputs)

def predict(k, features_train, output_train, features_query):
    """
    Purpose: Predict the outputs for multiple queries
    Input  : Number of neighbours, normalized training features,
             training output and normalized query features (one query
             per row)
    Output : Array of predictions, one per query row
    """
    n_queries = features_query.shape[0]
    # dtype=float keeps the result a float array even when there are no
    # queries, matching an array pre-allocated with np.zeros.
    return np.array(
        [predict_output_of_query(k, features_train, output_train,
                                 features_query[row])
         for row in range(n_queries)],
        dtype=float)

def get_residual_sum_of_squares(predictions, output):
    """
    Purpose: Compute the residual sum of squares (RSS) of a set of
             predictions
    Input  : Predicted outputs and actual outputs
    Output : Sum of the squared differences between the two
    """
    errors = predictions - output
    return np.sum(np.square(errors))

In [6]:
import numpy as np
import pandas as pd
#import k_nearest_neighbours_regression as knn

# Column dtypes for the house sales CSV files, grouped by type
dtype_dict = {
    # identifiers and other text-like columns
    'id': str, 'date': str, 'zipcode': str,
    # real-valued columns
    'price': float, 'bedrooms': float, 'bathrooms': float, 'floors': float,
    'sqft_living': float, 'sqft_living15': float, 'sqft_lot15': float,
    'lat': float, 'long': float,
    # integer-valued columns
    'sqft_lot': int, 'sqft_above': int, 'sqft_basement': int,
    'waterfront': int, 'view': int, 'condition': int, 'grade': int,
    'yr_built': int, 'yr_renovated': int,
}

# Read the training, validation and test splits of the house sales data
train, valid, test = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in ('kc_house_data_small_train.csv',
                     'kc_house_data_validation.csv',
                     'kc_house_data_small_test.csv')
)

# Features used by the model; the order fixes the column order of the
# feature matrices (after the leading constant column)
feature_list = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
    'floors', 'waterfront', 'view', 'condition',
    'grade', 'sqft_above', 'sqft_basement', 'yr_built',
    'yr_renovated', 'lat', 'long', 'sqft_living15',
    'sqft_lot15',
]

# Extract feature matrices and price vectors for every split
((features_train, output_train),
 (features_test, output_test),
 (features_valid, output_valid)) = (
    get_data(frame, feature_list, 'price') for frame in (train, test, valid)
)

# Normalize the training features, then scale the other splits with the
# training norms so all three share the same feature scaling
norm_features_train, norms = normalize(features_train)
norm_features_valid = features_valid / norms
norm_features_test = features_test / norms


In [7]:
# Euclidean distance between the first test query (row 0) and the 10th
# training instance (row 9), both in normalized feature space.
dist_q_test_10 = np.sqrt(np.sum((norm_features_test[0] - norm_features_train[9]) ** 2.0))
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a syntax error on Python 3.
print(dist_q_test_10)

0.059723593714


In [20]:
# Inspect the test feature matrix after scaling by the training-set column norms.
norm_features_test

array([[ 0.01345102,  0.01551285,  0.01807473, ..., -0.01346922,
         0.01375926,  0.0016225 ],
       [ 0.01345102,  0.01551285,  0.00602491, ..., -0.0134657 ,
         0.01035159,  0.00174419],
       [ 0.01345102,  0.01163464,  0.01054359, ..., -0.01346141,
         0.00977293,  0.00252907],
       ..., 
       [ 0.01345102,  0.00775643,  0.01355605, ..., -0.01343598,
         0.00797265,  0.00030422],
       [ 0.01345102,  0.01551285,  0.02108718, ..., -0.01344908,
         0.01832425,  0.00231531],
       [ 0.01345102,  0.01163464,  0.01054359, ..., -0.01342817,
         0.00848702,  0.00458478]])