# Homework 6

In [1]:
import numpy as np
import pandas as pd

In [2]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [4]:
sales = pd.read_csv('./kc_house_data_small.csv', dtype = dtype_dict)
train = pd.read_csv('./kc_house_data_small_train.csv', dtype = dtype_dict)
test = pd.read_csv('./kc_house_data_small_test.csv', dtype = dtype_dict)
validation = pd.read_csv('./kc_house_data_validation.csv', dtype = dtype_dict)

In [10]:
def get_numpy_data(data_sframe, features, output):
    data_sframe['constant'] = 1 # add a constant column to an SFrame
    # prepend variable 'constant' to the features list
    features = ['constant'] + features
    # select the columns of data_SFrame given by the ‘features’ list into the SFrame ‘features_sframe’
    # features_sframe = data_sframe[features]
    
    # this will convert the features_sframe into a numpy matrix with GraphLab Create >= 1.7!!
    features_matrix = data_sframe[features].as_matrix()
    # assign the column of data_sframe associated with the target to the variable ‘output_sarray’
    output_array = data_sframe[output].as_matrix()

    # this will convert the SArray into a numpy array:
    # output_array = output_sarray.to_numpy(output) # GraphLab Create>= 1.7!!
    return(features_matrix, output_array)

In [11]:
def normalize_features(features):
    norms = np.linalg.norm(features, axis=0)
    normalized_features = features / norms
    return (normalized_features, norms)

In [12]:
feature_list = ['bedrooms',  
                'bathrooms',  
                'sqft_living',  
                'sqft_lot',  
                'floors',
                'waterfront',  
                'view',  
                'condition',  
                'grade',  
                'sqft_above',  
                'sqft_basement',
                'yr_built',  
                'yr_renovated',  
                'lat',  
                'long',  
                'sqft_living15',  
                'sqft_lot15']
features_train, output_train = get_numpy_data(train, feature_list, 'price')
features_test, output_test = get_numpy_data(test, feature_list, 'price')
features_valid, output_valid = get_numpy_data(validation, feature_list, 'price')

features_train, norms = normalize_features(features_train) # normalize training set features (columns)
features_test = features_test / norms # normalize test set by training set norms
features_valid = features_valid / norms # normalize validation set by training set norms

In [13]:
print np.sqrt(np.sum((features_train[9]-features_test[0])**2))

0.059723593714


In [63]:
for i in range(10):
    print '[' + str(i) + ']'
    print np.sqrt(np.sum((features_train[i]-features_test[2])**2)) # 0 and 2

[0]
0.0195447561975
[1]
0.0686103471134
[2]
0.0216507855562
[3]
0.0128338471371
[4]
0.0199091755142
[5]
0.0191286493485
[6]
0.00543398934658
[7]
0.0293867517221
[8]
0.0124793728949
[9]
0.0218369022963


In [24]:
# verify that vectorization works
results = features_train[0:3] - features_test[0]
print results[0] - (features_train[0]-features_test[0])
# should print all 0's if results[0] == (features_train[0]-features_test[0])
print results[1] - (features_train[1]-features_test[0])
# should print all 0's if results[1] == (features_train[1]-features_test[0])
print results[2] - (features_train[2]-features_test[0])
# should print all 0's if results[2] == (features_train[2]-features_test[0])

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [36]:
diff = features_train[0:len(features_train)] - features_test[0] # return the feature differences between the query and last training house

In [37]:
diff[-1].sum()

-0.093433998746546426

In [33]:
def compute_distances(features_instances, features_query):
    diff = features_instances[0:len(features_instances)] - features_query
    distances = np.sqrt(np.sum(diff**2, axis=1))
    return distances

In [44]:
np.sum(diff**2, axis=1)

array([ 0.00363304,  0.00730492,  0.00378218, ...,  0.0032681 ,
        0.00325555,  0.00341846])

In [45]:
distances = compute_distances(features_train, features_test[2])
min = distances[0]
index = 0
for i in xrange(len(distances)):
    if(distances[i] < min):
        min = distances[i]
        index = i
print min
print index

0.00286049555751
382


In [46]:
print output_train[382]

249000.0


In [48]:
def k_nearest_neighbors(k, feature_train, features_query):
    distances = compute_distances(features_train, features_query)
    neighbors = np.argsort(distances)[0:k] # np.argsort -> return the indices that will sort the array
    return neighbors

In [49]:
print k_nearest_neighbors(4, features_train, features_test[2])

[ 382 1149 4087 3142]


In [50]:
def predict_output_of_query(k, features_train, output_train, features_query):
    neighbors = k_nearest_neighbors(k, features_train, features_query)
    prices = output_train[neighbors]
    prediction = np.sum(prices)/k
    return prediction

In [51]:
print predict_output_of_query(4, features_train, output_train, features_test[2])

413987.5


In [52]:
def predict_output(k, features_train, output_train, features_query):
    predictions = []
    for i in xrange(len(features_query)):
        prediction = predict_output_of_query(k, features_train, output_train, features_query[i])
        predictions.append(prediction)
    return predictions

In [55]:
print predict_output(10, features_train, output_train,features_test[0:10])

[881300.0, 431860.0, 460595.0, 430200.0, 766750.0, 667420.0, 350032.0, 512800.70000000001, 484000.0, 457235.0]


In [57]:
for k in range(16):
    RSS = 0
    predictions = predict_output(k, features_train, output_train, features_valid)
    RSS = np.sum((output_valid - predictions)**2)
    print k
    print RSS

0
nan
1
1.05453830252e+14
2
8.3445073504e+13
3
7.26920960192e+13
4
7.19467216521e+13
5
6.98465174197e+13
6
6.88995443532e+13
7
6.83419734501e+13
8
6.73616787355e+13
9
6.8372727959e+13
10
6.93350486686e+13
11
6.95238552156e+13
12
6.90499695872e+13
13
7.00112545083e+13
14
7.0908698869e+13
15
7.11069283859e+13


In [None]:
# k = 8 lowest RSS

In [58]:
RSS = 0
predictions = predict_output(8, features_train, output_train, features_test)
RSS = np.sum((output_test - predictions)**2)
print RSS

1.33118823552e+14
