In [57]:
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from math import log, sqrt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import r2_score, mean_squared_error

In [58]:
# Explicit per-column dtypes handed to read_csv so pandas does not infer them.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [59]:
# Derived columns: element-wise square roots of the two area columns ...
sales['sqft_living_sqrt'] = sales['sqft_living'].map(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].map(sqrt)
# ... and element-wise squares of the bedroom and floor counts.
sales['bedrooms_square'] = sales['bedrooms'].mul(sales['bedrooms'])
sales['floors_square'] = sales['floors'].mul(sales['floors'])

In [60]:
all_features = ['bedrooms', 'bedrooms_square',
            'bathrooms',
            'sqft_living', 'sqft_living_sqrt',
            'sqft_lot', 'sqft_lot_sqrt',
            'floors', 'floors_square',
            'waterfront', 'view', 'condition', 'grade',
            'sqft_above',
            'sqft_basement',
            'yr_built', 'yr_renovated']

# `Lasso(normalize=True)` was deprecated in scikit-learn 1.0 and removed in 1.2
# (it now raises TypeError).  The same preprocessing is done explicitly here:
# every feature column is centred and scaled to unit L2 norm before fitting.
features_all = sales[all_features].astype(float)
feature_means = features_all.mean()
features_centered = features_all - feature_means
feature_norms = np.sqrt((features_centered ** 2).sum())
# A constant column has zero norm after centring; divide by 1 instead of 0.
feature_norms = feature_norms.replace(0.0, 1.0)

model_all = Lasso(alpha=5e2) # set parameters
model_all.fit(features_centered / feature_norms, sales['price']) # learn weights

# Express the fitted weights in the units of the original (unscaled) features,
# and fold the centring back into the intercept, so that `coef_`/`intercept_`
# (and `predict` on raw features) mean the same as with the old `normalize=True`.
model_all.coef_ = model_all.coef_ / feature_norms.to_numpy()
model_all.intercept_ = model_all.intercept_ - np.dot(feature_means.to_numpy(), model_all.coef_)

print("Lasso model on whole database \n\nintercept = {} \n\ncoefs =\n {}".format(model_all.intercept_, model_all.coef_))

Lasso model on whole database 

intercept = -218136.21403514093 

coefs =
 [    0.             0.             0.           134.43931396
     0.             0.             0.             0.
     0.             0.         24750.00458561     0.
 61749.10309071     0.             0.            -0.
     0.        ]


In [61]:
def get_numpy_data(data_sframe, features, output):
    """Build a NumPy design matrix and target vector from a DataFrame.

    Parameters
    ----------
    data_sframe : pandas.DataFrame
        Source table. It is NOT modified: the intercept column is added to a
        private copy, not written back into the caller's frame.
    features : sequence of str
        Names of the columns to use as features (in this order).
    output : str
        Name of the target column.

    Returns
    -------
    (feature_matrix, output_array)
        ``feature_matrix`` has a leading column of ones (the 'constant'
        intercept feature) followed by the requested feature columns;
        ``output_array`` is the target column as a NumPy array.
    """
    # Selecting with a list and copying gives a frame we are free to modify.
    features_frame = data_sframe[list(features)].copy()
    # Intercept term goes first, matching the weight layout used elsewhere
    # (weights[0] is the unregularised intercept).
    features_frame.insert(0, 'constant', 1)
    feature_matrix = features_frame.to_numpy()

    output_array = data_sframe[output].to_numpy()

    return (feature_matrix, output_array)

In [62]:
def predict_output(feature_matrix, weights):
    """Return the linear-model predictions: the dot product of
    ``feature_matrix`` with ``weights``."""
    return np.dot(feature_matrix, weights)

In [63]:
def normalize_features(feature_matrix):
    """Scale each column of ``feature_matrix`` by its Euclidean (L2) norm.

    Returns a tuple ``(normalized_features, norms)`` where ``norms`` holds the
    per-column norms that were divided out.  There is no guard against a
    zero-norm column; such a column is divided by zero.
    """
    column_norms = np.linalg.norm(feature_matrix, axis=0)
    scaled = feature_matrix / column_norms
    return scaled, column_norms

In [64]:
# Two-feature design matrix (plus intercept column) and the price target.
simple_features = ['sqft_living', 'bedrooms']
my_output = 'price'
simple_feature_matrix, output = get_numpy_data(sales, simple_features, my_output)

In [65]:
def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    """Compute the updated value of weight ``i`` for one lasso coordinate step.

    Parameters
    ----------
    i : int
        Index of the coordinate (column of ``feature_matrix``) to update.
        Index 0 is treated as the intercept and is not penalised.
    feature_matrix : 2-D array
        Design matrix.  The update below contains no division by the squared
        column norm, so it is the exact coordinate minimiser only when column
        ``i`` has unit norm (presumably callers pass normalised features).
    output : 1-D array
        Target values.
    weights : 1-D array
        Current weights; not modified here.
    l1_penalty : float
        L1 regularisation strength.

    Returns
    -------
    float
        The new value for ``weights[i]`` (soft-thresholded at
        ``l1_penalty / 2`` for every coordinate except the intercept).
    """
    prediction = predict_output(feature_matrix, weights)

    feature_i = feature_matrix[:, i]
    # Correlation of feature i with the residual that excludes feature i's own
    # contribution.  np.dot keeps the reduction vectorised instead of looping
    # over the array in Python with the builtin sum().
    ro_i = np.dot(feature_i, output - prediction + weights[i] * feature_i)

    half_penalty = l1_penalty / 2.

    if i == 0:
        # intercept: no regularisation
        new_weight_i = ro_i
    elif ro_i < -half_penalty:
        new_weight_i = ro_i + half_penalty
    elif ro_i > half_penalty:
        new_weight_i = ro_i - half_penalty
    else:
        new_weight_i = 0.

    return new_weight_i

In [66]:
# Sanity check: one coordinate step on a small hand-built 2x2 example.
demo_features = np.array([[3. / math.sqrt(13), 1. / math.sqrt(10)],
                          [2. / math.sqrt(13), 3. / math.sqrt(10)]])
demo_output = np.array([1., 1.])
demo_weights = np.array([1., 4.])
print(lasso_coordinate_descent_step(1, demo_features, demo_output, demo_weights, 0.1))

0.4255588466910251


In [67]:
def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    """Fit lasso weights by cycling through the coordinates until convergence.

    Each sweep updates every weight in turn with
    ``lasso_coordinate_descent_step``.  Sweeps repeat until the largest
    absolute change of any single weight within a sweep is no greater than
    ``tolerance``.

    Parameters
    ----------
    feature_matrix : 2-D array
        Design matrix, passed through to the coordinate step.
    output : 1-D array
        Target values, passed through to the coordinate step.
    initial_weights : 1-D array-like
        Starting weights.  They are copied; the caller's array is not modified.
    l1_penalty : float
        L1 regularisation strength.
    tolerance : float
        Convergence threshold on the per-sweep maximum absolute weight change.
        Should be non-negative, otherwise the loop cannot terminate.

    Returns
    -------
    numpy.ndarray
        The fitted weights (a new float array).
    """
    # Float copy: avoids overwriting the caller's initial_weights and keeps
    # integer-typed inputs from truncating the updates.
    weights = np.array(initial_weights, dtype=float)
    max_change = tolerance + 1  # guarantees at least one sweep
    while max_change > tolerance:
        max_change = 0.
        for i in range(len(weights)):
            old_weight_i = weights[i]
            weights[i] = lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty)
            # Convergence is judged on the magnitude of the change; a signed
            # difference would stop the loop as soon as all weights increase.
            max_change = max(max_change, abs(old_weight_i - weights[i]))

    print ('maxchange : '+ str(max_change))

    return weights

In [68]:
# --- Settings for the two-feature lasso run ---------------------------------
simple_features = ['sqft_living', 'bedrooms']
my_output = 'price'
initial_weights = np.zeros(len(simple_features) + 1)  # intercept + one weight per feature
l1_penalty = 1e7
tolerance = 1.0

# Design matrix / target, then scale every column by its norm.
simple_feature_matrix, output = get_numpy_data(sales, simple_features, my_output)
normalized_simple_feature_matrix, simple_norms = normalize_features(simple_feature_matrix)

# Fit by cyclical coordinate descent on the normalised features.
weights = lasso_cyclical_coordinate_descent(
    normalized_simple_feature_matrix,
    output,
    initial_weights,
    l1_penalty,
    tolerance,
)
print("weights = ", weights)

# Residual sum of squares of the fitted model on the same data.
prediction = predict_output(normalized_simple_feature_matrix, weights)
error = prediction - output
RSS = np.dot(error, error)
print("RSS = ", RSS)

maxchange : 0.8595414943993092
weights =  [21624997.95951872 63157247.20788978        0.        ]
RSS =  1630492476715384.5
