In [182]:
import numpy as np
import pandas as pd

In [183]:
# Column name -> dtype mapping handed to pd.read_csv when the CSV files are loaded.
dtype_dict = {
    # floating-point columns
    'bathrooms': float,
    'sqft_living15': float,
    'price': float,
    'bedrooms': float,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'lat': float,
    # integer columns
    'waterfront': int,
    'sqft_above': int,
    'grade': int,
    'yr_renovated': int,
    'condition': int,
    'sqft_basement': int,
    'yr_built': int,
    'sqft_lot': int,
    'view': int,
    # columns kept as strings
    'zipcode': str,
    'date': str,
    'id': str,
}

In [245]:
# Read the three CSV files (full, test, train) with identical column dtypes.
data, test_data, train_data = (
    pd.read_csv(csv_name, dtype=dtype_dict)
    for csv_name in ('kc_house_data.csv', 'kc_house_test_data.csv', 'kc_house_train_data.csv')
)
# The validation file is not loaded; the call is kept here, disabled:
#valid_data = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict)

In [246]:
# Preview the first rows of the full dataset.
data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [247]:
def predict(feature_matrix, weights):
    """
    Purpose: Compute predictions for a linear model
    Input  : Feature matrix, weight vector
    Output : Dot product of the feature matrix with the weights
    """
    return np.dot(feature_matrix, weights)

In [248]:

def normalize(features):
    """
    Purpose: Normalize feature matrix, each column of the matrix is a feature
    Input  : Unnormalized feature matrix
    Output : Normalized feature matrix, feature norms
    Note   : A column whose norm is 0 (all zeros) is left as zeros instead
             of becoming NaN. The returned norms are the true 2-norms, so
             such a column still reports a norm of 0.
    """
    norms = np.linalg.norm(features, axis=0)
    # Dividing an all-zero column by its zero norm would produce NaN (0/0);
    # use 1 as the divisor for those columns so they stay exactly zero.
    safe_norms = np.where(norms == 0, 1.0, norms)
    normalized_features = features / safe_norms
    return (normalized_features, norms)


In [249]:

def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    """
    Purpose: Compute the updated weight for a single feature
    Input  : Feature index, normalized feature matrix, output,
             feature weights and L1_penalty
    Output : New weight for feature i. Index 0 is returned without
             thresholding; every other index is soft-thresholded
             at l1_penalty/2
    """
    column = feature_matrix[:, i]
    # Residual of the current model with feature i's own contribution added back
    partial_residual = output - np.dot(feature_matrix, weights) + weights[i] * column
    rho = np.sum(column * partial_residual)

    # Feature 0 is never shrunk
    if i == 0:
        return rho

    threshold = l1_penalty / 2.0
    if rho < -threshold:
        return rho + threshold
    if rho > threshold:
        return rho - threshold
    return 0.0


In [250]:
import math
# Sanity check of lasso_coordinate_descent_step on a tiny 2x2 example;
# the recorded result for these inputs is 0.425558846691.
# print(...) with parentheses runs on both Python 2 and Python 3.
print(lasso_coordinate_descent_step(1, np.array([[3./math.sqrt(13),1./math.sqrt(10)],
                   [2./math.sqrt(13),3./math.sqrt(10)]]), np.array([1., 1.]), np.array([1., 4.]), 0.1))

0.425558846691


In [328]:

def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance, max_iterations=None):
    """
    Purpose: Perform cyclical coordinate descent
    Input  : Normalized feature matrix, output, initial weights,
             L1_penalty, tolerance for stopping the process and an
             optional cap on the number of full sweeps over the features
             (max_iterations=None sweeps until the tolerance is met)
    Output : Final weights after the convergence of the coordinate
             descent procedure (or after max_iterations sweeps)
    Raises : ValueError if the largest weight change of a sweep is NaN/inf,
             because the stopping test could then never succeed
    Note   : initial_weights is updated in place and that same array is
             returned; pass a copy to keep the starting values
    """
    D = feature_matrix.shape[1]
    weights = initial_weights
    change = np.zeros(initial_weights.shape)
    sweeps = 0

    while True:
        # One full sweep: update every feature weight in turn
        for idx in range(D):
            new_weight = lasso_coordinate_descent_step(idx, feature_matrix,
                                                       output, weights,
                                                       l1_penalty)
            # Record how far this weight moved, then store the new value
            change[idx] = np.abs(new_weight - weights[idx])
            weights[idx] = new_weight
        sweeps += 1

        # Largest move of any weight during this sweep
        max_change = np.max(change)
        if not np.isfinite(max_change):
            # NaN/inf never compares below the tolerance, so the loop would spin forever
            raise ValueError("coordinate descent produced a non-finite weight change")
        if max_change < tolerance:
            break
        if max_iterations is not None and sweeps >= max_iterations:
            break
    return weights

def fit(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    """
    Purpose: Convenience entry point for cyclical coordinate descent
    Input  : Feature matrix array, output vector, initial weight vector,
             l1_penalty, tolerance value
    Output : Estimated weight vector
    """
    # Pure pass-through: all arguments are forwarded unchanged.
    return lasso_cyclical_coordinate_descent(
        feature_matrix, output, initial_weights, l1_penalty, tolerance
    )

def get_residual_sum_of_squares(feature_matrix, weights, output):
    """
    Purpose: Compute Residual Sum of Squares (RSS)
    Input  : Feature matrix, weight vector, output vector
    Output : RSS = sum((predicted output - actual output (y))^2)
    """
    errors = predict(feature_matrix, weights) - output
    return np.sum(errors ** 2)

In [329]:
# Build the feature matrix with one row per house:
# a column of ones, then sqft_living, then bedrooms.
feature_1 = np.array([
    np.ones(data.shape[0]),
    data['sqft_living'],
    data['bedrooms'],
]).T

In [330]:
# Normalize the columns of feature_1; `w` receives the column norms returned by normalize().
normal_feature,w=normalize(feature_1)

In [331]:
# Show the normalized feature matrix.
normal_feature

array([[ 0.00680209,  0.00353021,  0.00583571],
       [ 0.00680209,  0.00768869,  0.00583571],
       [ 0.00680209,  0.00230361,  0.00389048],
       ..., 
       [ 0.00680209,  0.00305154,  0.00389048],
       [ 0.00680209,  0.00478673,  0.00583571],
       [ 0.00680209,  0.00305154,  0.00389048]])

In [332]:
# Target vector and solver settings for the three-column model built from feature_1.
output = data['price']
weights = np.zeros(3)   # all three weights start at zero
l1_penalty = 1e7
tolerance = 1.0

In [333]:
# Fit the three-column model. The result is bound back to `weights` explicitly so the
# RSS cell that follows does not rely on fit() having modified the array in place.
weights = fit(normal_feature, output, weights, l1_penalty, tolerance)
weights

array([ 21624997.95951872,  63157247.20788978,         0.        ])

In [334]:
# RSS of the fitted model on the normalized features. `weights` holds the fitted
# values at this point because the coordinate descent writes into the array it is given.
get_residual_sum_of_squares(normal_feature, weights, output)

1630492476715378.5

In [335]:
# Replace the fitted weights with fixed values (one per column of normal_feature).
weights=np.array([1.,4.,1.])

In [350]:
# rho for column 1 of normal_feature (sqft_living): the same quantity that
# lasso_coordinate_descent_step computes before thresholding.
# Uses predict(), the prediction helper defined in this notebook.
prediction=predict(normal_feature, weights)
ro=np.sum(normal_feature[:,1]*(data['price']-prediction + weights[1]*normal_feature[:,1]))

In [351]:
# Per the thresholding in lasso_coordinate_descent_step, this feature's weight
# is set to 0 whenever l1_penalty >= 2*|ro|.
2*ro

175878941.64650303

In [348]:
# rho for column 2 of normal_feature (bedrooms): the same quantity that
# lasso_coordinate_descent_step computes before thresholding.
# Uses predict(), the prediction helper defined in this notebook.
prediction=predict(normal_feature, weights)
ro=np.sum(normal_feature[:,2]*(data['price']-prediction + weights[2]*normal_feature[:,2]))

In [349]:
# Per the thresholding in lasso_coordinate_descent_step, this feature's weight
# is set to 0 whenever l1_penalty >= 2*|ro|.
2*ro

161933397.3324781

In [261]:
# Columns used as model inputs for the larger model, in matrix-column order.
all_features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

In [262]:
# Unnormalized feature matrices for the train and test data,
# one column per name in all_features (no constant column yet).
feature_2=np.array(train_data[all_features])
feature_3=np.array(test_data[all_features])

In [263]:
# Shape of the test feature matrix before the constant column is added.
feature_3.shape

(4229L, 13L)

In [264]:
# Prepend a constant column of ones to the train and test feature matrices.
# Each matrix is rebuilt from its DataFrame rather than from the previous
# feature_2/feature_3, so re-running this cell cannot stack a second ones column.
feature_2=np.append(np.ones((train_data.shape[0],1)),np.array(train_data[all_features]),axis=1)
feature_3=np.append(np.ones((test_data.shape[0],1)),np.array(test_data[all_features]),axis=1)

In [265]:
# Shape of the test feature matrix after the constant column was prepended.
feature_3.shape

(4229L, 14L)

In [266]:
# Column 1 of feature_2: with the ones column at index 0, this is the
# first entry of all_features ('bedrooms').
feature_2[:,1]

array([ 3.,  3.,  2., ...,  3.,  3.,  2.])

In [267]:
# Normalize the training feature matrix; `w` receives the column norms.
normal_feature2,w=normalize(feature_2)
# The test matrix is not normalized here (the call below is disabled).
#normal_feature3,w=normalize(feature_3)

In [296]:
# Target vector and solver settings for the 14-column training model, l1_penalty = 1e7.
output2 = train_data['price']
weights = np.zeros(14)   # one weight per column of the training matrix, all starting at zero
l1_penalty = 1e7
tolerance = 1.0

In [297]:
# Fit on the normalized training features. The coordinate descent writes into
# `weights` and returns that same array, so `k` and `weights` refer to one object.
k=fit(normal_feature2, output2, weights, l1_penalty, tolerance)

In [298]:
# Show the fitted weights (these apply to the normalized features).
k

array([ 24429600.23440336,         0.        ,         0.        ,
        48389174.77154855,         0.        ,         0.        ,
         3317511.21492165,   7329961.81171433,         0.        ,
               0.        ,         0.        ,         0.        ,
               0.        ,         0.        ])

In [299]:
# Run normalize on feature_2 again to get the column norms under the name `norms`
# (`norma` is the normalized matrix), then display the norms.
norma,norms=normalize(feature_2)
norms

array([  1.31848398e+02,   4.60040216e+02,   2.96850552e+02,
         2.99962419e+05,   5.81709718e+06,   2.09458827e+02,
         1.15325626e+01,   1.05933942e+02,   4.57793622e+02,
         1.02101959e+03,   2.59726472e+05,   7.01224951e+04,
         2.59922094e+05,   5.36953839e+04])

In [300]:
# Divide each weight by its column norm so the weights apply to the
# unnormalized features: (features/norms) . k == features . (k/norms).
k1=k/norms
k1

array([  1.85285530e+05,   0.00000000e+00,   0.00000000e+00,
         1.61317458e+02,   0.00000000e+00,   0.00000000e+00,
         2.87664705e+05,   6.91937041e+04,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00])

In [301]:
# Rescaled weight at index 3, i.e. 'sqft_living' (index 0 is the ones column,
# followed by the entries of all_features in order).
k1[3]

161.31745764611625

In [302]:
# Same setup as before with a larger penalty: l1_penalty = 1e8, weights restarted at zero.
output2 = train_data['price']
weights = np.zeros(14)   # one weight per column of the training matrix
l1_penalty = 1e8
tolerance = 1.0

In [303]:
# Fit with the current settings. The chained assignment binds both `w` and `k`
# to the returned array (the same object as `weights`); `w` earlier held the
# norms returned by normalize(feature_2) and is overwritten here.
w=k=fit(normal_feature2, output2, weights, l1_penalty, tolerance)

In [304]:
# Show the fitted weights (these apply to the normalized features).
w

array([ 71114625.71488713,         0.        ,         0.        ,
               0.        ,         0.        ,         0.        ,
               0.        ,         0.        ,         0.        ,
               0.        ,         0.        ,         0.        ,
               0.        ,         0.        ])

In [305]:
# Divide the weights by the column norms so they apply to unnormalized features.
w1=w/norms
w1

array([ 539366.62793373,       0.        ,       0.        ,
             0.        ,       0.        ,       0.        ,
             0.        ,       0.        ,       0.        ,
             0.        ,       0.        ,       0.        ,
             0.        ,       0.        ])

In [306]:
# Small penalty, looser stopping tolerance; weights restarted at zero.
weights = np.zeros(14)   # one weight per column of the training matrix
l1_penalty = 1e4
tolerance = 5e5

In [307]:
# Fit with the current settings. The chained assignment binds both `q` and `k`
# to the returned array (the same object as `weights`).
q=k=fit(normal_feature2, output2, weights, l1_penalty, tolerance)

In [308]:
# Show the fitted weights (these apply to the normalized features).
q

array([ 78564738.34156857, -22097398.92430511,  12791071.87278492,
        93808088.09281245,  -2013172.75704975,  -4219184.9326501 ,
         6482842.81753503,   7127408.53480684,   5001664.85469704,
        14327518.43714108, -15770959.15237424,  -5159591.22213155,
       -84495341.76843894,   2824439.4970369 ])

In [309]:
# Divide the weights by the column norms so they apply to unnormalized features.
q1=q/norms
q1

array([  5.95871771e+05,  -4.80336244e+04,   4.30892643e+04,
         3.12732803e+02,  -3.46078585e-01,  -2.01432664e+04,
         5.62133764e+05,   6.72816325e+04,   1.09255888e+04,
         1.40325598e+04,  -6.07214159e+01,  -7.35796867e+01,
        -3.25079490e+02,   5.26011603e+01])

In [317]:
# RSS of the rescaled weights k1 on the unnormalized *training* features.
# NOTE(review): the two evaluations that follow use the test data
# (feature_3 / test_data); confirm whether this one was meant to use it as well.
get_residual_sum_of_squares(feature_2, k1, train_data['price'])

1231595575158386.0

In [311]:
# RSS of the rescaled weights w1 on the unnormalized test features.
get_residual_sum_of_squares(feature_3, w1, test_data['price'])

537166151497322.4

In [312]:
# RSS of the rescaled weights q1 on the unnormalized test features.
get_residual_sum_of_squares(feature_3, q1, test_data['price'])

228459958971392.3

In [315]:
# Shape of the training feature matrix (rows, columns including the ones column).
feature_2.shape

(17384L, 14L)