In [1]:
import graphlab

In [2]:
from math import pow
def polynomial_sframe(feature, degree):
    """Build an SFrame holding successive integer powers of ``feature``.

    Args:
        feature: Column of numeric values (e.g. ``sales['sqft_living']``).
            It is stored unchanged as column ``'power_1'``.
        degree: Highest power to generate. Assumed to be >= 1; for any value
            below 2 only ``'power_1'`` is produced.

    Returns:
        A ``graphlab.SFrame`` with columns ``'power_1'`` .. ``'power_<degree>'``
        where ``'power_k'`` is ``pow(x, k)`` applied to each value of ``feature``.
    """
    poly_sframe = graphlab.SFrame()
    poly_sframe['power_1'] = feature
    # range(2, degree + 1) is already empty when degree < 2, so no extra
    # "degree > 1" guard is required.
    for power in range(2, degree + 1):
        name = 'power_' + str(power)
        # Bind the current exponent as a default argument. A plain closure
        # over `power` would see whatever value the loop variable holds when
        # the callback is finally evaluated, not the value at this iteration.
        poly_sframe[name] = poly_sframe['power_1'].apply(
            lambda x, exponent=power: pow(x, exponent))
    return poly_sframe

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

[INFO] This non-commercial license of GraphLab Create is assigned to <redacted-email> and will expire on March 07, 2017. For commercial licensing options, visit https://dato.com/buy/.

[INFO] Start server at: ipc:///tmp/graphlab_server-1215 - Server binary: /usr/local/lib/python2.7/dist-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1458499049.log
[INFO] GraphLab Server Version: 1.8


In [5]:
# Load the sales data. The path is relative, so the notebook has to be run
# from the directory that contains `course-2/`.
sales = graphlab.SFrame('course-2/kc_house_data.gl/')

In [6]:
# Order the rows by sqft_living, then by price; `sales` is rebound to the
# sorted frame.
sales = sales.sort(['sqft_living','price'])

In [7]:
# Very small L2 penalty passed to the first set of regression fits below.
l2_small_penalty = 1e-5

In [13]:
# Powers 1..15 of sqft_living, one column per power ('power_1' .. 'power_15').
poly15_data = polynomial_sframe(sales['sqft_living'],15)

In [14]:
# Capture the column names now, before the 'price' column is attached,
# so the target is not listed among the features.
my_features = poly15_data.column_names()

In [15]:
# Attach the regression target next to the polynomial feature columns.
poly15_data['price'] = sales['price']

In [16]:
# Degree-15 polynomial fit on the full data set with the tiny L2 penalty;
# validation_set=None is passed so no validation set is supplied to the fit.
model1 = graphlab.linear_regression.create(
    poly15_data,
    target='price',
    features=my_features,
    validation_set=None,
    l2_penalty=l2_small_penalty,
)

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 21613
PROGRESS: Number of features          : 15
PROGRESS: Number of unpacked features : 15
PROGRESS: Number of coefficients    : 16
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Training-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | 1         | 2        | 1.021533     | 2662555.738155     | 245656.462166 |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: SUCCESS: Optimal solution found.
PROGRESS:


In [17]:
# Display the fitted coefficient table of model1.
model1.get('coefficients')

name,index,value,stderr
(intercept),,167924.857782,424791.915418
power_1,,103.090949863,1985.19757315
power_2,,0.134604555026,3.8921765362
power_3,,-0.000129071368423,0.00423214943723
power_4,,5.18928983008e-08,2.84523002331e-06
power_5,,-7.771693867140001e-12,1.24440342188e-09
power_6,,1.71144993022e-16,3.59891953989e-13
power_7,,4.51177844909e-20,6.71378349869e-17
power_8,,-4.78840025362e-25,6.87651915868e-21
power_9,,-2.33343424709e-28,


In [18]:
# Two-stage split: halve `sales`, then halve each half again, which yields
# four subsets. Every split uses a fixed seed so the partition is repeatable.
semi_split1, semi_split2 = sales.random_split(0.5, seed=0)
set_1, set_2 = semi_split1.random_split(0.5, seed=0)
set_3, set_4 = semi_split2.random_split(0.5, seed=0)

In [19]:
def _poly15_with_price(subset):
    """Return degree-15 polynomial features of sqft_living plus the price column."""
    frame = polynomial_sframe(subset['sqft_living'], 15)
    frame['price'] = subset['price']
    return frame


# One feature frame per subset, built identically for all four.
set_1_data = _poly15_with_price(set_1)
set_2_data = _poly15_with_price(set_2)
set_3_data = _poly15_with_price(set_3)
set_4_data = _poly15_with_price(set_4)

In [22]:
def _fit_with_small_penalty(subset_data):
    """Fit the degree-15 model on one subset using l2_small_penalty."""
    # The feature names are listed in this exact (alphabetical) order to match
    # the order used when the models were originally fitted.
    feature_names = [
        'power_1', 'power_10', 'power_11', 'power_12', 'power_13',
        'power_14', 'power_15', 'power_2', 'power_3', 'power_4',
        'power_5', 'power_6', 'power_7', 'power_8', 'power_9',
    ]
    return graphlab.linear_regression.create(
        subset_data,
        target='price',
        features=feature_names,
        validation_set=None,
        l2_penalty=l2_small_penalty,
    )


model_set_1 = _fit_with_small_penalty(set_1_data)
model_set_2 = _fit_with_small_penalty(set_2_data)
model_set_3 = _fit_with_small_penalty(set_3_data)
model_set_4 = _fit_with_small_penalty(set_4_data)

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 5404
PROGRESS: Number of features          : 15
PROGRESS: Number of unpacked features : 15
PROGRESS: Number of coefficients    : 16
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Training-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | 1         | 2        | 0.005636     | 2191984.901432     | 248699.117254 |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: SUCCESS: Optimal solution found.
PROGRESS:
PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 5398
PRO

In [25]:
# Coefficient table of the set_1 fit made with l2_small_penalty.
model_set_1.get('coefficients')


name,index,value,stderr
(intercept),,9306.4672204,835647.46086
power_1,,585.865802811,3639.52512353
power_10,,-6.83813312536e-33,
power_11,,-1.6268623597e-37,
power_12,,2.85118734545e-41,4.40442257146e-37
power_13,,3.7999822114799994e-45,2.8899726295e-41
power_14,,1.5265259637300001e-49,1.2920981134e-45
power_15,,-2.33807313416e-53,2.73764106439e-50
power_2,,-0.397305871588,6.60874938338
power_3,,0.000141470886922,0.00663506718838


In [26]:
# Coefficient table of the set_2 fit made with l2_small_penalty.
model_set_2.get('coefficients')

name,index,value,stderr
(intercept),,-25115.8911926,785826.339267
power_1,,783.493756824,2037.52206216
power_10,,6.218183564609999e-31,
power_11,,6.51741348636e-35,
power_12,,-9.41315802331e-40,
power_13,,-1.02421368464e-42,
power_14,,-1.0039109206099999e-46,
power_15,,1.30113362109e-50,8.45371331638e-49
power_2,,-0.767759249912,
power_3,,0.000438766339151,


In [27]:
# Coefficient table of the set_3 fit made with l2_small_penalty.
model_set_3.get('coefficients')

name,index,value,stderr
(intercept),,462426.557281,673858.144745
power_1,,-759.251830181,
power_10,,-1.7202586202599999e-31,
power_11,,-2.96760984703e-35,
power_12,,-1.06574945814e-39,
power_13,,2.4263565002799998e-43,
power_14,,3.55598703805e-47,
power_15,,-2.85777434274e-51,
power_2,,1.02867005536,
power_3,,-0.000528264552017,


In [28]:
# Coefficient table of the set_4 fit made with l2_small_penalty.
model_set_4.get('coefficients')

name,index,value,stderr
(intercept),,-170240.033092,1233705.7696
power_1,,1247.59034455,7191.34371925
power_10,,1.60198230821e-31,
power_11,,2.3990434645700004e-34,1.30139919037e-30
power_12,,2.33354423771e-38,
power_13,,-1.7987404586e-42,
power_14,,-6.02862593103e-46,
power_15,,4.39472619753e-50,
power_2,,-1.22460911731,16.5051103998
power_3,,0.000555254620377,0.0174343442707


In [29]:
def _fit_with_large_penalty(subset_data):
    """Fit the degree-15 model on one subset using an L2 penalty of 1e5."""
    # The feature names are listed in this exact (alphabetical) order to match
    # the order used when the models were originally fitted.
    feature_names = [
        'power_1', 'power_10', 'power_11', 'power_12', 'power_13',
        'power_14', 'power_15', 'power_2', 'power_3', 'power_4',
        'power_5', 'power_6', 'power_7', 'power_8', 'power_9',
    ]
    return graphlab.linear_regression.create(
        subset_data,
        target='price',
        features=feature_names,
        validation_set=None,
        l2_penalty=1e5,
    )


# Rebinds model_set_1 .. model_set_4, replacing the small-penalty fits.
model_set_1 = _fit_with_large_penalty(set_1_data)
model_set_2 = _fit_with_large_penalty(set_2_data)
model_set_3 = _fit_with_large_penalty(set_3_data)
model_set_4 = _fit_with_large_penalty(set_4_data)

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 5404
PROGRESS: Number of features          : 15
PROGRESS: Number of unpacked features : 15
PROGRESS: Number of coefficients    : 16
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Training-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | 1         | 2        | 0.007634     | 5978778.434729     | 374261.720860 |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: SUCCESS: Optimal solution found.
PROGRESS:
PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 5398
PRO

In [30]:
# Coefficient table of the set_1 fit made with l2_penalty=1e5.
model_set_1.get('coefficients')

name,index,value,stderr
(intercept),,530317.024516,1257547.1123
power_1,,2.58738875673,5477.03968912
power_10,,3.7891786886999994e-37,
power_11,,2.38223121312e-41,
power_12,,1.4984796921499998e-45,6.6281166945599995e-37
power_13,,9.39161190285e-50,4.34905496047e-41
power_14,,5.84523161981e-54,1.9444494567599997e-45
power_15,,3.60120207203e-58,4.11981460639e-50
power_2,,0.00127414400592,9.94535864973
power_3,,1.74934226932e-07,0.00998496372392


In [31]:
# Coefficient table of the set_2 fit made with l2_penalty=1e5.
model_set_2.get('coefficients')


name,index,value,stderr
(intercept),,519216.897383,1083041.23141
power_1,,2.04470474182,2808.15275966
power_10,,3.76650326842e-35,
power_11,,3.84228094754e-39,
power_12,,3.98520828414e-43,
power_13,,4.1827276239400004e-47,
power_14,,4.4273833287799996e-51,
power_15,,4.71518245412e-55,1.16510730459e-48
power_2,,0.0011314362684,
power_3,,2.93074277549e-07,


In [32]:
# Coefficient table of the set_3 fit made with l2_penalty=1e5.
model_set_3.get('coefficients')


name,index,value,stderr
(intercept),,522911.518048,939367.078576
power_1,,2.26890421877,
power_10,,1.95410800249e-35,
power_11,,1.92734119456e-39,
power_12,,1.9148369901299998e-43,
power_13,,1.91102277046e-47,
power_14,,1.91246242302e-51,
power_15,,1.91699558035e-55,
power_2,,0.00125905041842,
power_3,,2.77552918155e-07,


In [33]:
# Coefficient table of the set_4 fit made with l2_penalty=1e5.
model_set_4.get('coefficients')


name,index,value,stderr
(intercept),,513667.087087,1631425.5311
power_1,,1.91040938244,9509.67567434
power_10,,1.25315224143e-34,
power_11,,1.43600781402e-38,1.7209418304e-30
power_12,,1.662699678e-42,
power_13,,1.9398172453e-46,
power_14,,2.2754148577e-50,
power_15,,2.67948784897e-54,
power_2,,0.00110058029175,21.8259970597
power_3,,3.12753987879e-07,0.0230547956101


In [34]:
# Hold out a test set (random_split fraction 0.9 goes to train_valid), then
# shuffle the train/validation rows. Both steps use a fixed seed.
train_valid, test = sales.random_split(0.9, seed=1)
train_valid_shuffled = graphlab.toolkits.cross_validation.shuffle(
    train_valid,
    random_seed=1,
)

In [43]:
def extract_segment(data, k, i):
    """Return the i-th of k contiguous folds of ``data``.

    Args:
        data: Sliceable sequence with a length (an SFrame in this notebook).
        k: Number of folds.
        i: 0-based index of the fold to return.

    Returns:
        ``data[start:stop]`` with ``start = (n*i) // k`` and
        ``stop = (n*(i+1)) // k``, where ``n = len(data)``.
    """
    n = len(data)
    # Floor division keeps the slice bounds integral on Python 3 as well;
    # with `/` they would become floats there and slicing would fail.
    start = (n * i) // k
    stop = (n * (i + 1)) // k
    return data[start:stop]

In [44]:
# Fold with 0-based index 3 out of 10 folds of the shuffled train/validation rows.
validation4 = extract_segment(train_valid_shuffled, 10, 3)

In [45]:
# Mean price of the fold, rounded to the nearest integer. The call form of
# print with a single argument behaves the same on Python 2 and Python 3.
print(int(round(validation4['price'].mean(), 0)))

536234


In [42]:
def extract_train(data, k, i):
    """Return every row of ``data`` except the i-th of k contiguous folds.

    The fold boundaries are computed the same way as in ``extract_segment``:
    rows ``(n*i) // k`` up to (not including) ``(n*(i+1)) // k`` are left out.

    Args:
        data: Sliceable sequence with a length whose slices provide an
            ``append(other)`` method that returns the combined sequence
            (an SFrame in this notebook; a built-in ``list`` does not qualify).
        k: Number of folds.
        i: 0-based index of the fold to exclude.

    Returns:
        The rows before the excluded fold with the rows after it appended.
    """
    n = len(data)
    # Floor division keeps the slice bounds integral on Python 3 as well.
    start = (n * i) // k
    stop = (n * (i + 1)) // k
    before_fold = data[0:start]
    after_fold = data[stop:n]
    return before_fold.append(after_fold)

In [46]:
# All shuffled train/validation rows except the fold with 0-based index 3 of 10.
train4 = extract_train(train_valid_shuffled, 10, 3)

In [54]:
def get_residual_sum_of_squares(model, data, outcome):
    """Compute the residual sum of squares of ``model`` on ``data``.

    Args:
        model: Object with a ``predict(data)`` method.
        data: Input passed straight to ``model.predict``. It is not modified;
            no 'prediction' or 'error' columns are written into it.
        outcome: Observed target values, aligned element-wise with the
            predictions.

    Returns:
        Sum over all rows of ``(outcome - prediction) ** 2``.
    """
    predictions = model.predict(data)
    residuals = outcome - predictions
    return (residuals * residuals).sum()


In [55]:
def k_fold_cross_validation(k, l2_pena, data, output_name, features_list, verbose=False, degree=15):
    """Run k-fold cross-validation of an L2-penalised polynomial regression.

    For each fold ``i`` in ``0 .. k-1`` the model is trained on
    ``extract_train(data, k, i)`` and scored on ``extract_segment(data, k, i)``
    using ``get_residual_sum_of_squares``.

    Args:
        k: Number of folds.
        l2_pena: L2 penalty passed to ``graphlab.linear_regression.create``.
        data: Frame holding both the input and the output column.
        output_name: Name of the target column.
        features_list: List of input column names; only the first entry is
            used, and it is expanded into polynomial features.
        verbose: Forwarded to ``graphlab.linear_regression.create``. The
            per-fold and summary lines below are printed regardless.
        degree: Highest polynomial power of the input feature (default 15).

    Returns:
        The RSS averaged over the k validation folds.
    """
    input_name = features_list[0]
    rss_sum = 0.
    for i in range(k):
        validation = extract_segment(data, k, i)
        training = extract_train(data, k, i)

        # Train on everything except fold i. The same input and output columns
        # are used here as for the validation data below.
        poly_data = polynomial_sframe(training[input_name], degree)
        my_features = poly_data.column_names()
        poly_data[output_name] = training[output_name]
        model = graphlab.linear_regression.create(poly_data,
                                                  target = output_name,
                                                  features = my_features,
                                                  l2_penalty = l2_pena,
                                                  validation_set = None,
                                                  verbose = verbose)

        # Score on the held-out fold.
        poly_validation = polynomial_sframe(validation[input_name], degree)
        rss = get_residual_sum_of_squares(model, poly_validation, validation[output_name])

        rss_sum += rss
        print("  Segment %d of %d: l2_pena = %f, avg[train X1 = %f, train Y = %f, validation X1 = %f], RSS = %f" % (i, k, l2_pena, training[input_name].mean(), training[output_name].mean(), validation[input_name].mean(), rss))
    avg_rss = rss_sum / k
    print("%d-folding, Avg. RSS = %f, L2 penalty = %f" % (k, avg_rss, l2_pena))
    # Returned as well as printed so callers can compare penalties in code.
    return avg_rss

In [56]:
import numpy as np
# Sweep 13 L2 penalties spaced evenly on a log scale from 10**1 to 10**7 and
# run 10-fold cross-validation on the shuffled train/validation rows for each.
# Note: `l2_penalty` remains bound to the last value after the loop finishes.
for l2_penalty in np.logspace(1, 7, num=13):
    print(l2_penalty)
    k_fold_cross_validation(10, l2_penalty, train_valid_shuffled, 'price', ['sqft_living'])

10.0
  Segment 0 of 10: l2_pena = 10.000000, avg[train X1 = 2080.451853, train Y = 540389.490921, validation X1 = 2051.538938], RSS = 109834589205783.812500
  Segment 1 of 10: l2_pena = 10.000000, avg[train X1 = 2078.267530, train Y = 539173.946437, validation X1 = 2071.208247], RSS = 111279322217425.171875
  Segment 2 of 10: l2_pena = 10.000000, avg[train X1 = 2079.003895, train Y = 540672.310821, validation X1 = 2064.575039], RSS = 95736012566977.421875
  Segment 3 of 10: l2_pena = 10.000000, avg[train X1 = 2081.329514, train Y = 539450.435151, validation X1 = 2043.656701], RSS = 122811521665357.593750
  Segment 4 of 10: l2_pena = 10.000000, avg[train X1 = 2075.423178, train Y = 538502.681886, validation X1 = 2096.801546], RSS = 111668727442302.906250
  Segment 5 of 10: l2_pena = 10.000000, avg[train X1 = 2075.797674, train Y = 538141.215100, validation X1 = 2093.440949], RSS = 3835158675132276.500000
  Segment 6 of 10: l2_pena = 10.000000, avg[train X1 = 2075.232757, train Y = 53826

In [59]:
# Refit on all train/validation rows with the chosen penalty (presumably the
# one selected from the cross-validation sweep) and score on the test set.
l2_best_penalty = 1000.
# `k` is not used by the report at the end of this cell; the notebook-level
# name is kept in case other cells read it.
k = 10
degree = 15
poly_data = polynomial_sframe(train_valid_shuffled['sqft_living'], degree)
my_features = poly_data.column_names()
poly_data['price'] = train_valid_shuffled['price']
model = graphlab.linear_regression.create(poly_data,
                                          target = 'price',
                                          features = my_features,
                                          l2_penalty = l2_best_penalty,
                                          validation_set = None,
                                          verbose = False)

poly_testing = polynomial_sframe(test['sqft_living'], degree)
rss = get_residual_sum_of_squares(model, poly_testing, test['price'])
# This is a single RSS on the held-out test set, not a k-fold average, so the
# message says so. The call form of print works on Python 2 and Python 3.
print("Test RSS = %f, L2 penalty = %f" % (rss, l2_best_penalty))

10-folding, Avg. RSS = 128780855058449.421875, L2 penalty = 1000.000000
