In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input directory tree and print the full path of every file
# found; the sub-directory list yielded by os.walk is not needed, hence `_`.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the containing directory with the file name to get the full path.
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install turicreate
import turicreate as tc

In [None]:
def polynomial_sframe(feature, degree):
    """Build an SFrame whose columns are successive powers of ``feature``.

    Column ``power_k`` holds ``feature ** k`` for k = 1 .. ``degree``.
    Callers are expected to pass ``degree >= 1``; when ``degree`` is not
    greater than 1 only the ``power_1`` column is produced.

    Args:
        feature: the values to raise to each power (must support ``**``).
        degree: highest power to generate.

    Returns:
        A ``tc.SFrame`` with columns ``power_1`` .. ``power_<degree>``.
    """
    powers = tc.SFrame()
    # The first column is always the feature itself.
    powers['power_1'] = feature
    # Exponents 2..degree are only needed when degree exceeds 1; otherwise
    # there is nothing further to add.
    higher_exponents = range(2, degree + 1) if degree > 1 else ()
    for exponent in higher_exponents:
        powers[f'power_{exponent}'] = feature ** exponent
    return powers


In [None]:
# Quick check of polynomial_sframe on a three-element SArray: the printed frame
# should contain the columns power_1, power_2 and power_3.
tmp = tc.SArray([1., 2., 3.])
print (polynomial_sframe(tmp, 3))

In [None]:
# Load the house data from the input directory into an SFrame; later cells
# read its 'sqft_living' and 'price' columns.
sales = tc.SFrame('../input/house-data')

In [None]:
# Sort by sqft_living, then by price. Presumably this keeps the fitted curves
# in later line plots drawn left to right -- confirm if the order matters.
sales = sales.sort(['sqft_living', 'price'])

In [None]:
# Degree-1 feature frame: a single 'power_1' column holding sqft_living.
poly1_data = polynomial_sframe(sales['sqft_living'], 1)
poly1_data['price'] = sales['price'] # add price to the data since it's the target

In [None]:
# Show the degree-1 frame (power_1 plus the price target) as plain text.
print (poly1_data)

In [None]:
# Fit a linear regression of price on the single feature power_1; no
# validation set is passed (validation_set = None).
model1 = tc.linear_regression.create(poly1_data, target = 'price', features = ['power_1'], validation_set = None)


In [None]:
# Display the fitted coefficients of the degree-1 model as the cell output.
model1.coefficients

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Plot the observed prices as dots and the degree-1 model's predictions as a
# line, both against power_1 (sqft_living).
plt.plot(poly1_data['power_1'],poly1_data['price'],'.',
        poly1_data['power_1'], model1.predict(poly1_data),'-')

In [None]:
# Build the degree-15 feature frame and fit a linear regression on all of its
# power columns. The feature names are captured before 'price' is added so the
# target is not included among the features.
poly15_data = polynomial_sframe(sales['sqft_living'], 15)
poly15_features = poly15_data.column_names() # get the name of the features
poly15_data['price'] = sales['price'] # add price to the data since it's the target
model15 = tc.linear_regression.create(poly15_data, target = 'price', features = poly15_features, validation_set = None)

In [None]:
# Plot the observed prices as dots and the degree-15 model's predictions as a
# line, both against power_1 (sqft_living).
plt.plot(poly15_data['power_1'],poly15_data['price'],'.',
        poly15_data['power_1'], model15.predict(poly15_data),'-')

In [None]:
# Split sales into two parts with random_split(0.5, seed=0), then split each
# part again the same way, yielding four subsets set_1 .. set_4. The fixed
# seed is presumably there to make the split reproducible -- confirm.
bigset_1, bigset_2 = sales.random_split(0.5, seed=0)
set_1, set_2 = bigset_1.random_split(0.5, seed=0)
set_3, set_4 = bigset_2.random_split(0.5, seed=0)

In [None]:
def get_poly_model(set_data, degree=15):
    """Fit a polynomial regression of price on sqft_living for ``set_data``.

    Args:
        set_data: frame providing the 'sqft_living' and 'price' columns.
        degree: highest power of sqft_living to use as a feature. Defaults to
            15, the value this function previously hard-coded.

    Returns:
        A ``(poly_data, model)`` tuple: the frame of power columns plus the
        'price' target, and the linear regression model fitted on it.
    """
    poly_data = polynomial_sframe(set_data['sqft_living'], degree)
    # Capture the feature names before the target column is added, so that
    # 'price' is never used as a feature.
    feature_names = poly_data.column_names()
    poly_data['price'] = set_data['price']
    model = tc.linear_regression.create(poly_data, target='price',
                                        features=feature_names,
                                        validation_set=None)
    return poly_data, model
    

def get_coef(set_data):
    """Return the coefficients of the model that get_poly_model fits on ``set_data``."""
    # Only the fitted model is needed here; the feature frame is discarded.
    _, fitted_model = get_poly_model(set_data)
    return fitted_model.coefficients

def plot_fitted_line(set_data):
    """Plot observed prices and the fitted polynomial curve for ``set_data``.

    The model comes from get_poly_model. Observed prices are drawn as dots and
    the model's predictions as a line, both against the 'power_1' column.
    Returns whatever ``plt.plot`` returns.
    """
    frame, fitted_model = get_poly_model(set_data)
    x_values = frame['power_1']
    observed_prices = frame['price']
    predicted_prices = fitted_model.predict(frame)
    return plt.plot(x_values, observed_prices, '.',
                    x_values, predicted_prices, '-')

In [None]:
# Fit the polynomial model on set_1 and print only the coefficient row whose
# name is 'power_15'.
set_1_coef = get_coef(set_1)
print (set_1_coef[set_1_coef['name'] == 'power_15'])

In [None]:
# Plot the data points of set_1 together with the curve fitted on it.
plot_fitted_line(set_1)


In [None]:
# Fit the polynomial model on set_2 and print only the coefficient row whose
# name is 'power_15'.
set_2_coef = get_coef(set_2)
print (set_2_coef[set_2_coef['name'] == 'power_15'])

In [None]:
# Plot the data points of set_2 together with the curve fitted on it.
plot_fitted_line(set_2)

In [None]:
# Fit the polynomial model on set_3 and print only the coefficient row whose
# name is 'power_15'.
set_3_coef = get_coef(set_3)
print (set_3_coef[set_3_coef['name'] == 'power_15'])

In [None]:
# Plot the data points of set_3 together with the curve fitted on it.
plot_fitted_line(set_3)

In [None]:
# Fit the polynomial model on set_4 and print only the coefficient row whose
# name is 'power_15'.
set_4_coef = get_coef(set_4)
print (set_4_coef[set_4_coef['name'] == 'power_15'])

In [None]:
# Plot the data points of set_4 together with the curve fitted on it.
plot_fitted_line(set_4)

In [None]:
# First split: random_split(0.9, seed=1) separates the combined
# training+validation portion from the held-out test portion.
training_and_validation_data, test_data = sales.random_split(0.9, seed=1)
# Second split: divide the combined portion into training and validation
# sets with random_split(0.5, seed=1).
train_data, validation_data = training_and_validation_data.random_split(0.5, seed=1)

In [None]:
# For every polynomial degree from 1 to 15, fit a model on the training data
# and record its residual sum of squares (RSS) on the validation data.
# arr[i] holds the validation RSS of the degree (i + 1) model.
arr = []
for degree in range(1, 16):
    # Training features: powers 1..degree of sqft_living, plus the price target.
    poly_data = polynomial_sframe(train_data['sqft_living'], degree)
    my_features = poly_data.column_names()
    poly_data['price'] = train_data['price']
    model = tc.linear_regression.create(poly_data, target='price', features=my_features,
                                        validation_set=None, verbose=False)
    # Evaluate on the validation set using the same polynomial expansion.
    validation_data_poly = polynomial_sframe(validation_data['sqft_living'], degree)
    predictions = model.predict(validation_data_poly)
    residuals = validation_data['price'] - predictions
    # Use the SArray's own sum instead of iterating element by element in Python.
    rss = (residuals * residuals).sum()
    arr.append(rss)

# List indices start at 0 while degrees start at 1, so convert the winning
# index back to a degree before reporting it.
best_validation_rss = min(arr)
best_validation_degree = arr.index(best_validation_rss) + 1
print(best_validation_degree, best_validation_rss)

In [None]:
# For every polynomial degree from 1 to 15, fit a model on the training data
# and record its residual sum of squares (RSS) on the test data.
# arr2[i] holds the test RSS of the degree (i + 1) model.
arr2 = []
for degree in range(1, 16):
    # Training features: powers 1..degree of sqft_living, plus the price target.
    poly_data = polynomial_sframe(train_data['sqft_living'], degree)
    my_features = poly_data.column_names()
    poly_data['price'] = train_data['price']
    model = tc.linear_regression.create(poly_data, target='price', features=my_features,
                                        validation_set=None, verbose=False)
    # Evaluate on the test set using the same polynomial expansion.
    test_data_poly = polynomial_sframe(test_data['sqft_living'], degree)
    predictions = model.predict(test_data_poly)
    residuals = test_data['price'] - predictions
    # Use the SArray's own sum instead of iterating element by element in Python.
    rss_test = (residuals * residuals).sum()
    arr2.append(rss_test)

# Number the rows from 1 so the printed label is the actual polynomial degree
# rather than the zero-based list index.
for degree, rss in enumerate(arr2, start=1):
    print(degree, rss)

# Report the degree (not the list index) with the smallest test RSS.
# NOTE(review): this is for inspection only; the degree should be chosen from
# the validation RSS, not from the test RSS.
best_test_rss = min(arr2)
best_test_degree = arr2.index(best_test_rss) + 1
print(best_test_degree, best_test_rss)

In [None]:
# Print the test RSS stored at list index 6, i.e. the degree-7 model (entries
# are appended for degrees 1..15 in order).
# NOTE(review): the index is hard-coded; presumably it is the degree selected
# on the validation set -- confirm it still matches after re-running.
print (arr2[6])