In [94]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV


In [163]:
def encode_grade(x):
    '''
    This is a helper function for load_and_format_x_y()
    This function takes in a dataframe of x predictors. 
    Initiates a OneHotEncoder with handle_unknown="ignore" to 
    compensate for unseen artists in new data, 
    And returns the dataframe with the artist column encoded.
    '''
    # initiate OHE
    ohe = OneHotEncoder(categories="auto", handle_unknown="ignore")
    # dataframe of just encoded columns, using x_train index as it's index to ensure rows line up
    ohe.fit(x[['grade']])
    grade_dummies = pd.DataFrame(ohe.transform(x[['grade']]).todense(), columns=ohe.get_feature_names(), index=x.index)
    # concat encoded columns to x
    concatted_x = pd.concat([x, grade_dummies], axis=1)
    #add column names
    grade_nums = [1,3,4,5,6,7,8,9,10,11,12,13]
    grades = []
    for num in grade_nums:
        grades.append(f'grade_{num}')
    concatted_x.columns = list(x.columns) + list(grades)
    concatted_x = concatted_x.drop('grade', axis=1)
    return concatted_x

In [227]:
def encode_grade_all(x):
    '''
    This is a helper function for load_and_format_x_y()
    This function takes in a dataframe of x predictors. 
    Initiates a OneHotEncoder with handle_unknown="ignore" to 
    compensate for unseen artists in new data, 
    And returns the dataframe with the artist column encoded.
    '''
    
    # initiate OHE
    ohe = OneHotEncoder(categories="auto", handle_unknown="ignore")
    # dataframe of just encoded columns, using x_train index as it's index to ensure rows line up
    ohe.fit(x[['grade']])
    grade_dummies = pd.DataFrame(ohe.transform(x[['grade']]).todense(), columns=ohe.get_feature_names(), index=x.index)
    # concat encoded columns to x
    concatted_x = pd.concat([x, grade_dummies], axis=1)
    #add column names
    grade_nums = list(x['grade'].value_counts().index)
    grades = []
    for num in grade_nums:
        grades.append(f'grade_{num}')
    concatted_x.columns = list(x.columns) + list(grades)
    concatted_x = concatted_x.drop('grade', axis=1)
    return concatted_x

In [228]:
def preprocess_data(data):
    data = encode_grade_all(data)
    data = data.drop('Unnamed: 0', axis=1)
    data['ratio'] = data['sqft_living']/data['sqft_lot']
    x = data.drop(['zipcode', 'lat', 'long'], axis=1)
    x = x.set_index('id')
    x = x.drop('date', axis=1)
    x_cols = x.columns
    ss = StandardScaler()
    x = pd.DataFrame(ss.fit_transform(x))
    x.columns = x_cols
    return x

# Final Model

In [229]:
data = pd.read_csv('kc_house_data_train.csv')

In [217]:
y = data['price']
x = preprocess_data(data.drop('price', axis=1))

In [218]:
gbr = GradientBoostingRegressor(n_estimators=500,  min_samples_split=3, verbose=1)
gbr.fit(x, y)

      Iter       Train Loss   Remaining Time 
         1 124322210290.3152           35.53s
         2 111701008476.3752           39.46s
         3 101254208302.4301           36.24s
         4 92475683471.5541           34.18s
         5 84937246168.4440           32.40s
         6 78693018236.1423           30.72s
         7 73249783387.2247           29.80s
         8 68531836209.5698           29.46s
         9 64579131671.8702           28.75s
        10 61117977502.4731           28.31s
        20 42346326683.0487           24.34s
        30 35765083784.5238           23.67s
        40 32643219173.5307           23.59s
        50 30932325714.3272           24.50s
        60 29658480914.4235           23.85s
        70 28870600578.5589           22.92s
        80 28101710538.6183           22.42s
        90 27551203148.4621           21.78s
       100 27060794540.0503           21.40s
       200 23772534729.7621           17.89s
       300 21733006178.2155           11.64s
      

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=3,
                          min_weight_fraction_leaf=0.0, n_estimators=500,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=1, warm_start=False)

# Holdout Predictions


### load and preprocess data

In [219]:
data = pd.read_csv('kc_house_data_test_features.csv')

In [223]:
x_test = preprocess_holdout(data)

In [224]:
x_test

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,...,grade_9,grade_10,grade_6,grade_11,grade_5,grade_12,grade_4,grade_13,grade_1,ratio
0,0.676412,0.227557,0.089644,-0.030766,-1.260349,-0.080742,-0.282951,-0.447707,-0.442861,1.182544,...,-0.087706,-0.25089,-0.706371,1.397451,-0.445537,-0.270131,-0.153883,-0.057,-0.015211,-0.748766
1,0.676412,0.227557,0.089644,-0.030766,-1.260349,-0.080742,-0.282951,-0.447707,-0.442861,1.182544,...,-0.087706,-0.25089,-0.706371,1.397451,-0.445537,-0.270131,-0.153883,-0.057,-0.015211,-0.748766
2,-0.427449,0.227557,-0.737950,-0.269995,0.445769,-0.080742,-0.282951,-0.447707,-0.867374,0.154414,...,-0.087706,-0.25089,-0.706371,1.397451,-0.445537,-0.270131,-0.153883,-0.057,-0.015211,0.967037
3,-0.427449,-0.731556,-0.934504,0.084898,-1.260349,-0.080742,-0.282951,-0.447707,-1.012602,0.007539,...,-0.087706,-0.25089,-0.706371,1.397451,-0.445537,-0.270131,-0.153883,-0.057,-0.015211,-1.071727
4,0.676412,0.547262,0.668960,-0.113799,0.445769,-0.080742,-0.282951,-0.447707,0.998248,-0.604444,...,-0.087706,-0.25089,-0.706371,1.397451,-0.445537,-0.270131,-0.153883,-0.057,-0.015211,-0.337154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4318,-0.427449,0.227557,-0.675881,-0.285942,2.151888,-0.080742,-0.282951,-0.447707,-0.454033,-0.604444,...,-0.087706,-0.25089,-0.706371,1.397451,-0.445537,-0.270131,-0.153883,-0.057,-0.015211,2.403372
4319,0.676412,0.227557,0.131024,-0.170720,0.445769,-0.080742,-0.282951,-0.447707,0.417336,-0.604444,...,-0.087706,-0.25089,-0.706371,1.397451,-0.445537,-0.270131,-0.153883,-0.057,-0.015211,-0.203144
4320,-1.531311,-2.010374,-1.203472,-0.280553,0.445769,-0.080742,-0.282951,-0.447707,-1.023773,-0.604444,...,-0.087706,-0.25089,1.415687,-0.715589,-0.445537,-0.270131,-0.153883,-0.057,-0.015211,0.774014
4321,-0.427449,0.227557,-0.603466,-0.255008,0.445769,-0.080742,-0.282951,-0.447707,-0.375833,-0.604444,...,-0.087706,-0.25089,-0.706371,1.397451,-0.445537,-0.270131,-0.153883,-0.057,-0.015211,0.540648


In [243]:
predictions = pd.DataFrame(gbr_pca.predict(x_test))
predictions.to_csv('predictions.csv')

In [225]:
gbr.predict(x_test)

ValueError: Number of features of the model must match the input. Model n_features is 16 and input n_features is 26 

# Model Building

In [230]:
data = pd.read_csv('kc_house_data_train.csv')
data = data.drop('Unnamed: 0', axis=1)

In [231]:
data['ratio'] = data['sqft_living']/data['sqft_lot']

In [232]:
data = encode_grade(data)

In [233]:
x = data.drop(['price', 'zipcode', 'lat', 'long'], axis=1)
y = data['price']
# x_test = pd.read_csv('kc_house_data_test_features.csv')

In [234]:
x = x.set_index('id')
x = x.drop('date', axis=1)


In [235]:
ss = StandardScaler()
x = pd.DataFrame(ss.fit_transform(x))


In [236]:
x_train, x_test, y_train, y_test = train_test_split(x,y)


In [237]:
x_train2, x_test2, y_train2, y_test2 = train_test_split(x,y)


# Baseline

In [238]:
lr = LinearRegression()
lr.fit(x_train, y_train)
lr.score(x_train, y_train)

0.6984464197541488

In [239]:
lr.score(x_test, y_test)

0.6766681138549764

# Gradient boost

In [66]:
gbr = GradientBoostingRegressor()
gbr.fit(x_train, y_train)
gbr.score(x_train, y_train)

0.796222642000603

In [67]:
cross_val_score(gbr, x_test, y_test)

array([0.68693847, 0.61560014, 0.71650872, 0.70406202, 0.71597532])

# PCA

In [138]:
pca = PCA(n_components=3)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.fit_transform(x_test)

In [139]:
pca = PCA(n_components=3)
x_train_pca2 = pca.fit_transform(x_train2)
x_test_pca2 = pca.fit_transform(x_test2)

In [85]:
gbr_pca= GradientBoostingRegressor()
gbr_pca.fit(x_train_pca, y_train)
gbr_pca.score(x_train_pca, y_train)

0.7590532270775023

In [76]:
cross_val_score(gbr_pca, x_test_pca, y_test)

array([0.68750301, 0.70388706, 0.61036695, 0.71219501, 0.66635662])

### finding optimal n_components for pca

In [89]:
# max_feats = [10,8,6]
# scores = []
# for num in max_feats:
#     gbr_pca= GradientBoostingRegressor(max_features=num)
#     gbr_pca.fit(x_train_pca, y_train)
#     scores.append((num, gbr_pca.score(x_train_pca, y_train)))
#     print(f'done with {num} number of max feats')
# scores

# Optimal 

In [240]:
gbr_pca = GradientBoostingRegressor(n_estimators=500,  min_samples_split=3, verbose=1)
gbr_pca.fit(x_train2, y_train2)
gbr_pca.score(x_test2, y_test2)

      Iter       Train Loss   Remaining Time 
         1 118959622480.1141           37.40s
         2 107953701231.5737           32.29s
         3 98448831066.4784           33.54s
         4 90844066065.6932           33.21s
         5 84217559224.3214           31.86s
         6 78662534075.9526           30.71s
         7 73557838899.4765           29.69s
         8 69200955883.0480           28.81s
         9 65605095437.9484           28.09s
        10 62535719723.9858           27.23s
        20 45070111006.2471           23.26s
        30 37923762892.0710           22.71s
        40 34295378844.6169           22.42s
        50 31989931097.7658           21.46s
        60 30594507519.0900           20.59s
        70 29386421008.2003           19.99s
        80 28459028416.0673           19.77s
        90 27680470965.1230           19.19s
       100 27063484060.3038           18.64s
       200 23295273339.1070           14.18s
       300 21093107176.1212           11.19s
       

0.767641101986647

# GridSearchCV

In [102]:
gbr_pca_opt= GradientBoostingRegressor(criterion='mse', n_estimators=10, verbose=1)
gbr_pca_opt.fit(x_train_pca, y_train)
gbr_pca_opt.score(x_train_pca, y_train)

      Iter       Train Loss   Remaining Time 
         1 122875370273.7433            2.06s
         2 110394404488.5574            1.46s
         3 99936012577.2876            1.10s
         4 91352467698.3215            0.85s
         5 84058091973.6071            0.67s
         6 77982836137.5191            0.52s
         7 72842630477.9844            0.37s
         8 68536118385.0893            0.24s
         9 64812915566.8446            0.12s
        10 61779931409.6912            0.00s


0.5509992720401198

In [103]:
params = {'learning_rate': [0.1, 0.2, 0.4],
          'min_samples_split': [2, 3, 5]}

In [104]:
gs = GridSearchCV(gbr_pca_opt, params)
gs.fit(x_train_pca, y_train)

      Iter       Train Loss   Remaining Time 
         1 124488207080.0117            1.56s
         2 111535586159.2763            1.63s
         3 100968999671.4159            1.17s
         4 92060216533.3797            0.89s
         5 84670429061.9953            0.67s
         6 78376623973.6192            0.50s
         7 72994691687.8049            0.35s
         8 68510889632.4280            0.23s
         9 64765323713.8118            0.11s
        10 61651309718.2557            0.00s
      Iter       Train Loss   Remaining Time 
         1 123485207226.6198            0.78s
         2 110721535798.5718            0.72s
         3 100094881122.3578            0.66s
         4 91464491532.6912            0.58s
         5 84260967077.7431            0.48s
         6 77979788259.0272            0.38s
         7 72896990153.7111            0.29s
         8 68689056404.8197            0.20s
         9 64871092826.4461            0.11s
        10 61507158621.1300            0.00s
  

GridSearchCV(cv=None, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='mse', init=None,
                                                 learning_rate=0.1, loss='ls',
                                                 max_depth=3, max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=10,
                                                 n_iter_no_change=None,
                                                 presort='deprecated',
  

In [106]:
gs.best_estimator_

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='mse', init=None,
                          learning_rate=0.4, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=3,
                          min_weight_fraction_leaf=0.0, n_estimators=10,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=1, warm_start=False)

In [107]:
# grid search best_estimator_ output model (VERY OVERFIT)

# GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='mse', init=None,
#                           learning_rate=0.4, loss='ls', max_depth=3,
#                           max_features=None, max_leaf_nodes=None,
#                           min_impurity_decrease=0.0, min_impurity_split=None,
#                           min_samples_leaf=1, min_samples_split=3,
#                           min_weight_fraction_leaf=0.0, n_estimators=10,
#                           n_iter_no_change=None, presort='deprecated',
#                           random_state=None, subsample=1.0, tol=0.0001,
#                           validation_fraction=0.1, verbose=1, warm_start=False)

In [156]:
gbr_pca = GradientBoostingRegressor(n_estimators=500,  min_samples_split=3, verbose=1)
gbr_pca.fit(x_train2, y_train2)
gbr_pca.score(x_test2, y_test2)

      Iter       Train Loss   Remaining Time 
         1 125493800661.5281           22.97s
         2 113543622057.6961           26.29s
         3 103594050779.2841           26.14s
         4 95304720494.4557           24.34s
         5 88389149079.0906           23.05s
         6 82441451858.9953           22.41s
         7 76963870538.3639           21.85s
         8 72262634708.7123           21.24s
         9 68267160870.0110           20.68s
        10 64803015487.1105           20.23s
        20 46201309021.4866           17.95s
        30 38629264324.6173           17.56s
        40 34755949376.7570           17.16s
        50 32430723631.0676           16.57s
        60 30998711559.9802           16.35s
        70 29812066330.3973           15.83s
        80 28849694439.4422           15.37s
        90 28022442732.7863           15.17s
       100 27325185565.0321           14.79s
       200 23285518375.4471           11.21s
       300 21003408104.2001            7.51s
      

0.7635080861129032

In [122]:
gbr_pca.fit(x_train_pca2, y_train2)
gbr_pca.score(x_train_pca2, y_train2)

      Iter       Train Loss   Remaining Time 
         1 88708785196.1104            1.79m
         2 65295883566.9057            1.50m
         3 54949160123.2202            1.32m
         4 48851259209.9144            1.21m
         5 46060424240.8074            1.14m
         6 43982121904.4224            1.09m
         7 42408502389.2186            1.04m
         8 41226848788.1739            1.02m
         9 40306846834.7694            1.01m
        10 39823513586.2918           59.48s
        20 35822877815.1472           56.19s
        30 33176704060.1389           54.99s
        40 31023832239.6091           52.61s
        50 29863353124.1103           50.79s
        60 28235389785.2225           50.04s
        70 27269054186.7039           48.80s
        80 26394036596.4370           49.15s
        90 25542363316.2314           50.01s
       100 24731362543.0808           49.45s
       200 18804800794.5928           43.94s
       300 15602345655.7715           38.31s
       40

0.9557792502732567

In [123]:
rms2_train = sqrt(mean_squared_error(y_train2, gbr_pca.predict(x_train_pca2)))
rms2_train

78678.25434202689

In [125]:
gbr_pca.fit(x_train_pca2, y_train2)
gbr_pca.score(x_test_pca2, y_test2)

      Iter       Train Loss   Remaining Time 
         1 88708785196.1104            1.39m
         2 65295883566.9057            1.32m
         3 54949160123.2202            1.15m
         4 48851259209.9143            1.08m
         5 46060424240.8074            1.03m
         6 43982121904.4224            1.08m
         7 42408502389.2186            1.05m
         8 41226848788.1739            1.03m
         9 40306846834.7694            1.01m
        10 39823513586.2918           59.21s
        20 35822877815.1472           50.99s
        30 33176704060.1389           49.71s
        40 31023832239.6091           47.74s
        50 29863353124.1103           47.09s
        60 28235389785.2225           46.55s
        70 27269054186.7039           46.18s
        80 26394036596.4370           45.38s
        90 25542363316.2314           44.93s
       100 24731362543.0808           44.57s
       200 18804800794.5928           41.79s
       300 15602345655.7715           38.04s
       40

-2.7331904052099714

In [None]:
rms2_test = sqrt(mean_squared_error(y_test2, gbr_pca.predict(x_test_pca2)))
rms2_test

In [117]:
from sklearn.metrics import mean_squared_error
from math import sqrt

rms = sqrt(mean_squared_error(y_train, gbr_pca.predict(x_train_pca)))

In [118]:
rms

75402.01472962025

In [116]:
cross_val_score(gbr_pca, x_train_pca, y_train, scoring='neg_root_mean_squared_error')

      Iter       Train Loss   Remaining Time 
         1 87817564990.0472            1.34m
         2 65058102653.8049            1.51m
         3 53618933352.7120            1.47m
         4 48581286005.4258            1.31m
         5 45427103962.7582            1.19m
         6 43118847988.2225            1.10m
         7 41711629534.5741            1.05m
         8 40530411949.1172            1.01m
         9 39833776529.3797           58.87s
        10 39314293763.7881           57.33s
        20 35350118220.5889           50.69s
        30 32609109381.3944           48.91s
        40 29814877791.7152           49.62s
        50 27980268238.0792           50.51s
        60 26332951025.2575           48.83s
        70 25118813322.7857           47.35s
        80 24052066848.0023           46.39s
        90 22927199930.8862           45.68s
       100 22211426986.8578           44.86s
       200 16274323677.5728           38.65s
       300 13040128615.4458           34.46s
       40

array([-214381.03554666, -217225.42452155, -208960.95322306,
       -218479.51829197, -242828.33110526])

In [114]:
import sklearn


In [115]:
sorted(sklearn.metrics.SCORERS.keys())


['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_weighted',
 'v_measure_score']