In [613]:
import numpy as np
import pandas as pd
import os

np.random.seed(165)

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10

#path="C:\\Users\jsciamma\Documents\Patrick\ML\Kaggle\HousePrices"
path="C:\\Users\\Patrick\ML\HousePrices"
os.chdir(path)
os.getcwd()

train = pd.read_csv("data\\train.csv")
test = pd.read_csv("data\\test.csv")

In [614]:
traintrain = train.iloc[0:1168, ]
traintest = train.iloc[1168:, ]

In [648]:
train.reset_index(inplace=True)
test.reset_index(inplace=True)
test.head()
train.set_index('Id', inplace=True)
test.set_index('Id', inplace=True)
train.loc[train.index[0:1168],'Set']='traintrain'
train.loc[train.index[1168:],'Set']='traintest'
test['Set']='test'


## count variables

In [644]:
def correct_count_variables():
    count_variables = ['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 
                       'GarageCars', 'BedroomAbvGr','KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces']
    full.loc[full.BsmtFullBath.isnull(), 'BsmtFullBath'] = 0
    full.loc[full.BsmtHalfBath.isnull(), 'BsmtHalfBath'] = 0
    full.loc[full.GarageCars.isnull(), 'GarageCars'] = 0
    return count_variables

## convert some categories to main distinguishing feature

In [667]:
def convert_to_binary():
    full['Bin_SaleCondition_Partial'] = (full.SaleCondition == 'Partial').astype(int)
    full['Bin_SaleType_New'] = (full.SaleType == 'New').astype(int)
    full['Bin_Condition_Artery'] = ((full.Condition1 == 'Artery') | (full.Condition2 == 'Artery')).astype(int)
    full['Bin_Functional_Typ'] = (full.Functional == 'Typ').astype(int)
    full['Bin_CentralAir_Y'] = (full.CentralAir == 'Y').astype(int)
    full['Bin_Electrical'] = (full.Electrical == 'SBrkr').astype(int)
    full['Bin_Heating'] = (full.Heating == 'GasA').astype(int)
    full['Bin_LandContour'] = (full.LandContour == 'Bnk').astype(int)
    return ['SaleCondition', 'SaleType', 'Condition1', 'Condition2', 'Functional', 'CentralAir',
           'Electrical', 'Heating', 'LandContour'], list(full.filter(regex='Bin_').columns)

## convert surface to ratios (as we are solving for price per sq ft

In [670]:
def convert_surfaces():
    surfaces = (list(full.filter(regex='SF').columns) + 
                ['LotArea','MasVnrArea','GarageArea','EnclosedPorch',
                 '3SsnPorch','ScreenPorch','PoolArea', 'LotFrontage'])
    # replace NAs with 0s where correct
    for c in surfaces:
        full.loc[np.isnan(full[c]), c] = 0
    # create surface ratio to GrLiveArea
    for c in surfaces:
        full["prop_" + c] = full[c] / full.GrLivArea
    return surfaces, ["prop_" + c for c in surfaces]

## convert quality indicators

In [663]:
# convert quality indicators to a linear scale
def convert_quality():
    for c in ['Street', 'GarageQual', 'GarageCond', 'LotShape', 'LandSlope', 'ExterQual', 
              'ExterCond', 'BsmtCond', 'BsmtQual', 'HeatingQC', 'KitchenQual', 
              'FireplaceQu', 'GarageFinish', 'BsmtFinType1', 'BsmtFinType2']:
        full.loc[full[c].isnull(), c] = 'NA'
    cleanup_nums = {
        "GarageQual":     {"NA" : 0, "Po": 1, "Fa": 2, "TA": 3, "Gd" : 4, "Ex" : 5},
        "GarageCond":     {"NA" : 0, "Po": 1, "Fa": 2, "TA": 3, "Gd" : 4, "Ex" : 5},
        "LotShape":     {"Reg" : 0, "IR1": 1, "IR2": 2, "IR3": 3},
        "LandSlope":     {"Gtl" : 0, "Mod": 1, "Sev": 2},
        "ExterQual":     {"NA" : 0, "Po": 1, "Fa": 2, "TA": 3, "Gd" : 4, "Ex" : 5},
        "ExterCond":     {"NA" : 0, "Po": 1, "Fa": 2, "TA": 3, "Gd" : 4, "Ex" : 5},
        "BsmtCond":     {"NA" : 0, "Po": 1, "Fa": 2, "TA": 3, "Gd" : 4, "Ex" : 5},
        "BsmtQual":     {"NA" : 0, "Po": 1, "Fa": 2, "TA": 3, "Gd" : 4, "Ex" : 5},
        "HeatingQC":     {"Po": 1, "Fa": 2, "TA": 3, "Gd" : 4, "Ex" : 5},
        "KitchenQual":     {"NA" : 0,"Po": 1, "Fa": 2, "TA": 3, "Gd" : 4, "Ex" : 5},
        "FireplaceQu":     {"NA" : 0,"Po": 1, "Fa": 2, "TA": 3, "Gd" : 4, "Ex" : 5},
        "GarageFinish":     {"NA" : 0,"Unf": 1, "RFn": 2, "Fin": 3},
        "BsmtFinType1":     {"NA" : 0,"Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ":5, "GLQ":6},
        "BsmtFinType2":     {"NA" : 0,"Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ":5, "GLQ":6},
    }
    full.replace(cleanup_nums, inplace=True)
    return ['GarageQual', 'GarageCond', 'LotShape', 'LandSlope', 'ExterQual', 'ExterCond', 
            'BsmtCond', 'BsmtQual', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageFinish',
            'BsmtFinType1', 'BsmtFinType2', 'OverallCond', 'OverallQual']
        


## put it all together

In [671]:
full = pd.concat([train, test], )
count_variables = correct_count_variables()
drop_binary_variables, binary_variables = convert_to_binary()
drop_surface_variables, surface_variables = convert_surfaces()
quality_variables = convert_quality()

In [657]:
def print_info():
    for c in [count_variables, binary_variables, surface_variables, quality_variables]:
        print(full[c].info())
#print_info()

In [672]:
newfields = count_variables + quality_variables + binary_variables + surface_variables
oldfields = drop_binary_variables + drop_surface_variables
full.drop(newfields + oldfields, axis=1).info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 1 to 2919
Data columns (total 31 columns):
Alley           198 non-null object
BldgType        2919 non-null object
BsmtExposure    2837 non-null object
Exterior1st     2918 non-null object
Exterior2nd     2918 non-null object
Fence           571 non-null object
Foundation      2919 non-null object
GarageType      2762 non-null object
GarageYrBlt     2760 non-null float64
GrLivArea       2919 non-null int64
HouseStyle      2919 non-null object
LotConfig       2919 non-null object
MSSubClass      2919 non-null int64
MSZoning        2915 non-null object
MasVnrType      2895 non-null object
MiscFeature     105 non-null object
MiscVal         2919 non-null int64
MoSold          2919 non-null int64
Neighborhood    2919 non-null object
PavedDrive      2919 non-null object
PoolQC          10 non-null object
RoofMatl        2919 non-null object
RoofStyle       2919 non-null object
SalePrice       1460 non-null float64
Set         

In [50]:
# quick tests
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error


Xt = full.loc[full['Set']=='traintrain', ['MeanSalePricePerSqft', 'GrLivArea']]
Xt[]

Xt = Xt['MeanSalePricePerSqft']
yt = full3.loc[full3['Set']=='traintrain', ['SalePricePerSqft']]
lr = LinearRegression()
lr.fit(Xt.values.reshape(-1, 1), yt)
print(lr.score(Xt.values.reshape(-1, 1),yt))
print(np.sqrt(mean_squared_log_error(yt, lr.predict(Xt.values.reshape(-1, 1)))))
lr.coef_


SyntaxError: invalid syntax (<ipython-input-50-330af818d036>, line 8)

In [8]:
print(t.GarageQual.describe())
print(full2.GarageQual.describe())

count    2919.000000
mean        2.800959
std         0.715863
min         0.000000
25%         3.000000
50%         3.000000
75%         3.000000
max         5.000000
Name: GarageQual, dtype: float64
count    2919.000000
mean        2.800959
std         0.715863
min         0.000000
25%         3.000000
50%         3.000000
75%         3.000000
max         5.000000
Name: GarageQual, dtype: float64


In [106]:
list(cleanup_nums.keys())

['GarageQual',
 'GarageCond',
 'LotShape',
 'LandSlope',
 'ExterQual',
 'ExterCond',
 'BsmtCond',
 'BsmtQual',
 'HeatingQC',
 'KitchenQual',
 'FireplaceQu',
 'GarageFinish']

In [None]:
def missing_values_table(df): 
        mis_val = df.isnull().sum()
        mis_val_percent = 100 * df.isnull().sum()/len(df)
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        return mis_val_table_ren_columns 

missing_values_table(full)

In [None]:
filter_col = [col for col in full if col.startswith('Bsmt')]

#full.loc[full.TotalBsmtSF ==0, filter_col].info()
full.Functional.value_counts()
import scipy.interpolate as sp
import pylab

fl = sp.interp1d(full[, y,kind='linear')



In [None]:
t_nonulls = traintrain.loc[:,traintrain.isnull().sum() == 0]
traintrain.YrSold.hist()

In [None]:
def create_dummies(x):
    y = x.drop('Neighborhood', axis=1).dropna(axis=1).loc[:, x.dtypes == object]
    return pd.get_dummies(y, columns=y.columns, drop_first=True)

def select_nonobjects(x):
    return x.dropna(axis=1).loc[:, x.dtypes != object]

def add_neighborhood_mean(x):
    return pd.DataFrame(index = x.index, data = x.groupby('Neighborhood').SalePrice.transform(np.mean)
                       ).rename(columns={'SalePrice':'Neighborhoodmean'})

def transform_data(x):
    return pd.concat([create_dummies(x),select_nonobjects(x), add_neighborhood_mean(x)], axis=1)

#x1 = create_dummies(traintrain)
#x2 = select_nonobjects(traintrain)
#x3 = add_neighborhood_mean(traintrain)

traintrain_transform = transform_data(traintrain)


In [None]:
traintrain_transform.info()

In [None]:
#create_dummies(traintrain.drop('Neighborhood', axis=1)).corrwith(traintrain.SalePrice).sort_values(ascending=False)
t = traintrain_transform.corrwith(traintrain_transform.SalePrice).sort_values(ascending=False)
fields = t[abs(t) > 0.25].index
fields
def filter_data(traintrain_transform, fields):
    return traintrain_transform.loc[:, fields].drop('SalePrice', axis=1)

traintrain_transform_filter = filter_data(traintrain_transform, fields)
traintrain_transform_filter.info()

In [None]:

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

lin_reg = Pipeline([
    ('std_scaler', StandardScaler()),
    ('lin_reg', LinearRegression()),
])

lin_reg.fit(traintrain_transform_filter, np.log(traintrain.SalePrice))
lin_reg.score(traintrain_transform_filter,np.log(traintrain.SalePrice))

## add variables

In [543]:
def create_cross_variables(x, y):
    full[x + '_x_' + y] = full[x] * full[y]
create_cross_variables('prop_TotalBsmtSF', 'BsmtQual')
create_cross_variables('prop_TotalBsmtSF', 'BsmtCond')
create_cross_variables('prop_LotArea', 'LotShape')
create_cross_variables('prop_LotArea', 'LandSlope')
create_cross_variables('prop_BsmtFinSF1', 'BsmtFinType1')
create_cross_variables('prop_BsmtFinSF2', 'BsmtFinType2')
create_cross_variables('prop_GarageArea', 'GarageQual')
create_cross_variables('prop_GarageArea', 'GarageCond')

cross_variables = full.filter(regex='_x_').columns
full[cross_variables].info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 1 to 2919
Data columns (total 8 columns):
prop_TotalBsmtSF_x_BsmtQual       2919 non-null float64
prop_TotalBsmtSF_x_BsmtCond       2919 non-null float64
prop_LotArea_x_LotShape           2919 non-null float64
prop_LotArea_x_LandSlope          2919 non-null float64
prop_BsmtFinSF1_x_BsmtFinType1    2919 non-null float64
prop_BsmtFinSF2_x_BsmtFinType2    2919 non-null float64
prop_GarageArea_x_GarageQual      2919 non-null float64
prop_GarageArea_x_GarageCond      2919 non-null float64
dtypes: float64(8)
memory usage: 205.2 KB


## set up fields and training set

In [610]:
#fields = ['lGrLivArea','MeanlSalePrice', 'OverallQual', 'OverallCond', 'TotalBsmtSF', 'GarageArea', 'LotArea', 
#          'YearBuilt', 'YearRemodAdd', 'MiscVal', 'LotAreaxLotShape', 'lGrLivAreaxMeanlSalePrice', 'LotFrontage',
#         'LotAreaxLandSlope', 'lGrLivAreaxOverallQual', 'lGrLivAreaxOverallQual'] + list(cleanup_nums.keys())
fields = (['OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd'] + 
#          list(count_variables) +
          list(grouped_variables) + list(binary_variables) + 
          list(prop_variables) + list(cross_variables) +
          list(cleanup_nums.keys()))
#fields
# define training set
X = full.loc[full.Set=='traintrain', fields]
y = full.loc[full.Set=='traintrain', 'SalePricePerSqft']

In [611]:
fields

['OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'Mean_SalePricePerSqft_Neighborhood',
 'Mean_SalePricePerSqft_MSSubClass',
 'Mean_SalePricePerSqft_MSZoning',
 'Mean_SalePricePerSqft_BldgType',
 'Mean_SalePricePerSqft_HouseStyle',
 'Bin_SaleCondition_Partial',
 'Bin_SaleType_New',
 'Bin_Condition_Artery',
 'Bin_Functional_Typ',
 'Bin_CentralAir_Y',
 'Bin_Electrical',
 'Bin_Heating',
 'Bin_LandContour',
 'prop_1stFlrSF',
 'prop_2ndFlrSF',
 'prop_BsmtFinSF1',
 'prop_BsmtFinSF2',
 'prop_BsmtUnfSF',
 'prop_LowQualFinSF',
 'prop_OpenPorchSF',
 'prop_TotalBsmtSF',
 'prop_WoodDeckSF',
 'prop_LotArea',
 'prop_MasVnrArea',
 'prop_GarageArea',
 'prop_EnclosedPorch',
 'prop_3SsnPorch',
 'prop_ScreenPorch',
 'prop_PoolArea',
 'prop_TotalBsmtSF_x_BsmtQual',
 'prop_TotalBsmtSF_x_BsmtCond',
 'prop_LotArea_x_LotShape',
 'prop_LotArea_x_LandSlope',
 'prop_BsmtFinSF1_x_BsmtFinType1',
 'prop_BsmtFinSF2_x_BsmtFinType2',
 'prop_GarageArea_x_GarageQual',
 'prop_GarageArea_x_GarageCond',
 'Gar

## linear regression

In [609]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_squared_log_error



simple_lin_reg=LinearRegression()
simple_lin_reg.fit(X, y)
print("score on training set", simple_lin_reg.score(X, y))
print("mean squared error on training set", np.sqrt(mean_squared_log_error(y, simple_lin_reg.predict(X))))
mean_squared_error
a = np.sqrt(-cross_val_score(simple_lin_reg, X, y, cv=10, scoring='neg_mean_squared_log_error'))
print("cross val score", a)
print("mean cross val score", a.mean())

score on training set 0.787184680094
mean squared error on training set 0.130423076782
cross val score [ 0.16184329  0.12692411  0.1097008   0.12850595  0.19382135  0.15940695
  0.15397156  0.11845307  0.13285687  0.11842078]
mean cross val score 0.140390472218


## Helper functions

In [401]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

def training_diagnostics(model):
    print("score on training set", model.score(X, y))
    print("mean squared error on training set", np.sqrt(mean_squared_log_error(y, model.predict(X))))

def cv_diagnostics(model):
    a = np.sqrt(-cross_val_score(model, X, y, cv=10, scoring="neg_mean_squared_log_error"))
    print("cross val score", a)
    print("mean cross val score", a.mean())

## Ridge Regression

In [603]:
from sklearn.linear_model import Ridge

ridge_reg = Pipeline([
    ('std_scaler', StandardScaler()),
    ('ridge_reg', Ridge()),
])
ridge_reg.set_params(ridge_reg__alpha=50, ridge_reg__solver="cholesky").fit(X, y)
training_diagnostics(ridge_reg)


score on training set 0.783623848455
mean squared error on training set 0.130853966648


In [604]:
cv_diagnostics(ridge_reg)

cross val score [ 0.14502201  0.12682763  0.11180431  0.13007685  0.19168705  0.15407511
  0.15116939  0.11729305  0.13325347  0.11770263]
mean cross val score 0.137891150188


In [575]:
# cross validated grid search
parameters = {'ridge_reg__alpha':[1, 10, 20, 50, 75, 100,125],
             }
ridge_reg_CV = GridSearchCV(ridge_reg, parameters, scoring = 'neg_mean_squared_log_error').fit(X, y)
print(ridge_reg_CV.best_estimator_)
print(np.sqrt(-ridge_reg_CV.best_score_))


Pipeline(memory=None,
     steps=[('std_scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('ridge_reg', Ridge(alpha=75, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='cholesky', tol=0.001))])
0.140402267903


## Elastic Net

In [408]:
from sklearn.linear_model import ElasticNet

elastic_net = Pipeline([
    ('std_scaler', StandardScaler()),
    ('elastic_net', ElasticNet()),
])
elastic_net.set_params(elastic_net__alpha=0.01, elastic_net__l1_ratio=0.25).fit(X, y)
training_diagnostics(elastic_net)

score on training set 0.761101446746
mean squared error on training set 0.137822383032


In [409]:
cv_diagnostics(elastic_net)

cross val score [ 0.15108381  0.13420401  0.13135774  0.14030226  0.19709333  0.15807656
  0.15223331  0.12141879  0.14038128  0.1201753 ]
mean cross val score 0.144632638877


In [410]:
# cross validated grid search
parameters = {'elastic_net__alpha':[0.01, 0.01, 0.1, 1, 10],'elastic_net__l1_ratio':[0.01, 0.25, 0.5, 0.75, 1]
             }
elastic_net_CV = GridSearchCV(elastic_net, parameters, scoring = 'neg_mean_squared_log_error').fit(X, y)
print(elastic_net_CV.best_estimator_)
print(np.sqrt(-elastic_net_CV.best_score_))


Pipeline(memory=None,
     steps=[('std_scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('elastic_net', ElasticNet(alpha=1, copy_X=True, fit_intercept=True, l1_ratio=1,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False))])
0.146120578308


## Random Forest

In [411]:
from sklearn.ensemble import RandomForestRegressor

In [605]:
#rf = Pipeline([
#    ('std_scaler', StandardScaler()),
#    ('rf', RandomForestRegressor()),
#])
rf = RandomForestRegressor()
#rf.set_params(rf__n_estimators=100, rf__max_features='sqrt', rf__oob_score =True).fit(X, y)
rf.set_params(n_estimators=100, max_features='sqrt').fit(X, y)
training_diagnostics(rf)
#print(rf.oob_score_)

score on training set 0.964322388534
mean squared error on training set 0.0579900941113


In [606]:
#rf.feature_importances_
sorted(list(zip(X.columns, rf.feature_importances_)), key=lambda tup: tup[1], reverse=True)

[('prop_TotalBsmtSF_x_BsmtQual', 0.11529574224454894),
 ('prop_TotalBsmtSF_x_BsmtCond', 0.072028554659931701),
 ('Mean_SalePricePerSqft_Neighborhood', 0.071452165127590825),
 ('prop_BsmtFinSF1_x_BsmtFinType1', 0.059630537370720028),
 ('prop_TotalBsmtSF', 0.052892485873976484),
 ('YearBuilt', 0.052884493608616516),
 ('Mean_SalePricePerSqft_MSSubClass', 0.050631543988708019),
 ('YearRemodAdd', 0.040281226556279949),
 ('prop_GarageArea_x_GarageQual', 0.040249781674735051),
 ('OverallQual', 0.039401143098688494),
 ('prop_GarageArea_x_GarageCond', 0.035995640424338296),
 ('prop_BsmtFinSF1', 0.034049798859287959),
 ('prop_GarageArea', 0.030168747765077185),
 ('OverallCond', 0.023631153861835549),
 ('ExterQual', 0.023180441742942801),
 ('prop_LotArea', 0.022883710526856085),
 ('BsmtQual', 0.019168686327888766),
 ('prop_BsmtUnfSF', 0.017066065739204772),
 ('BsmtFinType1', 0.017063931135734917),
 ('prop_1stFlrSF', 0.016436973625834853),
 ('KitchenQual', 0.013211313713505533),
 ('GarageFinish', 

In [595]:
cv_diagnostics(rf)

cross val score [ 0.16408308  0.13514993  0.13650467  0.16245449  0.19228975  0.17470997
  0.15883838  0.14557962  0.15619055  0.15264292]
mean cross val score 0.157844336014


In [414]:
# cross validated grid search
parameters = {'max_features':[0.1, 0.25, 0.5, 0.75],
              'max_depth':[2, 3, 10, 20]
             }
rf_CV = GridSearchCV(rf, parameters, scoring = 'neg_mean_squared_log_error').fit(X, y)
print(rf_CV.best_estimator_)
print(np.sqrt(-rf_CV.best_score_))


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features=0.5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
0.141912477685


## SVM

In [556]:
from sklearn.svm import LinearSVR
svm_reg = Pipeline([
    ('std_scaler', StandardScaler()),
    ('svm_reg', LinearSVR()),
])
# use epsilon=0.11, C=.7 from grid search below
svm_reg.set_params(svm_reg__epsilon=0.01, svm_reg__C=.8).fit(X, y)


Pipeline(memory=None,
     steps=[('std_scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svm_reg', LinearSVR(C=0.8, dual=True, epsilon=0.01, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0))])

In [557]:
print("score on training set", svm_reg.score(X, y))
print("mean squared error on training set", np.sqrt(mean_squared_error(y, svm_reg.predict(X))))
a = np.sqrt(-cross_val_score(svm_reg, X, y, cv=10, scoring="neg_mean_squared_log_error"))
print("cross val score", a)
print("mean cross val score", a.mean())


score on training set 0.766531793676
mean squared error on training set 14.9270352428
cross val score [ 0.15086372  0.13277998  0.10735298  0.13956555  0.19596882  0.15754123
  0.14917854  0.11951871  0.13192685  0.12002258]
mean cross val score 0.14047189678


## svm, cross validated grid search

In [554]:
from sklearn.model_selection import GridSearchCV
parameters = {'svm_reg__epsilon':[0.009, 0.1, 0.11],
             'svm_reg__C':[0.5, 0.6, 0.7, 0.8]}
svm_reg_CV = GridSearchCV(svm_reg, parameters, scoring = 'neg_mean_squared_log_error')
svm_reg_CV.fit(X, y)


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('std_scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svm_reg', LinearSVR(C=0.5, dual=True, epsilon=0.09, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'svm_reg__epsilon': [0.009, 0.1, 0.11], 'svm_reg__C': [0.5, 0.6, 0.7, 0.8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_log_error', verbose=0)

In [555]:
print(sorted(svm_reg_CV.cv_results_.keys()))
print(svm_reg_CV.best_estimator_)
print(np.sqrt(-svm_reg_CV.best_score_))

['mean_fit_time', 'mean_score_time', 'mean_test_score', 'mean_train_score', 'param_svm_reg__C', 'param_svm_reg__epsilon', 'params', 'rank_test_score', 'split0_test_score', 'split0_train_score', 'split1_test_score', 'split1_train_score', 'split2_test_score', 'split2_train_score', 'std_fit_time', 'std_score_time', 'std_test_score', 'std_train_score']
Pipeline(memory=None,
     steps=[('std_scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svm_reg', LinearSVR(C=0.8, dual=True, epsilon=0.1, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0))])
0.147524685933


## SVM poly reg

In [None]:
from sklearn.svm import SVR
svm_poly_reg = SVR(kernel="poly", degree=2, C=100, epsilon=0.1)
svm_poly_reg = Pipeline([
    ('std_scaler', StandardScaler()),
    ('svm_poly_reg', SVR()),
])

svm_poly_reg.set_params(
    svm_poly_reg__kernel="poly", svm_poly_reg__degree=2, svm_poly_reg__C=0.7, svm_poly_reg__epsilon=0.11).fit(X, y)


In [None]:
print("score on training set", svm_poly_reg.score(X, y))
print("mean squared error on training set", np.sqrt(mean_squared_error(y, svm_poly_reg.predict(X))))

In [None]:
full2.loc[:, fields + list(cleanup_nums.keys())].info()

In [None]:
from sklearn.ensemble import RandomForestRegressor
RF_reg = RandomForestRegressor(n_estimators=200)
RF_reg.fit(X, y)
print("score on training set", RF_reg.score(X, y))

a = np.sqrt(-cross_val_score(RF_reg, X, y, cv=10, scoring="neg_mean_squared_error"))
print(a)
print(a.mean())

In [None]:
?lin_reg.score

In [None]:
np.sqrt(-cross_val_score(lin_reg, traintrain_transform_filter, np.log(traintrain.SalePrice), cv=10, scoring="neg_mean_squared_error"))


In [None]:
from sklearn.ensemble import RandomForestRegressor

RF_reg = RandomForestRegressor(n_estimators=100)
RF_reg.fit(traintrain_transform_filter, np.log(traintrain.SalePrice))
RF_reg.score(traintrain_transform_filter,np.log(traintrain.SalePrice))


In [None]:
def cross_val(model, data, price):
    return np.sqrt(-cross_val_score(model, data, np.log(price), cv=10, scoring="neg_mean_squared_error"))

cross_val(RF_reg, traintrain_transform_filter, traintrain.SalePrice)

## Test

In [None]:
X = full2.loc[full2.Set=='traintest', fields]
y = full2.loc[full2.Set=='traintest', 'lSalePrice']
predicted = simple_lin_reg.predict(X)
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y, predicted))

In [None]:
d = pd.DataFrame((traintrain.groupby('Neighborhood')
                  .SalePrice.agg(np.mean))
                 .reset_index()
                 .rename(columns={'SalePrice': 'Neighborhoodmean'})
                )
traintest_transform = pd.merge(traintest, d, on='Neighborhood', how='left')
traintest_transform = pd.concat([create_dummies(traintest_transform),select_nonobjects(traintest_transform)], axis=1)
traintest_transform_filter = filter_data(traintest_transform, fields)
traintest_transform_filter.Electrical_SBrkr = 0
traintest_transform_filter.info()
#traintest_transform_filter
#traintrain.Neighborhood
#print(traintest.loc[:, ['Id','Neighborhood']])
#print(d)
#print(traintest_transform.loc[0:10, ['Id','Neighborhood', 'Neighborhoodmean']])
#pd.merge(traintest.loc[:, ['Id','Neighborhood']], d, on='Neighborhood', how='left')
#traintest.loc[traintest.index[0:10], ['Id','Neighborhood']]

In [None]:
list(zip(traintrain_transform_filter.columns, traintest_transform_filter.columns))

In [None]:
lin_reg.score(traintest_transform_filter,np.log(traintest.SalePrice))

In [None]:
traintest_transform_filter.corrwith(other=traintest.SalePrice)

In [None]:
predicted = lin_reg.predict(traintest_transform_filter)

In [None]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(np.log(traintest.SalePrice), predicted))

In [None]:
predicted_RF = RF_reg.predict(traintest_transform_filter)
np.sqrt(mean_squared_error(np.log(traintest.SalePrice), predicted_RF))

## Submission

In [None]:
traintest2['Neighborhoodmean'].describe()

In [None]:
# re-create full training set
full = pd.concat([traintrain, traintest2])
# re-fit on full set
final_lin_reg = LinearRegression()
final_lin_reg.fit(full[fields], np.log(full.SalePrice))
final_lin_reg.score(full[fields],np.log(full.SalePrice))

In [None]:
# add neighborhood means
test2 = pd.merge(test, d, on='Neighborhood')

In [None]:
test2.groupby('Neighborhood').mean().loc[:, 'Neighborhoodmean']

In [None]:
test2[fields].info()

In [None]:
test2predicted = np.exp(lin_reg.predict(test2[fields]))

In [None]:
df = pd.DataFrame(index=test2.Id, data=test2predicted, columns=['SalePrice'])

In [None]:
df

## Simple submission

In [None]:
# 

In [159]:
# re-fit on full set
X = full2.loc[(full2.Set=='traintrain') | (full2.Set=='traintest'), fields]
y = full2.loc[(full2.Set=='traintrain') | (full2.Set=='traintest'),  'lSalePrice']
#simple_lin_reg=LinearRegression()
#simple_lin_reg.fit(X, y)
#svm_reg.fit(X, y)
ridge_reg.fit(X, y)
predicted = np.exp(ridge_reg.predict(full2.loc[(full2.Set=='test'), fields]))

In [160]:
full2.loc[(full2.Set=='test'), ].index
df = pd.DataFrame(index=full2.loc[(full2.Set=='test'), ].index, data=predicted, columns=['SalePrice'])

In [161]:
if True:
    df.to_csv("output\\result_20171122_1533.csv")