In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the training and test tables from CSV files in the working directory.
data, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Feature engineering shared by the training frame (`data`) and the test frame (`test`).

REFERENCE_YEAR = 2018  # year from which the "age" features are measured

# source year column -> (derived age column, upper cap on the age in years)
AGE_FEATURES = {
    'GarageYrBlt': ('garageAge', 60),
    'YearRemodAdd': ('remodelAge', 100),
    'YearBuilt': ('BldAge', 120),
}


def add_age_features(frame):
    """Add capped "years elapsed" columns to ``frame`` in place.

    For every entry of ``AGE_FEATURES`` the derived column holds
    ``REFERENCE_YEAR - year``, limited to at most the configured cap.
    A missing year yields the cap itself, which is what the earlier per-row
    ``min(cap, REFERENCE_YEAR - year)`` produced for NaN input.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain every source column named in ``AGE_FEATURES``.
    """
    for year_col, (age_col, cap) in AGE_FEATURES.items():
        elapsed = REFERENCE_YEAR - frame[year_col]
        frame[age_col] = elapsed.clip(upper=cap).fillna(cap)


# train: transform year-something to how many years have elapsed
add_age_features(data)

# Transform neighborhoods to ordinal, using each neighborhood's mean SalePrice
# as its value. 'SalePrice' is selected *before* aggregating so that only that
# column is averaged; averaging the whole frame fails on non-numeric columns
# in pandas >= 2.0.
# (The variable name is kept as-is because a later cell refers to it.)
neighborshoods = data.groupby('Neighborhood')['SalePrice'].mean().to_dict()

# test: same age features as the training frame
add_age_features(test)
# 150 only appears in the test set (once), arbitrarily change to something so that the code runs
test.loc[test['MSSubClass'] == 150, 'MSSubClass'] = 20
# Fill the missing BsmtFinSF1 value by content rather than by a hard-coded row
# label, so the fix still applies if the row order of the file changes.
test['BsmtFinSF1'] = test['BsmtFinSF1'].fillna(0)

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

In [5]:
from feature_tester import FeatureTester, observe_feature, get_dict, get_list

After trying several approaches, I decided to add features systematically, one at a time. After each addition I check two things: whether the new feature adds at least 0.001 to the score, and whether any previously added feature no longer contributes because of co-dependence with the new one.

In [6]:
# Build the tester on the training frame with SalePrice as the target.
ft = FeatureTester(data, random_seed=22, precision=3)
ft.set_y('SalePrice')

# (label, estimator) pairs, registered in exactly this order.
for label, estimator in [
    ('Linear', LinearRegression()),
    ('Forest', RandomForestRegressor(n_estimators=10)),
    ('Ada   ', AdaBoostRegressor()),
    ('GBoost', GradientBoostingRegressor()),
]:
    ft.add_estimator(estimator, label)

# Positional arguments for each add_feature call, in registration order.
# One-element entries use add_feature's default handling; 'ordinal' entries
# carry an explicit category -> value mapping.
# NOTE(review): the 'nan' keys presumably tell FeatureTester how to encode
# missing values — confirm in feature_tester.
feature_specs = [
    ('OverallQual',),
    ('BsmtFinSF1',),
    ('GrLivArea',),
    ('MSSubClass', 'categorical'),
    ('ExterQual', 'categorical'),
    ('LandContour', 'ordinal', {'Lvl': 1, 'Bnk': 0, 'Low': 2, 'HLS': 3}),
    ('BsmtExposure', 'ordinal', {'No': 0, 'Gd': 3, 'Mn': 1, 'Av': 2, 'nan': 0}),
    ('FireplaceQu', 'ordinal', {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0, 'nan': 0}),
    ('remodelAge',),
    ('BldAge',),
    ('Neighborhood', 'ordinal', neighborshoods),
]
for spec in feature_specs:
    ft.add_feature(*spec)

# Last expression of the cell: its return value (one row per estimator label)
# is what the notebook displays.
ft.fit()

array([['Linear', '0.809'],
       ['Forest', '0.872'],
       ['Ada   ', '0.826'],
       ['GBoost', '0.893']], dtype='<U32')

In [7]:
# Print, for every registered feature, one value per estimator label.
# NOTE(review): presumably the score change attributable to that feature —
# confirm the exact metric in feature_tester.
ft.score_all_features()

OverallQual
[['Linear' '0.015']
 ['Forest' '-0.016']
 ['Ada   ' '0.017']
 ['GBoost' '0.011']]
BsmtFinSF1
[['Linear' '-0.004']
 ['Forest' '0.008']
 ['Ada   ' '0.024']
 ['GBoost' '0.017']]
GrLivArea
[['Linear' '0.029']
 ['Forest' '0.044']
 ['Ada   ' '0.065']
 ['GBoost' '0.052']]
MSSubClass
[['Linear' '0.012']
 ['Forest' '-0.013']
 ['Ada   ' '-0.002']
 ['GBoost' '0.004']]
ExterQual
[['Linear' '0.008']
 ['Forest' '0.003']
 ['Ada   ' '0.0']
 ['GBoost' '0.005']]
LandContour
[['Linear' '0.0']
 ['Forest' '-0.006']
 ['Ada   ' '0.0']
 ['GBoost' '0.003']]
BsmtExposure
[['Linear' '0.007']
 ['Forest' '0.002']
 ['Ada   ' '-0.004']
 ['GBoost' '0.005']]
FireplaceQu
[['Linear' '0.002']
 ['Forest' '-0.012']
 ['Ada   ' '0.004']
 ['GBoost' '0.006']]
remodelAge
[['Linear' '0.001']
 ['Forest' '0.006']
 ['Ada   ' '0.002']
 ['GBoost' '0.012']]
BldAge
[['Linear' '0.0']
 ['Forest' '-0.003']
 ['Ada   ' '-0.003']
 ['GBoost' '0.011']]
Neighborhood
[['Linear' '0.029']
 ['Forest' '0.02']
 ['Ada   ' '0.013']
 ['GBoos

In [7]:
# Display one score per estimator label.
# NOTE(review): the "test set" here is presumably a hold-out split made inside
# FeatureTester, not the `test` frame loaded from test.csv — confirm.
ft.score_test_set()

array([['Linear', '0.86'],
       ['Forest', '0.882'],
       ['Ada   ', '0.794'],
       ['GBoost', '0.902']], dtype='<U32')

In [8]:
# Collect the predictions for the test frame into a single NumPy array.
# NOTE(review): presumably one row per registered estimator, in registration
# order — confirm against feature_tester.
pred = np.array(ft.predict(test))

In [9]:
# Sanity check: dimensions of the stacked prediction array.
pred.shape

(4, 1459)

In [10]:
# Averaging over axis 0 collapses the first dimension, leaving a 1-D array.
pred.mean(axis=0).shape

(1459,)

In [11]:
# Inspect the test frame's index so its length can be compared with the
# length of the averaged predictions before assigning them as a column.
test.index

RangeIndex(start=0, stop=1459, step=1)

In [17]:
# Ensemble prediction: unweighted mean over the first axis of `pred`,
# stored as the SalePrice column of the test frame.
test['SalePrice'] = np.mean(pred, axis=0)
# Preview the first rows, keyed by Id.
test.head().set_index('Id')

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,garageAge,remodelAge,BldAge,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,,0,6,2010,WD,Normal,57.0,57,57,120258.498054
1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,Gar2,12500,6,2010,WD,Normal,60.0,60,60,142396.173525
1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,,0,3,2010,WD,Normal,21.0,20,21,189182.221319
1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,,0,6,2010,WD,Normal,20.0,20,20,190457.904352
1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,,0,1,2010,WD,Normal,26.0,26,26,213328.490139


In [19]:
# Write Id and SalePrice as a two-column CSV (no separate index column).
test[['Id', 'SalePrice']].to_csv('predictions.csv', index=False)

In [24]:
# Inspect the first row of the prediction array.
# NOTE(review): presumably the 'Linear' estimator, which was registered
# first — confirm that predict() preserves registration order.
pred[0]

array([113351.80801632, 154611.96805396, 178457.78772965, ...,
       160638.35629915, 131216.06599873, 222903.63349892])

In [25]:
# Alternative submission: use only row 3 of `pred` instead of the average.
# NOTE(review): row 3 is presumably GBoost, the fourth estimator registered —
# confirm that predict() preserves registration order.
test['SalePrice'] = pred[3]
test.to_csv('gboost.csv', columns=['Id', 'SalePrice'], index=False)