In [251]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import cross_val_score
import timeit
import math

In [252]:
data = pd.read_csv("train.csv", header=0) 

In [253]:
data.isnull().sum()/data.shape[0]

Id               0.000000
MSSubClass       0.000000
MSZoning         0.000000
LotFrontage      0.177397
LotArea          0.000000
Street           0.000000
Alley            0.937671
LotShape         0.000000
LandContour      0.000000
Utilities        0.000000
LotConfig        0.000000
LandSlope        0.000000
Neighborhood     0.000000
Condition1       0.000000
Condition2       0.000000
BldgType         0.000000
HouseStyle       0.000000
OverallQual      0.000000
OverallCond      0.000000
YearBuilt        0.000000
YearRemodAdd     0.000000
RoofStyle        0.000000
RoofMatl         0.000000
Exterior1st      0.000000
Exterior2nd      0.000000
MasVnrType       0.005479
MasVnrArea       0.005479
ExterQual        0.000000
ExterCond        0.000000
Foundation       0.000000
                   ...   
BedroomAbvGr     0.000000
KitchenAbvGr     0.000000
KitchenQual      0.000000
TotRmsAbvGrd     0.000000
Functional       0.000000
Fireplaces       0.000000
FireplaceQu      0.472603
GarageType  

In [254]:
data = data.drop(['Alley', 'MiscFeature', 'Fence', 'PoolQC'], axis=1)

In [255]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [256]:
# Replaces categorical value in Quality columns with numerical scale
qualityCols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
              'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond']

for col in qualityCols:
    data[col] = data[col].map({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po':1, 'NA': 0})

In [257]:
catCols = set(list(data))-set(list(data._get_numeric_data()))

# #TRY dropping all cat cols
# data = data.drop(columns=catCols)

In [258]:
#Perform one hot encoding on all categorical columns
frames = []
salePrice = data['SalePrice']
for col in catCols:
    oneHot_encoded = pd.get_dummies(data[col])
    oneHot_encoded = oneHot_encoded.add_prefix(col + '_is')
    frames.append(oneHot_encoded)
frames.append(salePrice)

data = data.drop(catCols, axis=1)
data = pd.concat(frames, axis=1)

In [259]:
list(data)

['Foundation_isBrkTil',
 'Foundation_isCBlock',
 'Foundation_isPConc',
 'Foundation_isSlab',
 'Foundation_isStone',
 'Foundation_isWood',
 'CentralAir_isN',
 'CentralAir_isY',
 'LotShape_isIR1',
 'LotShape_isIR2',
 'LotShape_isIR3',
 'LotShape_isReg',
 'MSZoning_isC (all)',
 'MSZoning_isFV',
 'MSZoning_isRH',
 'MSZoning_isRL',
 'MSZoning_isRM',
 'Neighborhood_isBlmngtn',
 'Neighborhood_isBlueste',
 'Neighborhood_isBrDale',
 'Neighborhood_isBrkSide',
 'Neighborhood_isClearCr',
 'Neighborhood_isCollgCr',
 'Neighborhood_isCrawfor',
 'Neighborhood_isEdwards',
 'Neighborhood_isGilbert',
 'Neighborhood_isIDOTRR',
 'Neighborhood_isMeadowV',
 'Neighborhood_isMitchel',
 'Neighborhood_isNAmes',
 'Neighborhood_isNPkVill',
 'Neighborhood_isNWAmes',
 'Neighborhood_isNoRidge',
 'Neighborhood_isNridgHt',
 'Neighborhood_isOldTown',
 'Neighborhood_isSWISU',
 'Neighborhood_isSawyer',
 'Neighborhood_isSawyerW',
 'Neighborhood_isSomerst',
 'Neighborhood_isStoneBr',
 'Neighborhood_isTimber',
 'Neighborhood

In [260]:
#Drop all Na's for now
data = data.dropna()
# Split into training and target sets
num_variables = len(data.columns)
training = data.iloc[:, 0:num_variables-1]
target = data.iloc[:,num_variables-1:]

#80:20 train test ratio
test_size = 0.2
#This function splits the training and target sets into random train and test subsets.
#X_train and X_test are subsets of the training data
#y_train and y_test are subsets the the target data
X_train, X_test, y_train, y_test = train_test_split(training, target, test_size=test_size)

In [261]:
from sklearn.ensemble import AdaBoostRegressor
adaBoost = AdaBoostRegressor()
adaBoost.fit(X_train, y_train.values.ravel())

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=50, random_state=None)

In [262]:
def evaluateModel(model):
    start_time = timeit.default_timer()
    mae = cross_val_score(model, X_test, y_test.values.ravel(), cv=5, scoring='neg_mean_absolute_error')
    mae = np.mean(mae)
    print('Mean Absolute Error: ', -mae)
    
    mse = cross_val_score(model, X_test, y_test.values.ravel(), cv=5, scoring='neg_mean_squared_error')
    rmse = math.sqrt(np.mean(mse*-1))
    
    print('Root Mean Squared Error: ', rmse)

    elapsed = timeit.default_timer() - start_time


In [263]:
evaluateModel(adaBoost)

Mean Absolute Error:  36512.485333677614
Root Mean Squared Error:  49805.02207170645


In [264]:
#View Predicted values
predicted = adaBoost.predict(X_test)
y_test['predicted'] = predicted
y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,SalePrice,predicted
1458,142125,211632.433904
768,216837,256703.913580
862,152000,213043.471925
239,113000,172539.091723
584,133000,174385.539568
770,134900,231113.320388
285,164700,212520.256186
69,225000,196547.276498
337,214000,225738.738220
229,192500,225738.738220
