In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.linear_model import LassoCV, ElasticNet, RidgeCV

In [2]:
#Displays score information

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [3]:
def get_best_features(features, model, overall_best_score, df, scale=False):
    #features is the list of features we are using
    #model is the model we are using to test
    #best_score is the best score we have acheived so far
    #df is the training dataframe
    best_score = 1000000
    best_feature = ""
    housing_labels = df["SalePrice"].copy()

    for col in df.columns:
        if col in features or col == "SalePrice":
            pass
        else:
            new_features = features + [col]
            housing_sel = df[new_features]
            housing_sel=pd.get_dummies(housing_sel)
            housing_sel.fillna(value=0, inplace=True)
            if scale == True:
                sc = StandardScaler()
                housing_sel = sc.fit_transform(housing_sel)
            model.fit(housing_sel, housing_labels)
            model_scores = cross_val_score(model, housing_sel, housing_labels,
                                          scoring="neg_mean_squared_error",
                                          cv=5)
            rmse_scores = np.sqrt(-model_scores)
            print(col)
            print(rmse_scores.mean())
            print()
            if (rmse_scores.mean() < best_score):
                best_score = rmse_scores.mean()
                best_feature = col
                
    print("The best feature from this test: ",best_feature)
    print("The best score from this test: ", best_score)
    print("The overall best score: ", overall_best_score)
    
    if best_score < overall_best_score:
        return(get_best_features(features + [best_feature], model, best_score, df))
    else:
        return(features)
    

In [4]:
all_data=pd.read_csv("data/train.csv")

In [5]:
housing=all_data.copy()

In [6]:
housing_labels=housing["SalePrice"].copy()

In [7]:
features=['OverallQual', 'Neighborhood', 'GrLivArea', 'BsmtFinSF1', 'YearBuilt', 'KitchenQual', 'GarageArea', '2ndFlrSF',  'OverallCond', 'BsmtFinType1', 'BsmtQual', 'LandContour']

In [8]:
test_data = pd.read_csv("data/test.csv")

In [9]:
housing.shape

(1460, 81)

In [10]:
new_features = get_best_features([], RidgeCV(), 100000, housing, scale=True)




Id
79362.6297406269

MSSubClass
79001.70930150329

MSZoning
74976.09036821773

LotFrontage
77600.65534750375

LotArea
77535.2797373299

Street
79253.99101809086

Alley
78534.96500319568

LotShape
76388.51706469395

LandContour
78403.04309801127

Utilities
79281.22647085828

LotConfig
78758.57766642906

LandSlope
79223.93301533182

Neighborhood
54090.56502751021

Condition1
78237.2446410632

Condition2
79281.65473698155

BldgType
77914.3647082914

HouseStyle
76006.81364215989

OverallQual
48492.46435719534

OverallCond
79084.6467782091

YearBuilt
67561.86664454035

YearRemodAdd
68376.14157026162

RoofStyle
77089.81265641001

RoofMatl
78224.0332265812

Exterior1st
73832.81202736395

Exterior2nd
74187.34250705762

MasVnrType
71700.65971756834

MasVnrArea
70062.0625955928

ExterQual
57612.42569812563

ExterCond
78624.50164755297

Foundation
68463.55017769293

BsmtQual
57974.16725104685

BsmtCond
77254.02380087427

BsmtExposure
73430.12924364688

BsmtFinType1
70555.67946792646

BsmtFinSF1
7

BsmtFinSF1
35920.470695354685

BsmtFinType2
37254.08539292162

BsmtFinSF2
37289.286136432645

BsmtUnfSF
37146.580000791095

TotalBsmtSF
36387.26129545063

Heating
37319.398837807974

HeatingQC
37085.014965604

CentralAir
37182.36712039394

Electrical
37269.2257977003

1stFlrSF
36338.61919551273

2ndFlrSF
36404.35582560713

LowQualFinSF
37465.703642413566

BsmtFullBath
36254.85096554426

BsmtHalfBath
37272.142604809975

FullBath
37376.59481622643

HalfBath
37268.28593070203

BedroomAbvGr
36877.97781296124

KitchenAbvGr
36933.65618969237

KitchenQual
35548.69018046699

TotRmsAbvGrd
37312.40239543328

Functional
37234.059631588774

Fireplaces
36896.2024087615

FireplaceQu
37026.31854728494

GarageType
37349.945740481315

GarageYrBlt
37280.298874840046

GarageFinish
36714.782431936765

GarageCars
36513.429491604285

GarageArea
36433.79513043177

GarageQual
37292.240828733906

GarageCond
37249.750290846634

PavedDrive
37244.185797596634

WoodDeckSF
36889.79484789136

OpenPorchSF
37262.34094

PoolQC
33013.479876529156

Fence
32702.637343291357

MiscFeature
32652.527276989327

MiscVal
32631.795519935677

MoSold
32679.81519379891

YrSold
32632.9336672609

SaleType
32665.82683665423

SaleCondition
32634.89299738568

The best feature from this test:  KitchenQual
The best score from this test:  31595.36522950019
The overall best score:  32626.589416579656
Id
31578.516248846794

MSZoning
31565.07739084862

LotFrontage
31720.856752491574

LotArea
31365.762443968666

Street
31642.5568533614

Alley
31664.704982432653

LotShape
32017.769359725185

LandContour
31665.687193204863

Utilities
31593.550140521365

LotConfig
31435.057880011864

LandSlope
31717.90889897952

Condition1
31676.36546029752

Condition2
31903.545317769014

BldgType
31305.742471426656

HouseStyle
31667.757176838735

OverallCond
31174.93257914049

YearBuilt
31502.593246757235

YearRemodAdd
31431.56547904822

RoofStyle
31690.335973459943

RoofMatl
31436.001560887216

Exterior1st
31644.579215755948

Exterior2nd
31937.

1stFlrSF
30489.657865495974

2ndFlrSF
30507.658130839372

LowQualFinSF
30537.785391044803

BsmtFullBath
30282.84885777377

BsmtHalfBath
30381.365171495454

FullBath
30326.249977008738

HalfBath
30414.430000090564

BedroomAbvGr
30446.73497122639

KitchenAbvGr
30388.02750857753

TotRmsAbvGrd
30390.26361906228

Functional
30281.046589495498

Fireplaces
30176.393125071678

FireplaceQu
30466.429615662164

GarageType
30346.76629379832

GarageYrBlt
30330.16347813035

GarageFinish
30299.592726131996

GarageArea
30588.3575044191

GarageQual
30261.81011391349

GarageCond
30315.000429357908

PavedDrive
30380.653439420694

WoodDeckSF
30313.482092423616

OpenPorchSF
30414.841132218025

EnclosedPorch
30385.888295170753

3SsnPorch
30398.092268077824

ScreenPorch
30247.19566547341

PoolArea
30403.084227785213

PoolQC
30672.17890495412

Fence
30432.53580386286

MiscFeature
30389.6158438387

MiscVal
30376.571142696106

MoSold
30432.88592684599

YrSold
30368.494860685372

SaleType
30435.034417001927

Sal

MasVnrType
29400.686092351283

MasVnrArea
29389.263054296527

ExterQual
29245.750944742605

ExterCond
29355.875886662292

Foundation
29434.36554217843

BsmtCond
29339.358942492097

BsmtFinSF1
29825.75694424344

BsmtFinType2
29437.753741934073

BsmtFinSF2
29364.31889567561

BsmtUnfSF
29383.309888626292

TotalBsmtSF
29772.97877772004

Heating
29393.68567634871

HeatingQC
29293.63131897094

CentralAir
29357.488530713214

Electrical
29506.610269699904

1stFlrSF
29453.95635922253

2ndFlrSF
29468.065684103163

LowQualFinSF
29521.85458102861

BsmtFullBath
29317.24270861853

BsmtHalfBath
29375.030324453266

FullBath
29235.995870726987

HalfBath
29402.07122677547

BedroomAbvGr
29394.929776369234

KitchenAbvGr
29335.832064005568

TotRmsAbvGrd
29400.66941370705

Functional
29386.44343443176

FireplaceQu
29568.76648784748

GarageType
29359.71679529394

GarageYrBlt
29322.35495471766

GarageFinish
29357.975357757077

GarageArea
29482.488447974338

GarageQual
29289.99503490345

GarageCond
29419.23498

MasVnrArea
29025.776429058646

ExterCond
28911.49658844772

Foundation
29076.58123192327

BsmtCond
28908.6921025015

BsmtFinSF1
29501.95118960326

BsmtFinType2
29002.233170767933

BsmtFinSF2
28908.282713507466

BsmtUnfSF
28995.130323365716

TotalBsmtSF
29440.164461472385

Heating
28928.233844352362

HeatingQC
28917.911934844607

CentralAir
28920.83387251626

Electrical
28945.992200470926

1stFlrSF
29065.834055263356

2ndFlrSF
29067.553262821697

LowQualFinSF
29096.70643985023

BsmtFullBath
28851.072767517755

BsmtHalfBath
28993.019826468895

FullBath
28891.367272265994

HalfBath
29016.085594003693

BedroomAbvGr
28929.11955529452

KitchenAbvGr
28898.36796600905

TotRmsAbvGrd
28949.365069216128

Functional
28964.206651139502

FireplaceQu
29164.813819307066

GarageType
28957.952122695162

GarageYrBlt
28833.690710687057

GarageFinish
28936.1455931557

GarageArea
29237.34529936145

GarageQual
28934.224410310435

GarageCond
28855.12483959556

PavedDrive
28913.918059417578

WoodDeckSF
28982.6

1stFlrSF
28705.6802020681

2ndFlrSF
28705.888112686353

LowQualFinSF
28734.521448232506

BsmtHalfBath
28618.579197182728

HalfBath
28627.59827093718

BedroomAbvGr
28734.875855816896

KitchenAbvGr
28717.472324808896

TotRmsAbvGrd
28775.381329729273

FireplaceQu
28803.255223306878

GarageType
28652.199475845537

GarageFinish
28723.899166596057

GarageArea
28770.815980990108

GarageQual
28614.90037925497

GarageCond
28758.992412346484

PavedDrive
28615.601013701285

WoodDeckSF
28607.094116662764

OpenPorchSF
28778.92274497549

EnclosedPorch
28633.68621501

3SsnPorch
28640.19565818882

PoolArea
28944.721497560873

PoolQC
29122.831245562906

Fence
28663.216662059014

MiscFeature
28743.600957034585

MiscVal
28755.403379083193

MoSold
28676.754976233096

YrSold
28608.347354655154

SaleType
28842.34975242453

SaleCondition
28637.54369427126

The best feature from this test:  HeatingQC
The best score from this test:  28577.13200576724
The overall best score:  28615.793032309237
Id
28584.0631740

MSZoning
28634.188964206573

LotFrontage
28630.716201561423

Alley
28552.640796972315

LotShape
28932.225280853636

LandContour
28653.599232840577

Utilities
28577.326524531243

LandSlope
28828.530922038964

Condition1
28528.468927501108

Condition2
29012.926996990158

HouseStyle
28746.819427106308

YearRemodAdd
28518.897871532663

RoofStyle
28824.89413928025

Exterior1st
28716.174806593255

Exterior2nd
28973.341000891483

MasVnrType
28677.522774513945

MasVnrArea
28624.877361259558

ExterCond
28651.948773571447

Foundation
28689.03969186009

BsmtCond
28580.40085726075

BsmtFinSF1
29143.13932275359

BsmtFinType2
28716.348073937268

BsmtFinSF2
28588.719270358386

BsmtUnfSF
28598.56122496532

TotalBsmtSF
29067.185634355807

Heating
28531.501441740606

CentralAir
28527.622748242506

Electrical
28668.94106376653

1stFlrSF
28608.67332482992

2ndFlrSF
28608.326069688723

LowQualFinSF
28672.729465863224

BsmtHalfBath
28521.771432463975

HalfBath
28611.31002476072

BedroomAbvGr
28643.632793137

Id
28510.17294715248

MSZoning
28511.633984985023

LotFrontage
28798.377886320075

Alley
28538.463332736515

LotShape
28934.3841537168

LandContour
28642.13502479323

LandSlope
28805.525207248276

Condition1
28514.70431179949

Condition2
28977.32424009755

HouseStyle
28699.464916259145

YearRemodAdd
28498.355948047352

RoofStyle
28862.738906644663

Exterior1st
28560.34442022742

Exterior2nd
28828.57936949713

MasVnrType
28508.732009918283

MasVnrArea
28592.56710005632

ExterCond
28523.059615334943

Foundation
28673.707376997627

BsmtFinSF1
29133.794940881482

BsmtFinType2
28554.074559176584

BsmtUnfSF
28574.347631526885

TotalBsmtSF
29029.06123727307

Heating
28517.927217320655

CentralAir
28505.433508214064

Electrical
28527.281708910235

1stFlrSF
28590.700391935457

2ndFlrSF
28656.804573523485

LowQualFinSF
28600.60409491129

BsmtHalfBath
28502.32418258203

HalfBath
28595.246824849455

BedroomAbvGr
28502.058379966486

TotRmsAbvGrd
28790.569832565416

FireplaceQu
28715.73738897521

Ga

In [None]:
print(new_features)

In [None]:
#make the dummies sets
housing_sel = housing[features]
housing_sel=pd.get_dummies(housing_sel)
housing_sel.shape

In [None]:
X_test = pd.get_dummies(test_data[features])
housing_sel, X_test = housing_sel.align(X_test, join='left', axis=1)
X_test.fillna(value=0, inplace=True)
housing_sel.fillna(value=0, inplace=True)
X_test.shape


In [None]:
#Here we will run a few models with the data....
housing_tr = housing_sel

In [None]:
lin_reg = LinearRegression()
lin_reg.fit(housing_tr, housing_labels)
housing_predictions = lin_reg.predict(housing_tr)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse=np.sqrt(lin_mse)
lin_rmse
lin_scores = cross_val_score(lin_reg, housing_tr, housing_labels,
                            scoring="neg_mean_squared_error",
                            cv=10)
lin_rmse_scores=np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

In [None]:
forest_reg = RandomForestRegressor(random_state=75)
forest_reg.fit(housing_tr, housing_labels)
forest_scores = cross_val_score(forest_reg, housing_tr, housing_labels,
                               scoring="neg_mean_squared_error",
                               cv=5)
forest_rmse_scores=np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)
predictions=forest_reg.predict(X_test)
output=pd.DataFrame({'Id': test_data.Id, 'SalePrice': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was saved!")

In [None]:
xg_reg = xgb.XGBRegressor(random_state=75)
xg_reg.fit(housing_tr, housing_labels)
xg_scores = cross_val_score(xg_reg, housing_tr, housing_labels,
                               scoring="neg_mean_squared_error",
                               cv=10)
xg_rmse_scores=np.sqrt(-xg_scores)
display_scores(xg_rmse_scores)
predictions=xg_reg.predict(X_test)
output=pd.DataFrame({'Id': test_data.Id, 'SalePrice': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was saved!")