In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

In [34]:
all_data = pd.read_csv("data/train.csv")

In [35]:
housing = all_data.copy()

In [36]:
corr_matrix = housing.corr()

In [37]:
housing_labels = housing["SalePrice"].copy()

In [38]:
features=["LotArea", "Neighborhood", "HouseStyle", "OverallQual", "GrLivArea", "OverallCond", 
          "MiscVal", "BldgType", "YearBuilt", "YearRemodAdd", "BsmtFinSF1", "KitchenQual", "BsmtQual",
         "1stFlrSF", "BsmtExposure", "LandContour"]

In [39]:
test_data = pd.read_csv("data/test.csv")

In [40]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [116]:

best_score = 100000
best_feature = ""
for col in housing.columns:
    if col in features or col == "SalePrice":
        pass
    else:
        new_features = features + [col]
        housing_sel = housing[new_features]
        #housing_sel['MSSubClass'] = housing_sel['MSSubClass'].apply(str)
        housing_sel=pd.get_dummies(housing_sel)
        
        X_test = pd.get_dummies(test_data[new_features])
        housing_sel, X_test = housing_sel.align(X_test, join='left', axis=1)
        X_test["HouseStyle_2.5Fin"].fillna(value=0, inplace=True)
        X_test.fillna(value=0, inplace=True)
        housing_sel.fillna(value=0, inplace=True)
        forest_reg = RandomForestRegressor(random_state=75)
        forest_reg.fit(housing_sel, housing_labels)
        forest_scores = cross_val_score(forest_reg, housing_sel, housing_labels,
                                       scoring="neg_mean_squared_error",
                                       cv=10)
        forest_rmse_scores=np.sqrt(-forest_scores)
        print(col)
        display_scores(forest_rmse_scores)
        print()
        print()
        if (forest_rmse_scores.mean() < best_score):
            best_score = forest_rmse_scores.mean()
            best_feature = col
            
print(best_feature)
print(best_score)

Id
Scores: [25533.95700987 27546.96894897 21534.26418502 35262.95411669
 32207.06225236 28475.48689088 23768.35191033 24026.86569242
 39954.91883149 28002.04248818]
Mean: 28631.287232622282
Standard deviation: 5398.108950277964


MSSubClass
Scores: [23740.55380956 27404.08622219 21422.43102497 35774.51400956
 32010.22228395 28601.12061351 24308.38085185 23870.68160581
 37355.24379534 27599.06247774]
Mean: 28208.629669445672
Standard deviation: 5065.314421213089


MSZoning
Scores: [23692.58938969 27305.55350516 21739.7596881  35342.96822767
 31410.832104   28447.47539473 24378.52295548 23434.33935132
 37167.17159461 27661.60378662]
Mean: 28058.08159974005
Standard deviation: 4919.939111297571


LotFrontage
Scores: [24341.95308548 27217.15807635 21476.87976386 34293.56979554
 32785.11715524 28371.06303488 23782.40486087 24305.43342801
 38936.38858573 27749.04928019]
Mean: 28325.901706615237
Standard deviation: 5188.763670745743


Street
Scores: [23665.75196919 27932.19190777 21681.548767

BsmtHalfBath
Scores: [23843.62961204 27535.51819428 21451.99044726 35790.14975217
 31378.96142859 28399.06353874 23983.47776502 23450.61346465
 39603.30124134 27575.13776834]
Mean: 28301.184321242676
Standard deviation: 5507.903492519896


FullBath
Scores: [24047.43060926 27063.35295935 21771.11171172 41409.90866044
 32016.95303218 28687.59927755 23195.03891674 22510.58644866
 36679.46880591 27445.16267706]
Mean: 28482.661309888055
Standard deviation: 6150.335213970531


HalfBath
Scores: [23925.4261513  27533.64029568 21412.13092879 36012.55217177
 31761.44656836 28256.53531064 22977.570559   23461.80497368
 37584.13897345 27338.96678855]
Mean: 28026.42127212302
Standard deviation: 5260.614610468538


BedroomAbvGr
Scores: [23977.43087285 27671.93568607 21415.0157214  33949.4410317
 31683.19270507 28501.93825215 24232.34038258 23426.18440949
 36806.5782626  27751.16499512]
Mean: 27941.522231902945
Standard deviation: 4707.047872474277


KitchenAbvGr
Scores: [23445.03708379 27314.9652169

In [41]:
housing_sel = housing[features]
#housing_sel['MSSubClass'] = housing_sel['MSSubClass'].apply(str)

housing_sel.info()

housing_sel=pd.get_dummies(housing_sel)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 16 columns):
LotArea         1460 non-null int64
Neighborhood    1460 non-null object
HouseStyle      1460 non-null object
OverallQual     1460 non-null int64
GrLivArea       1460 non-null int64
OverallCond     1460 non-null int64
MiscVal         1460 non-null int64
BldgType        1460 non-null object
YearBuilt       1460 non-null int64
YearRemodAdd    1460 non-null int64
BsmtFinSF1      1460 non-null int64
KitchenQual     1460 non-null object
BsmtQual        1423 non-null object
1stFlrSF        1460 non-null int64
BsmtExposure    1422 non-null object
LandContour     1460 non-null object
dtypes: int64(9), object(7)
memory usage: 182.6+ KB


In [42]:
test_data = pd.read_csv("data/test.csv")
X_test = pd.get_dummies(test_data[features])
#X_test['MSSubClass'] = X_test['MSSubClass'].apply(str)
X_test.shape
housing_sel, X_test = housing_sel.align(X_test, join='left', axis=1)
X_test["HouseStyle_2.5Fin"].fillna(value=0, inplace=True)
#X_test["GarageCars"].fillna(value=0, inplace=True)
X_test.fillna(value=0, inplace=True)


In [43]:
display(housing_sel.isnull().any())
housing_sel.fillna(value=0, inplace=True)

LotArea            False
OverallQual        False
GrLivArea          False
OverallCond        False
MiscVal            False
                   ...  
BsmtExposure_No    False
LandContour_Bnk    False
LandContour_HLS    False
LandContour_Low    False
LandContour_Lvl    False
Length: 63, dtype: bool

In [44]:
display(X_test.isnull().any())


LotArea            False
OverallQual        False
GrLivArea          False
OverallCond        False
MiscVal            False
                   ...  
BsmtExposure_No    False
LandContour_Bnk    False
LandContour_HLS    False
LandContour_Low    False
LandContour_Lvl    False
Length: 63, dtype: bool

In [45]:
sc = StandardScaler()
housing_tr = sc.fit_transform(housing_sel)
X_test = sc.transform(X_test)

In [46]:
housing_tr.shape

(1460, 63)

In [47]:
lin_reg = LinearRegression()
lin_reg.fit(housing_tr, housing_labels)
housing_predictions = lin_reg.predict(housing_tr)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse=np.sqrt(lin_mse)
lin_rmse

28961.990278945403

In [48]:
lin_scores = cross_val_score(lin_reg, housing_tr, housing_labels,
                            scoring="neg_mean_squared_error",
                            cv=10)
lin_rmse_scores=np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [2.14323996e+04 2.57480797e+04 2.32490596e+04 3.88275819e+04
 3.40699428e+04 2.78367143e+04 1.18652655e+15 2.43672757e+04
 5.20808909e+04 2.62232390e+04]
Mean: 118652655342578.42
Standard deviation: 355957965936456.9


In [49]:
forest_reg = RandomForestRegressor(random_state=75)
forest_reg.fit(housing_tr, housing_labels)
forest_scores = cross_val_score(forest_reg, housing_tr, housing_labels,
                               scoring="neg_mean_squared_error",
                               cv=10)
forest_rmse_scores=np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)



Scores: [26879.39701602 27595.56342881 23269.945712   34959.46755708
 35196.55369359 29247.47159788 29827.09446448 23248.99080194
 35077.96515941 28588.31281743]
Mean: 29389.07622486443
Standard deviation: 4273.549232801709


In [50]:
xg_reg = xgb.XGBRegressor(random_state=75)
xg_reg.fit(housing_tr, housing_labels)
xg_scores = cross_val_score(xg_reg, housing_tr, housing_labels,
                               scoring="neg_mean_squared_error",
                               cv=10)
xg_rmse_scores=np.sqrt(-xg_scores)
display_scores(xg_rmse_scores)

Scores: [23194.78755816 27946.03272809 24582.93143204 43723.60365129
 32779.64243891 29490.32467892 24442.46810242 18932.69056583
 30013.03131879 28208.13777894]
Mean: 28331.36502534029
Standard deviation: 6363.464623565526


In [32]:
predictions=forest_reg.predict(X_test)
output=pd.DataFrame({'Id': test_data.Id, 'SalePrice': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was saved!")

Your submission was saved!


In [22]:
forest_reg.get_params()

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 75,
 'verbose': 0,
 'warm_start': False}

In [23]:
param_grid = [
    {'min_samples_leaf': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], 'random_state': [75]}
#    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features':[2,3,4]},
]

In [24]:
grid_search=GridSearchCV(forest_reg, param_grid, cv=10, scoring='neg_mean_squared_error',
                        return_train_score=True)
grid_search.fit(housing_tr, housing_labels)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=10, n_jobs=None,
                                             oob_score=False, random_state=75,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'min_samples_leaf': [1, 2, 3, 4, 5,

In [25]:
grid_search.best_params_

{'min_samples_leaf': 2, 'random_state': 75}

In [26]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

29698.165337206803 {'min_samples_leaf': 1, 'random_state': 75}
29357.7531660819 {'min_samples_leaf': 2, 'random_state': 75}
30569.815825444024 {'min_samples_leaf': 3, 'random_state': 75}
31510.57534454078 {'min_samples_leaf': 4, 'random_state': 75}
32475.656027869176 {'min_samples_leaf': 5, 'random_state': 75}
32968.79102741272 {'min_samples_leaf': 6, 'random_state': 75}
33257.477586449204 {'min_samples_leaf': 7, 'random_state': 75}
33474.83362198072 {'min_samples_leaf': 8, 'random_state': 75}
33388.55862948052 {'min_samples_leaf': 9, 'random_state': 75}
33474.09746313089 {'min_samples_leaf': 10, 'random_state': 75}
33180.35871028545 {'min_samples_leaf': 11, 'random_state': 75}
33506.918237519916 {'min_samples_leaf': 12, 'random_state': 75}
33620.521522962095 {'min_samples_leaf': 13, 'random_state': 75}
33706.76691680236 {'min_samples_leaf': 14, 'random_state': 75}
33813.717524868945 {'min_samples_leaf': 15, 'random_state': 75}
