In [497]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import explained_variance_score

In [498]:
house = pd.read_csv('./train.csv', delimiter = ',')
test_data = pd.read_csv('./test.csv', delimiter = ',')

In [499]:
house = house.fillna(0)
house = house.drop(columns = ['Id'])
print(house.head(3))
print("\n")
print("\list of attributes:", list(house.columns))
print("\n")
print("total number of attributes: ", len(list(house.columns)) - 1)
print("shape of dataframe: ", house.shape)


   MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0          60       RL         65.0     8450   Pave     0      Reg   
1          20       RL         80.0     9600   Pave     0      Reg   
2          60       RL         68.0    11250   Pave     0      IR1   

  LandContour Utilities LotConfig  ... PoolArea PoolQC Fence MiscFeature  \
0         Lvl    AllPub    Inside  ...        0      0     0           0   
1         Lvl    AllPub       FR2  ...        0      0     0           0   
2         Lvl    AllPub    Inside  ...        0      0     0           0   

  MiscVal MoSold  YrSold  SaleType  SaleCondition  SalePrice  
0       0      2    2008        WD         Normal     208500  
1       0      5    2007        WD         Normal     181500  
2       0      9    2008        WD         Normal     223500  

[3 rows x 80 columns]


\list of attributes: ['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'Lot

In [500]:
test_data = test_data.fillna(0)
test_data = test_data.drop(columns = ['Id'])

In [501]:
cat_vars = []
for category in house:
    if pd.to_numeric(house[str(category)], errors='coerce').notnull().all() == False:
        cat_vars.append(str(category))

In [502]:
cat_test = []
for category in test_data:
    if pd.to_numeric(test_data[str(category)], errors='coerce').notnull().all() == False:
        cat_test.append(str(category))

In [503]:
house_dummies = pd.get_dummies(house, columns = cat_vars)

In [535]:
house_dummies

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,62.0,7917,6,5,1999,2000,0.0,0,0,...,0,0,0,1,0,0,0,0,1,0
1456,20,85.0,13175,6,6,1978,1988,119.0,790,163,...,0,0,0,1,0,0,0,0,1,0
1457,70,66.0,9042,7,9,1941,2006,0.0,275,0,...,0,0,0,1,0,0,0,0,1,0
1458,20,68.0,9717,5,6,1950,1996,0.0,49,1029,...,0,0,0,1,0,0,0,0,1,0


In [517]:
test_dummies = pd.get_dummies(test_data, columns = cat_test)

In [534]:
test_dummies

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,3,4,5,6,7,8,9,10,11,0
0,20,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,...,0,0,0,0,0,0,0,0,0,0
1,20,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,60,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,60,78.0,9978,6,6,1998,1998,20.0,602.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,120,43.0,5005,8,5,1992,1992,0.0,263.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,21.0,1936,4,7,1970,1970,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1455,160,21.0,1894,4,5,1970,1970,0.0,252.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1456,20,160.0,20000,5,7,1960,1996,0.0,1224.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1457,85,62.0,10441,5,5,1992,1992,0.0,337.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [507]:
labels = house_dummies[['SalePrice']]
print("check labels: ", labels.head(3))

check labels:     SalePrice
0     208500
1     181500
2     223500


In [508]:
features = house_dummies.drop(columns = ['SalePrice'])

In [509]:
features

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,62.0,7917,6,5,1999,2000,0.0,0,0,...,0,0,0,1,0,0,0,0,1,0
1456,20,85.0,13175,6,6,1978,1988,119.0,790,163,...,0,0,0,1,0,0,0,0,1,0
1457,70,66.0,9042,7,9,1941,2006,0.0,275,0,...,0,0,0,1,0,0,0,0,1,0
1458,20,68.0,9717,5,6,1950,1996,0.0,49,1029,...,0,0,0,1,0,0,0,0,1,0


In [526]:
X = features
y = labels

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.05,random_state=0)

print("number of training samples: ", len(X_train))
print("number of test samples:", len(y_test))

number of training samples:  1387
number of test samples: 73


In [527]:
mlr = LinearRegression()
mlr.fit(X_train,y_train)
mlr_score = mlr.score(X_test,y_test)
pred_mlr = mlr.predict(X_test)
expl_mlr = explained_variance_score(pred_mlr,y_test)

In [528]:
pred_mlr

array([[252852.99509921],
       [149554.79548749],
       [ 99220.39752723],
       [205268.47710685],
       [ 87209.02654923],
       [ 64454.18558995],
       [266728.93099515],
       [122724.66136292],
       [488767.83983885],
       [151343.4986499 ],
       [200295.37294227],
       [108650.36677121],
       [237977.12942402],
       [103944.84913973],
       [107967.78388588],
       [142419.41218382],
       [248020.8152017 ],
       [133867.25831317],
       [142010.20605557],
       [181336.67486889],
       [142039.86331956],
       [174106.47149132],
       [102432.55742449],
       [154185.16702614],
       [181303.50216026],
       [149945.82351183],
       [162303.76004382],
       [ 63872.4740962 ],
       [325610.57312901],
       [118580.98859129],
       [155397.39751091],
       [201129.88292177],
       [150872.50223712],
       [296743.25119649],
       [340043.82331335],
       [214738.01478526],
       [311490.74685776],
       [124202.80033421],
       [2263

In [529]:
y_test

Unnamed: 0,SalePrice
529,200624
491,133000
459,110000
279,192000
655,88000
...,...
1274,139000
822,225000
315,188500
9,118000


In [530]:
from sklearn.tree import DecisionTreeRegressor
tr_regressor = DecisionTreeRegressor(random_state=0)
tr_regressor.fit(X_train,y_train)
tr_regressor.score(X_test,y_test)
pred_tr = tr_regressor.predict(X_test)
decision_score=tr_regressor.score(X_test,y_test)
expl_tr = explained_variance_score(pred_tr,y_test)

In [531]:
from sklearn.ensemble import RandomForestRegressor
rf_regressor = RandomForestRegressor(n_estimators=28,random_state=0)
rf_regressor.fit(X_train,y_train)
rf_regressor.score(X_test,y_test)
rf_pred =rf_regressor.predict(X_test)
rf_score=rf_regressor.score(X_test,y_test)
expl_rf = explained_variance_score(rf_pred,y_test)

In [532]:
print("Multiple Linear Regression Model Score is ",round(mlr.score(X_test,y_test)*100))
print("Decision tree  Regression Model Score is ",round(tr_regressor.score(X_test,y_test)*100))
print("Random Forest Regression Model Score is ",round(rf_regressor.score(X_test,y_test)*100))

#Let's have a tabular pandas data frame, for a clear comparison

models_score =pd.DataFrame({'Model':['Multiple Linear Regression','Decision Tree','Random forest Regression'],
                            'Score':[mlr_score,decision_score,rf_score],
                            'Explained Variance Score':[expl_mlr,expl_tr,expl_rf]
                           })
models_score.sort_values(by='Score',ascending=False)

Multiple Linear Regression Model Score is  84
Decision tree  Regression Model Score is  46
Random Forest Regression Model Score is  81


Unnamed: 0,Model,Score,Explained Variance Score
0,Multiple Linear Regression,0.843108,0.794145
2,Random forest Regression,0.813054,0.651145
1,Decision Tree,0.464884,-0.038151


In [None]:
test = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 
        'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 
        'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 
        'RoofStyle', 'RoofMatl', 'Exterior1st', 
        'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 
        'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 
        'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 
        'Functional', 'Fireplaces', 'FireplaceQu', 'GarageYrBlt', 'GarageType', 'GarageFinish', 'GarageCars', 'GarageQual', 
        'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition']