In [129]:
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet, Lasso, LassoLarsCV, Ridge, RidgeCV
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

In [130]:
df = pd.read_csv('./DATA/advanced_housing.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [131]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [132]:
col_label = df.columns[df.isnull().sum() > 1]
df.drop(col_label, axis=1, inplace=True)

In [133]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 63 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Id             1460 non-null   int64 
 1   MSSubClass     1460 non-null   int64 
 2   MSZoning       1460 non-null   object
 3   LotArea        1460 non-null   int64 
 4   Street         1460 non-null   object
 5   LotShape       1460 non-null   object
 6   LandContour    1460 non-null   object
 7   Utilities      1460 non-null   object
 8   LotConfig      1460 non-null   object
 9   LandSlope      1460 non-null   object
 10  Neighborhood   1460 non-null   object
 11  Condition1     1460 non-null   object
 12  Condition2     1460 non-null   object
 13  BldgType       1460 non-null   object
 14  HouseStyle     1460 non-null   object
 15  OverallQual    1460 non-null   int64 
 16  OverallCond    1460 non-null   int64 
 17  YearBuilt      1460 non-null   int64 
 18  YearRemodAdd   1460 non-null

In [134]:
df.dropna(inplace=True)

In [135]:
df.isnull().sum().sort_values(ascending=False)

SalePrice       0
OverallQual     0
BsmtUnfSF       0
BsmtFinSF2      0
BsmtFinSF1      0
               ..
LowQualFinSF    0
2ndFlrSF        0
1stFlrSF        0
Electrical      0
Id              0
Length: 63, dtype: int64

In [136]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [137]:
# scaler = MinMaxScaler()
sclaer = RobustScaler()
df['SalePrice'] = scaler.fit_transform(df['SalePrice'][:, np.newaxis])

In [138]:
df = pd.get_dummies(df)
y = df['SalePrice']
df.drop('SalePrice', axis=1, inplace=True)

In [139]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.3)

In [140]:
y_train

915     0.055687
919     0.196639
210     0.087627
673     0.309124
905     0.129288
          ...   
1444    0.200944
163     0.094848
651     0.101514
986     0.114012
896     0.099431
Name: SalePrice, Length: 1021, dtype: float64

In [141]:
random_forest = RandomForestRegressor()
grad_boost = GradientBoostingRegressor()
ada_boost = AdaBoostRegressor()

In [142]:
random_forest.fit(X_train, y_train)
grad_boost.fit(X_train, y_train)
ada_boost.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=None)

In [143]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, mean_squared_error, r2_score

def test_model(model):
    model.fit(X_train, Y_train)
    predicted = model.predict(X_test)
    print('r2_score', r2_score(y_test, predicted))
    print('mean squared error', mean_squared_error(y_test, predicted))

In [128]:
test_model(random_forest)
test_model(grad_boost)
test_model(ada_boost)
test_model(ElasticNet)
test_model(Lasso)
test_model(LassoLarsCV)
test_model(Ridge)
test_mome

r2_score 0.8880093886057627
mean squared error 0.0015457108011785882
r2_score 0.8880751197583263
mean squared error 0.0015448035701953252
r2_score 0.8455466784734418
mean squared error 0.0021317873381464132
