In [1]:
import warnings

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

In [2]:
# Load the raw house-price table and pull out the regression target.
hp_df = pd.read_csv('House Price Data.csv')
y = hp_df['SalePrice']

# Remove the target and the 'Id' column from the feature frame.
hp_df = hp_df.drop(columns=['SalePrice', 'Id'])

# Per-column missing-value summary: absolute count and percentage of rows.
nadf_info = pd.DataFrame({
    'col_name': hp_df.columns,
    'na_cnt': hp_df.isna().sum(),
    'na_pct': (hp_df.isna().sum() / len(hp_df)) * 100,
})

# Columns where more than 50 % of the values are missing.
cols_grt50_ls = nadf_info.loc[nadf_info['na_pct'] > 50, 'col_name'].tolist()

# Drop those mostly-empty columns.
hp_df = hp_df.drop(columns=cols_grt50_ls)

# Split into train and test data (80 % / 20 %, fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    hp_df,
    y,
    test_size=0.2,
    random_state=49,
)

# Column groups decided by dtype: int64/float64 columns are treated as
# continuous, object columns as categorical.
con_cols = [
    col for col in hp_df.columns
    if hp_df[col].dtype == 'int64' or hp_df[col].dtype == 'float64'
]

cat_cols = [col for col in hp_df.columns if hp_df[col].dtype == 'object']

# Work on independent copies of the splits so the column assignments below
# write into frames we own rather than into selections of hp_df.
x_train = x_train.copy()
x_test = x_test.copy()

# Fill missing values in the continuous columns with the TRAIN mean; the same
# statistic is applied to the test split so no test information is used.
# The result is assigned back explicitly: `frame[col].fillna(..., inplace=True)`
# operates on a temporary Series and does not update the frame under pandas
# Copy-on-Write.
for col in con_cols:
    fill_value = x_train[col].mean()
    x_train[col] = x_train[col].fillna(fill_value)
    x_test[col] = x_test[col].fillna(fill_value)

# Fill missing values in the categorical columns with the most frequent TRAIN
# category (first entry of mode()).
for col in cat_cols:
    fill_value = x_train[col].mode().iloc[0]
    x_train[col] = x_train[col].fillna(fill_value)
    x_test[col] = x_test[col].fillna(fill_value)


In [3]:
# Standardise the continuous columns.
# The scaler is fitted ONCE on all continuous training columns and then applied
# to the test split. StandardScaler works column by column, so the scaled
# values match a per-column fit, but the fitted `scalar` now holds the
# statistics of every column and can be reused on new data (re-fitting it
# inside a loop would leave only the last column's mean/scale in it).
scalar = StandardScaler()

if con_cols:  # nothing to scale when there are no continuous columns
    x_train[con_cols] = scalar.fit_transform(x_train[con_cols])
    x_test[con_cols] = scalar.transform(x_test[con_cols])


In [4]:
# One-hot encode the categorical columns of each split independently
# (train first, then test).
cat_encd_train, cat_encd_test = (
    pd.get_dummies(split[cat_cols]) for split in (x_train, x_test)
)

In [5]:
# The two splits were encoded separately, so their dummy columns can differ.
# Aligning on the column axis with an inner join keeps only the dummy columns
# present in BOTH frames, giving train and test the same feature layout.
# NOTE(review): with an inner join, dummies that occur only in the training
# split are discarded as well, so the training features depend on the test
# split - confirm this is intended.
cat_encd_train_final, cat_encd_test_final = cat_encd_train.align(
    cat_encd_test,
    axis=1,
    join='inner',
)

# Rows and shared one-hot columns of the aligned training frame.
cat_encd_train_final.shape

(1168, 197)

In [6]:
# Creating the final train data to feed the models: continuous columns
# side by side with the aligned one-hot columns.
x_train_final = pd.concat(
    [x_train[con_cols], cat_encd_train_final],
    axis='columns',
)


In [7]:
# Creating the final test data to feed the models: continuous columns
# side by side with the aligned one-hot columns.
x_test_final = pd.concat(
    [x_test[con_cols], cat_encd_test_final],
    axis='columns',
)

In [8]:
# Ordinary least-squares linear regression fitted on the training matrix.
lin_reg = LinearRegression()
lin_reg.fit(X=x_train_final, y=y_train)

LinearRegression()

In [9]:
# Predictions of the linear model on the test data.
y_test_pred = lin_reg.predict(X=x_test_final)

In [10]:
# Score of the linear model on the test data.
lin_reg.score(X=x_test_final, y=y_test)

0.8755431979271163

In [11]:
# Score of the linear model on the train data.
lin_reg.score(X=x_train_final, y=y_train)

0.8977182664016318

In [12]:
# R2 (R-squared) of the linear model's test predictions, computed with r2_score.
r2_score(y_true=y_test, y_pred=y_test_pred)

0.8755431979271163

In [13]:
# Lasso (L1-regularised) regression with alpha=1.
ls_reg = Lasso(alpha=1).fit(x_train_final, y_train)

# Show the coefficients that are exactly zero after fitting.
ls_reg.coef_[np.equal(ls_reg.coef_, 0)]

array([ 0.,  0.,  0.,  0., -0., -0., -0.,  0., -0., -0., -0., -0.,  0.])

In [14]:
# Ridge (L2-regularised) regression with alpha=2.
lr_rid = Ridge(alpha=2).fit(x_train_final, y_train)

# Count the coefficients that are exactly zero after fitting.
len(lr_rid.coef_[np.equal(lr_rid.coef_, 0)])

0

In [15]:
# SGD regressor (stochastic gradient descent).
# random_state is fixed (same seed as the train/test split) because SGD
# shuffles the training data; without a seed every Restart & Run All gives
# different coefficients and scores.
sgd = SGDRegressor(verbose=1, random_state=49)
sgd.fit(x_train_final, y_train)

# Predictions on both splits, used for the R2 comparison in the next cells.
y_test_sgd_pred = sgd.predict(x_test_final)
y_train_sgd_pred = sgd.predict(x_train_final)

-- Epoch 1
Norm: 51579.16, NNZs: 233, Bias: 8413.738101, T: 1168, Avg. loss: 1043199387.239838
Total training time: 0.03 seconds.
-- Epoch 2
Norm: 56875.10, NNZs: 233, Bias: 9159.222585, T: 2336, Avg. loss: 646371819.535809
Total training time: 0.03 seconds.
-- Epoch 3
Norm: 59376.22, NNZs: 233, Bias: 9135.163665, T: 3504, Avg. loss: 632866281.490674
Total training time: 0.03 seconds.
-- Epoch 4
Norm: 60910.33, NNZs: 233, Bias: 9272.285728, T: 4672, Avg. loss: 599221993.671714
Total training time: 0.03 seconds.
-- Epoch 5
Norm: 65553.92, NNZs: 233, Bias: 9928.453157, T: 5840, Avg. loss: 549001942.352187
Total training time: 0.05 seconds.
-- Epoch 6
Norm: 67600.69, NNZs: 233, Bias: 10007.699962, T: 7008, Avg. loss: 534862945.044929
Total training time: 0.05 seconds.
-- Epoch 7
Norm: 69107.36, NNZs: 233, Bias: 10108.439898, T: 8176, Avg. loss: 517129498.376063
Total training time: 0.05 seconds.
-- Epoch 8
Norm: 67897.56, NNZs: 233, Bias: 9962.836199, T: 9344, Avg. loss: 472540104.739358


In [16]:
# R2 score of the SGD predictions on the test data.
r2_score(y_true=y_test, y_pred=y_test_sgd_pred)

0.8841417200354119

In [17]:
# R2 score of the SGD predictions on the train data.
r2_score(y_true=y_train, y_pred=y_train_sgd_pred)

0.8722418418662226

In [18]:
# Random forest regressor (bagging technique) with 150 trees.
# random_state is fixed (same seed as the train/test split) because the forest
# uses bootstrap sampling and random feature selection; without a seed the
# scores change on every Restart & Run All.
rdfc = RandomForestRegressor(n_estimators=150, random_state=49)
rdfc.fit(x_train_final, y_train)

# Predictions on both splits, used for the R2 comparison in the next cells.
rdfc_test_pred = rdfc.predict(x_test_final)
rdfc_train_pred = rdfc.predict(x_train_final)

In [19]:
# R2 score of the random forest predictions on the test data.
r2_score(y_true=y_test, y_pred=rdfc_test_pred)

0.8944870430010764

In [20]:
# R2 score of the random forest predictions on the train data.
r2_score(y_true=y_train, y_pred=rdfc_train_pred)

0.9806832830369177

In [21]:
# AdaBoost regressor (boosting technique) with default settings.
# random_state is fixed (same seed as the train/test split) because AdaBoost
# resamples the training data in each boosting round; without a seed the
# scores change on every Restart & Run All.
adrg = AdaBoostRegressor(random_state=49)
adrg.fit(x_train_final, y_train)

# Predictions on both splits, used for the R2 comparison in the next cells.
adrg_test_pred = adrg.predict(x_test_final)
adrg_train_pred = adrg.predict(x_train_final)

In [22]:
# R2 score of the AdaBoost predictions on the test data.
r2_score(y_true=y_test, y_pred=adrg_test_pred)

0.8177911136637515

In [23]:
# R2 score of the AdaBoost predictions on the train data.
r2_score(y_true=y_train, y_pred=adrg_train_pred)

0.8640491231056003

In [24]:
# Gradient boosting with a grid search over the tree hyper-parameters.
# SalePrice is a continuous target, so the REGRESSOR is required: a
# GradientBoostingClassifier would treat every distinct price as its own class.
# The variable keeps its original name `gbc` so the rest of the cell is unchanged.
# random_state is fixed (same seed as the train/test split) so the search is
# repeatable.
gbc = GradientBoostingRegressor(random_state=49)

# Candidate values tried for each hyper-parameter (3 * 4 * 4 * 4 = 192
# combinations). Named `param_grid` rather than `dict` so the builtin `dict`
# is not shadowed for the rest of the notebook.
param_grid = {
    'n_estimators': [200, 300, 400],
    'max_depth': [5, 8, 10, 15],
    'min_samples_split': [5, 3, 4, 2],
    'min_samples_leaf': [1, 2, 3, 4],
}

# 5-fold cross-validated exhaustive search; fitted in the next cell.
gvcv = GridSearchCV(gbc, param_grid, cv=5)

In [None]:
# Run the cross-validated grid search on the training data.
gvcv.fit(X=x_train_final, y=y_train)