In [1]:
import pandas as pd
import warnings

warnings.filterwarnings('ignore')
house_price_df=pd.read_csv('train.csv')

y=house_price_df['SalePrice']

house_price_df.drop(columns='SalePrice',inplace=True)

house_price_df['MSZoning'].value_counts()

RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: MSZoning, dtype: int64

In [2]:
na_df=pd.DataFrame({'Column_name':house_price_df.columns,
                   'NA_Count':house_price_df.isnull().sum(),
                   'NA_Percentage':house_price_df.isnull().sum()/house_price_df.shape[0]*100})

na_df.sort_values(by='NA_Percentage',ascending=False)

cols_grt_50pc_nas=list(na_df[na_df['NA_Percentage']>50]['Column_name'])

cols_grt_50pc_nas

['Alley', 'PoolQC', 'Fence', 'MiscFeature']

In [3]:
cols_with_unique_same_values=[]

for col in house_price_df.columns:
    if len(house_price_df[col].value_counts())==house_price_df.shape[0] or len(house_price_df[col].value_counts())==1:
        cols_with_unique_same_values.append(col)

cols_to_drop=cols_grt_50pc_nas+cols_with_unique_same_values
        

In [4]:
house_price_df.drop(columns=cols_to_drop,inplace=True)

In [5]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(house_price_df,y,test_size=0.2,random_state=42)


In [6]:
num_cols=[col for col in house_price_df.columns if house_price_df[col].dtype=='int64' or house_price_df[col].dtype=='float64']

cat_cols=[col for col in house_price_df.columns if house_price_df[col].dtype=='object']

In [7]:
for col in num_cols:
    X_train[col]=X_train[col].fillna(X_train[col].mean())
    X_test[col]=X_test[col].fillna(X_train[col].mean())

In [8]:
for col in cat_cols:
    X_train[col]=X_train[col].fillna(X_train[col].mode()[0])
    X_test[col]=X_test[col].fillna(X_train[col].mode()[0])
    

In [9]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
import numpy as np

min_max_scaler=MinMaxScaler()

for col in num_cols:
    X_train[col]=min_max_scaler.fit_transform(np.array(X_train[col]).reshape(-1,1))
    X_test[col]=min_max_scaler.transform(np.array(X_test[col]).reshape(-1,1))

In [10]:
oe_train_df=pd.get_dummies(X_train[cat_cols])
oe_test_df=pd.get_dummies(X_test[cat_cols])

In [11]:
X_train_oe,X_test_oe=oe_train_df.align(oe_test_df,axis=1,join='inner',fill_value=0)

In [12]:
X_train_final=pd.concat([X_train_oe,X_train[num_cols]],axis=1)
X_test_final=pd.concat([X_test_oe,X_test[num_cols]],axis=1)

In [13]:
X_train_final.shape

(1168, 237)

In [14]:
import statsmodels

In [15]:
from statsmodels.stats.outliers_influence import variance_inflation_factor


vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X_train[num_cols].values, i) for i in range(X_train[num_cols].shape[1])]
vif["features"] = X_train[num_cols].columns

In [16]:
vif

Unnamed: 0,VIF Factor,features
0,2.939589,MSSubClass
1,8.988685,LotFrontage
2,2.208728,LotArea
3,45.885216,OverallQual
4,17.225876,OverallCond
5,50.917741,YearBuilt
6,8.401613,YearRemodAdd
7,1.889284,MasVnrArea
8,inf,BsmtFinSF1
9,inf,BsmtFinSF2


In [17]:
from sklearn.linear_model import LinearRegression

linreg=LinearRegression()

linreg.fit(X_train_final,y_train)

pred=linreg.predict(X_test_final)

In [18]:
from sklearn.metrics import r2_score,mean_squared_error

r2_score(y_test,pred)

mean_squared_error(y_test,pred)



958978862.2910959

In [19]:
from sklearn.tree import DecisionTreeRegressor

dtr=DecisionTreeRegressor(random_state=42)

dtr.fit(X_train_final,y_train)

dtr.predict(X_test_final)

print('Training Score:', dtr.score(X_train_final,y_train))

print('Testing score:', dtr.score(X_test_final,y_test)) #Case of Overfitting

Training Score: 1.0
Testing score: 0.7920591445420031


In [20]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
 
hyp_dict={
           'max_depth':[15,18],
           'min_samples_split':[8,10,14],
           'max_leaf_nodes':[10,12], 
            'max_features':['sqrt','log2'] }
gcv=RandomizedSearchCV(estimator=dtr,param_distributions=hyp_dict, cv=10,n_iter=20)

gcv.fit(X_train_final,y_train)

gcv.best_params_
gcv.best_score_

dtc_gcv_pred=gcv.predict(X_test_final)

In [21]:
gcv.best_score_

0.506477276510297

In [22]:
gcv.best_params_

{'min_samples_split': 8,
 'max_leaf_nodes': 10,
 'max_features': 'sqrt',
 'max_depth': 18}

In [23]:
from sklearn.metrics import r2_score

r2_score(y_test,dtc_gcv_pred)

0.6055205531868375

In [24]:
from sklearn.ensemble import RandomForestRegressor

rf=RandomForestRegressor(random_state=42,oob_score=True)
rf.fit(X_train_final,y_train)

rf_pred=rf.predict(X_test_final)
r2_score(y_test,rf_pred)

0.8912773572512147

In [25]:
hyp_dict={ 'max_depth':[10,12,13,16,15,18],
           'min_samples_split':[8,10,14],
            'max_leaf_nodes':[10,12],
            'max_features':['sqrt','log2'],
            'n_estimators':[100,150,170,180]
         }

gcv_rf=GridSearchCV(estimator=rf,param_grid=hyp_dict,cv=10)

gcv_rf.fit(X_train_final,y_train)

gcv_rf.best_params_
gcv_rf.best_score_

rf_gcv_pred=gcv_rf.predict(X_test_final)

In [26]:
from sklearn.linear_model import SGDRegressor

sgd=SGDRegressor(loss='squared_loss', max_iter=1000, eta0=0.0001,learning_rate='invscaling',warm_start=True,verbose=3)

sgd.fit(X_train_final,y_train)

-- Epoch 1
Norm: 18168.34, NNZs: 237, Bias: 3321.715635, T: 1168, Avg. loss: 10051594549.913202
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 24456.47, NNZs: 237, Bias: 4455.166003, T: 2336, Avg. loss: 4608869407.991362
Total training time: 0.00 seconds.
-- Epoch 3
Norm: 27808.10, NNZs: 237, Bias: 5044.779093, T: 3504, Avg. loss: 3239727039.482975
Total training time: 0.00 seconds.
-- Epoch 4
Norm: 29822.15, NNZs: 237, Bias: 5386.139585, T: 4672, Avg. loss: 2738878269.628229
Total training time: 0.01 seconds.
-- Epoch 5
Norm: 31110.72, NNZs: 237, Bias: 5592.221655, T: 5840, Avg. loss: 2515566786.325790
Total training time: 0.01 seconds.
-- Epoch 6
Norm: 31996.05, NNZs: 237, Bias: 5722.901505, T: 7008, Avg. loss: 2395065492.098486
Total training time: 0.01 seconds.
-- Epoch 7
Norm: 32661.28, NNZs: 237, Bias: 5812.379693, T: 8176, Avg. loss: 2317352341.021859
Total training time: 0.01 seconds.
-- Epoch 8
Norm: 33124.49, NNZs: 237, Bias: 5863.539611, T: 9344, Avg. loss: 2260651844.0

Norm: 55830.39, NNZs: 237, Bias: 6488.004516, T: 281488, Avg. loss: 994454994.199224
Total training time: 0.18 seconds.
-- Epoch 242
Norm: 55882.23, NNZs: 237, Bias: 6490.098829, T: 282656, Avg. loss: 993208819.194085
Total training time: 0.19 seconds.
-- Epoch 243
Norm: 55934.01, NNZs: 237, Bias: 6492.202886, T: 283824, Avg. loss: 991967139.217240
Total training time: 0.19 seconds.
-- Epoch 244
Norm: 55987.73, NNZs: 237, Bias: 6494.943635, T: 284992, Avg. loss: 990726822.317061
Total training time: 0.19 seconds.
-- Epoch 245
Norm: 56038.54, NNZs: 237, Bias: 6496.812756, T: 286160, Avg. loss: 989501840.545599
Total training time: 0.19 seconds.
-- Epoch 246
Norm: 56089.76, NNZs: 237, Bias: 6498.834484, T: 287328, Avg. loss: 988278837.072924
Total training time: 0.19 seconds.
-- Epoch 247
Norm: 56140.05, NNZs: 237, Bias: 6500.599230, T: 288496, Avg. loss: 987055802.863384
Total training time: 0.19 seconds.
-- Epoch 248
Norm: 56192.12, NNZs: 237, Bias: 6502.957905, T: 289664, Avg. loss: 9

Norm: 66522.47, NNZs: 237, Bias: 6914.818209, T: 584000, Avg. loss: 789554768.809460
Total training time: 0.38 seconds.
-- Epoch 501
Norm: 66555.80, NNZs: 237, Bias: 6916.011507, T: 585168, Avg. loss: 789055075.794594
Total training time: 0.38 seconds.
-- Epoch 502
Norm: 66588.69, NNZs: 237, Bias: 6917.049913, T: 586336, Avg. loss: 788553143.231804
Total training time: 0.38 seconds.
-- Epoch 503
Norm: 66622.43, NNZs: 237, Bias: 6918.428056, T: 587504, Avg. loss: 788051533.660461
Total training time: 0.38 seconds.
-- Epoch 504
Norm: 66655.64, NNZs: 237, Bias: 6919.619440, T: 588672, Avg. loss: 787555357.189747
Total training time: 0.38 seconds.
-- Epoch 505
Norm: 66689.36, NNZs: 237, Bias: 6921.015935, T: 589840, Avg. loss: 787056070.250412
Total training time: 0.39 seconds.
-- Epoch 506
Norm: 66722.80, NNZs: 237, Bias: 6922.317835, T: 591008, Avg. loss: 786561372.070583
Total training time: 0.39 seconds.
-- Epoch 507
Norm: 66755.11, NNZs: 237, Bias: 6923.220304, T: 592176, Avg. loss: 7

Norm: 70494.35, NNZs: 237, Bias: 7046.522627, T: 735840, Avg. loss: 734169278.731246
Total training time: 0.56 seconds.
-- Epoch 631
Norm: 70522.89, NNZs: 237, Bias: 7047.583893, T: 737008, Avg. loss: 733809248.884665
Total training time: 0.56 seconds.
-- Epoch 632
Norm: 70550.44, NNZs: 237, Bias: 7048.262367, T: 738176, Avg. loss: 733450180.757627
Total training time: 0.56 seconds.
-- Epoch 633
Norm: 70578.45, NNZs: 237, Bias: 7049.141462, T: 739344, Avg. loss: 733093008.381746
Total training time: 0.56 seconds.
-- Epoch 634
Norm: 70607.05, NNZs: 237, Bias: 7050.266471, T: 740512, Avg. loss: 732733371.254499
Total training time: 0.56 seconds.
-- Epoch 635
Norm: 70634.60, NNZs: 237, Bias: 7050.982465, T: 741680, Avg. loss: 732378036.850389
Total training time: 0.56 seconds.
-- Epoch 636
Norm: 70662.70, NNZs: 237, Bias: 7051.928742, T: 742848, Avg. loss: 732022904.497232
Total training time: 0.57 seconds.
-- Epoch 637
Norm: 70690.26, NNZs: 237, Bias: 7052.675385, T: 744016, Avg. loss: 7

Norm: 75575.42, NNZs: 237, Bias: 7185.385853, T: 974112, Avg. loss: 674851968.161119
Total training time: 0.75 seconds.
-- Epoch 835
Norm: 75597.40, NNZs: 237, Bias: 7185.841603, T: 975280, Avg. loss: 674617479.368888
Total training time: 0.75 seconds.
-- Epoch 836
Norm: 75619.24, NNZs: 237, Bias: 7186.247544, T: 976448, Avg. loss: 674383155.257170
Total training time: 0.75 seconds.
-- Epoch 837
Norm: 75641.00, NNZs: 237, Bias: 7186.627975, T: 977616, Avg. loss: 674148894.834019
Total training time: 0.75 seconds.
-- Epoch 838
Norm: 75663.14, NNZs: 237, Bias: 7187.176178, T: 978784, Avg. loss: 673916788.683707
Total training time: 0.75 seconds.
-- Epoch 839
Norm: 75685.16, NNZs: 237, Bias: 7187.677458, T: 979952, Avg. loss: 673682625.259577
Total training time: 0.75 seconds.
-- Epoch 840
Norm: 75706.58, NNZs: 237, Bias: 7187.938633, T: 981120, Avg. loss: 673446075.717628
Total training time: 0.76 seconds.
-- Epoch 841
Norm: 75728.64, NNZs: 237, Bias: 7188.473925, T: 982288, Avg. loss: 6

SGDRegressor(eta0=0.0001, verbose=3, warm_start=True)