# **House Prices: Data Cleaning, Feature Engineering and Modeling**

In [1]:
#Importing important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import sklearn.metrics as metrics
import math

In [1]:
# Read the competition's training and test sets from the Kaggle input directory
df_train = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
df_test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")

# Work on copies so the frames loaded above are left as read from disk
c_train, c_test = df_train.copy(), df_test.copy()

In [1]:
# Preview the top five rows of the training data
df_train.head(n=5)

In [1]:
# Preview the top five rows of the test data
df_test.head(n=5)

In [1]:
# Stack train and test into one frame so cleaning and encoding are applied
# identically to both. The 'train' flag (1 = train row, 0 = test row) is what
# later cells use to split the combined frame apart again.

c_train['train'] = 1
c_test['train'] = 0

# ignore_index=True gives the combined frame a unique 0..n-1 index. Without it
# the row labels of both inputs are kept, so labels repeat across the two parts
# and label-based alignment/selection on the combined frame becomes ambiguous.
df = pd.concat([c_train, c_test], axis=0, sort=False, ignore_index=True)

df.head()

In [1]:
# Share of missing values per column, expressed as a percentage of all rows

df_col_name = df.columns
values = [(df[col].isnull().sum() / df.shape[0]) * 100 for col in df_col_name]

# One row per feature: its name and its missing-value percentage
df_nan = pd.DataFrame({'Feature': df_col_name, 'percent': values})
df_nan

In [1]:
# Keep only the features whose missing-value share exceeds 50%,
# then display them from most to least missing

over_half_missing = df_nan['percent'] > 50
df_nan = df_nan.loc[over_half_missing]
df_nan.sort_values(by="percent", ascending=False)

In [1]:
# Dropping the columns with more than 50% of missing values

sparse_columns = ['Alley', 'PoolQC', 'Fence', 'MiscFeature']
df = df.drop(columns=sparse_columns)

In [1]:
# Split the frame by dtype: object-typed columns vs. everything else

df_object = df.select_dtypes(include='object')
df_numerical = df.select_dtypes(exclude='object')

In [1]:
# Count of missing entries in each object-typed column

null_counts = df_object.isna().sum()
null_counts

In [1]:
# For these basement / garage / fireplace columns the missing entries are
# replaced with the explicit category 'None'.
none_col = ['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','GarageType','GarageFinish','GarageQual',
            'FireplaceQu','GarageCond']
df_object[none_col] = df_object[none_col].fillna('None')

# Fill whatever is still missing with each column's most frequent value.
# DataFrame.mode() lays the modes out row-wise, so .iloc[0] is a Series holding
# the first mode of every column; fillna then matches it to columns by name.
# (Previously the literal string 'mode()' was written into the cells instead.)
df_object = df_object.fillna(df_object.mode().iloc[0])

In [1]:
# Count of missing entries in each non-object column

null_counts = df_numerical.isna().sum()
null_counts

In [1]:
# Forward-fill LotFrontage: each missing value takes the last observed value
# above it. Series.ffill() replaces the deprecated fillna(method='ffill').

df_numerical['LotFrontage'] = df_numerical['LotFrontage'].ffill()

# A missing GarageYrBlt takes the same row's YrSold. The result is assigned
# back instead of calling fillna(inplace=True) on the selected column, which
# can act on a temporary object and leave df_numerical unchanged.

df_numerical['GarageYrBlt'] = df_numerical['GarageYrBlt'].fillna(df_numerical['YrSold'])

# Every remaining missing numeric value becomes zero

df_numerical = df_numerical.fillna(0)

In [1]:
# New feature 'Age_House': year sold minus year built

year_sold = df_numerical['YrSold']
year_built = df_numerical['YearBuilt']
df_numerical['Age_House'] = year_sold - year_built

In [1]:
# Show the rows, if any, whose Age_House is negative

df_numerical.loc[df_numerical['Age_House'].lt(0)]

In [1]:
# Rows sold before they were built give a negative Age_House:
# overwrite their YrSold with 2009, then recompute the age for all rows

sold_before_built = df_numerical['YrSold'] < df_numerical['YearBuilt']
df_numerical.loc[sold_before_built, 'YrSold'] = 2009

df_numerical['Age_House'] = df_numerical['YrSold'] - df_numerical['YearBuilt']

In [1]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# One encoder instance, reused (and refit) for every categorical column
le = LabelEncoder()

In [1]:
# Replace every object column with integer codes; the shared encoder is
# refit on each column in turn

for column_name in list(df_object.columns):
    encoded = le.fit_transform(df_object[column_name])
    df_object[column_name] = encoded

In [1]:
# Put the encoded object columns and the numerical columns side by side

feature_parts = (df_object, df_numerical)
df_final = pd.concat(feature_parts, axis='columns', sort=False)
df_final.head()

In [1]:
# 'Id' is not used as a feature
df_final = df_final.drop(columns=['Id'])

# Training rows are flagged train == 1; the flag itself is then removed
is_train_row = df_final['train'] == 1
df_train = df_final.loc[is_train_row].drop(columns=['train'])

# Test rows are flagged train == 0; remove SalePrice and the flag from them
is_test_row = df_final['train'] == 0
df_test = df_final.loc[is_test_row].drop(columns=['SalePrice', 'train'])

In [1]:
# Target (dependent variable) and feature matrix (independent variables) from df_train
y = df_train['SalePrice']
X = df_train.drop(columns=['SalePrice'])

In [1]:
# Hold out 30% of df_train for evaluation; fixed seed keeps the split repeatable

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

#### Building model with Linear Regression:

In [1]:
# Ordinary least-squares regression from scikit-learn
from sklearn.linear_model import LinearRegression

# Instantiate the estimator, then fit it on the training split;
# `model` is the fitted estimator returned by fit()
lin_reg = LinearRegression()
model = lin_reg.fit(X=X_train, y=y_train)


In [1]:
# Linear-regression predictions for the held-out split
y_pred = model.predict(X=X_test)

In [1]:
# Actual vs. predicted sale prices for the held-out rows, side by side
df1 = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
df1

In [1]:
# Model Evaluation

# Importing metrics from sklearn
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Mean Absolute Error (MAE). Stored as `mae`: it was previously kept in a
# variable named `mse`, although it is not a mean squared error.
mae = mean_absolute_error(y_test, y_pred)
print("MAE:", mae)

# Root Mean Squared Error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)

# Coefficient of determination (R^2)
r2 = r2_score(y_test, y_pred)
print("R-Square:", r2)

#### Using XGBoost and LGBM:

In [1]:
# Gradient-boosted trees with a small learning rate and many estimators.
# - objective: 'reg:squarederror' is the current name of the squared-error
#   objective; 'reg:linear' is its deprecated alias.
# - `silent` and `nthread` are no longer passed: both were given as None and
#   are superseded by `verbosity` and `n_jobs`, which are set explicitly here.
xgb = XGBRegressor(
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.6,
    gamma=0,
    importance_type='gain',
    learning_rate=0.01,
    max_delta_step=0,
    max_depth=4,
    min_child_weight=1.5,
    n_estimators=2400,
    n_jobs=1,
    objective='reg:squarederror',
    reg_alpha=0.6,
    reg_lambda=0.6,
    scale_pos_weight=1,
    subsample=0.8,
    verbosity=1,
)

# Fit on the training split only, so the held-out split can be used for scoring
xgb.fit(X_train, y_train)


In [1]:
# XGBoost predictions on the held-out split and their RMSE
predict1 = xgb.predict(X_test)

xgb_rmse = math.sqrt(metrics.mean_squared_error(y_test, predict1))
print('RMSE using XGBoost: ', xgb_rmse)

In [1]:
# Hyper-parameters for the LightGBM regressor, collected in one mapping
lgbm_params = dict(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=12000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.4,
)
lgbm = LGBMRegressor(**lgbm_params)

# Fit on the training split
lgbm.fit(X_train, y_train, eval_metric='rmse')


In [1]:
# LightGBM predictions on the held-out split and their RMSE
predict2 = lgbm.predict(X_test)

lgbm_rmse = math.sqrt(metrics.mean_squared_error(y_test, predict2))
print('RMSE using LGBM: ', lgbm_rmse)

In [1]:
# Refit both boosters on all of X / y (training and held-out rows together)
xgb.fit(X=X, y=y)
lgbm.fit(X=X, y=y, eval_metric='rmse')

In [1]:
# Predictions for the processed test features: predict3 from LightGBM, predict4 from XGBoost
predict3, predict4 = lgbm.predict(df_test), xgb.predict(df_test)

In [1]:
# Weighted blend of the two models: 55% LightGBM, 45% XGBoost
predict5 = 0.55 * predict3 + 0.45 * predict4

In [1]:
# Side-by-side view of the LightGBM, XGBoost and blended predictions
prediction_columns = {
    'predict3': predict3,
    'predict4': predict4,
    'predict5': predict5,
}
pd.DataFrame(prediction_columns)

In [1]:
# Load the test CSV again: df_test was replaced by the processed feature frame,
# from which 'Id' was dropped, and the submissions need the Id column
df_test = pd.read_csv(filepath_or_buffer="../input/house-prices-advanced-regression-techniques/test.csv")

In [1]:
# Write the LightGBM predictions (predict3) as an Id / SalePrice submission file
submission = df_test[["Id"]].assign(SalePrice=predict3)
submission.to_csv('submission3.csv', index=False)

In [1]:
# Write the XGBoost predictions (predict4) as an Id / SalePrice submission file
submission = df_test[["Id"]].assign(SalePrice=predict4)
submission.to_csv('submission4.csv', index=False)

In [1]:
# Write the blended predictions (predict5) as an Id / SalePrice submission file
submission = df_test[["Id"]].assign(SalePrice=predict5)
submission.to_csv('submission5.csv', index=False)