In [None]:
#Library for mathematical computations
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#Library for Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error 

#Libraries for Normalization and Encoding
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures

#Libraries for Deep Learning
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten

#Library for Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

%matplotlib inline

#Listing every file found under the Kaggle input directory
#(this only prints the paths; the CSV files themselves are loaded in a later cell)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
#Loading the two CSV files into DataFrames: train_data (Training set) and test_data (Testing set)
train_data=pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test_data=pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
#Displaying the first 10 rows of the Training Data
train_data.head(10)

In [None]:
#Displaying the first 10 rows of the Test Data
test_data.head(10)

# Preparing Training Data

**Cleaning Data**

Most of the time, the datasets being used are not clean. The most common data-cleaning step is handling missing data.

First, the amount of data missing from each column will be checked. Then the columns with more than 80% missing data will be removed. In the remaining columns, missing values will be filled with a replacement value if the column is categorical, or with the median, the average or 0 if the column is numerical, depending on the type of numerical data.

Checking for outliers is also one of the major steps of data cleaning. An outlier is a data point that is much bigger or smaller than the next nearest data point. Having an outlier in the data can make a huge difference in how the model learns, and the results might be skewed.

In [None]:
#Displaying Metadata/Information about Training Data (columns, non-null counts and dtypes)
train_data.info()

In [None]:
#Displaying the summary statistics of the data columns
train_data.describe()

In [None]:
#Percentage of missing values in every column of the Training Data
missing_per_column=train_data.isnull().sum()
Miss_Percent=100*(missing_per_column/len(train_data))

#Keeping only the columns that actually have missing data,
#largest percentage first, rounded to one decimal place
has_missing=Miss_Percent>0
Miss_Percent=Miss_Percent[has_missing].sort_values(ascending=False).round(1)

#Turning the Series into a one-column table: the unnamed Series becomes
#column 0, which is then given a readable header
DataFrame=pd.DataFrame(Miss_Percent)
miss_percent_table=DataFrame.rename(columns={0:'% of Missing Values'})
MissPercent=miss_percent_table

#Displaying Missing Value table
MissPercent

In [None]:
#Dropping the columns with more than 80% missing values, plus Id, which is
#not expected to have much impact on predicting the sale price
sparse_or_id_columns=['Id','Alley','PoolQC','MiscFeature','Fence']
train_data=train_data.drop(columns=sparse_or_id_columns)

**Checking and Filling Missing Data Columns**

In [None]:
#Listing the distinct values of the FireplaceQu column (missing entries included)
train_data['FireplaceQu'].unique()

In [None]:
#Filling the missing FireplaceQu entries with the string 'None'
train_data=train_data.fillna({'FireplaceQu':'None'})

In [None]:
#Listing the distinct values of the LotFrontage column (missing entries included)
train_data['LotFrontage'].unique()

In [None]:
#Filling the missing LotFrontage entries with the median of the column
lot_frontage_median=train_data['LotFrontage'].median()
train_data=train_data.fillna({'LotFrontage':lot_frontage_median})

In [None]:
#Listing the distinct values of the GarageCond column (missing entries included)
train_data['GarageCond'].unique()

In [None]:
#Filling the missing GarageCond entries with the string 'None'
train_data=train_data.fillna({'GarageCond':'None'})

In [None]:
#Listing the distinct values of the GarageQual column (missing entries included)
train_data['GarageQual'].unique()

In [None]:
#Filling the missing GarageQual entries with the string 'None'
train_data=train_data.fillna({'GarageQual':'None'})

In [None]:
#Listing the distinct values of the GarageFinish column (missing entries included)
train_data['GarageFinish'].unique()

In [None]:
#Filling the missing GarageFinish entries with the string 'None'
train_data=train_data.fillna({'GarageFinish':'None'})

In [None]:
#Listing the distinct values of the GarageYrBlt column (missing entries included)
train_data['GarageYrBlt'].unique()

In [None]:
#Filling the missing GarageYrBlt entries with the median of the column
garage_year_median=train_data['GarageYrBlt'].median()
train_data=train_data.fillna({'GarageYrBlt':garage_year_median})

In [None]:
#Listing the distinct values of the GarageType column (missing entries included)
train_data['GarageType'].unique()

In [None]:
#Filling the missing GarageType entries with the string 'None'
train_data=train_data.fillna({'GarageType':'None'})

In [None]:
#Listing the distinct values of the BsmtFinType2 column (missing entries included)
train_data['BsmtFinType2'].unique()

In [None]:
#Filling the missing BsmtFinType2 entries with the string 'None'
train_data=train_data.fillna({'BsmtFinType2':'None'})

In [None]:
#Listing the distinct values of the BsmtFinType1 column (missing entries included)
train_data['BsmtFinType1'].unique()

In [None]:
#Filling the missing BsmtFinType1 entries with the string "None"
train_data=train_data.fillna({'BsmtFinType1':"None"})

In [None]:
#Listing the distinct values of the BsmtExposure column (missing entries included)
train_data['BsmtExposure'].unique()

In [None]:
#Filling the missing BsmtExposure entries with the string 'None'
train_data=train_data.fillna({'BsmtExposure':'None'})

In [None]:
#Listing the distinct values of the BsmtCond column (missing entries included)
train_data['BsmtCond'].unique()

In [None]:
#Filling the missing BsmtCond entries with the string 'None'
train_data=train_data.fillna({'BsmtCond':'None'})

In [None]:
#Listing the distinct values of the BsmtQual column (missing entries included)
train_data['BsmtQual'].unique()

In [None]:
#Filling the missing BsmtQual entries with the string 'None'
train_data=train_data.fillna({'BsmtQual':'None'})

In [None]:
#Listing the distinct values of the MasVnrArea column (missing entries included)
train_data['MasVnrArea'].unique()

In [None]:
#Filling the missing MasVnrArea entries with Zero (0)
train_data=train_data.fillna({'MasVnrArea':0})

In [None]:
#Listing the distinct values of the MasVnrType column (missing entries included)
train_data['MasVnrType'].unique()

In [None]:
#Replacing Null value with Others as there already is a value called None.
#The fill must read from MasVnrType itself: reading from MasVnrArea would
#overwrite the veneer type with the numeric veneer area.
train_data['MasVnrType']=train_data['MasVnrType'].fillna("Others")

In [None]:
#Listing the distinct values of the Electrical column (missing entries included)
train_data['Electrical'].unique()

In [None]:
#Filling the missing Electrical entries with the string 'None'
train_data=train_data.fillna({'Electrical':'None'})

**Correlation measures how strongly two data columns are linearly related. A value close to 1 means the columns are strongly positively related, a value close to -1 means they are strongly negatively related, and a value close to 0 means there is little linear relationship.**

Visualizing the correlation in a heatmap makes it easier to point out which data columns are strongly related to SalePrice.

In [None]:
#Calculating Correlation between the numeric data columns only.
#At this point train_data still contains text columns; they cannot be
#correlated, and recent pandas versions raise an error if they are passed
#to corr(), so they are excluded explicitly.
correlation=train_data.select_dtypes(include='number').corr()

#Keeping the columns whose correlation with SalePrice is stronger than 0.4 in
#absolute value (so strongly negative columns would be kept as well)
Positive_Related=correlation.index[abs(correlation["SalePrice"])>0.4]

#Plotting the Correlation in HeatMap for the data columns selected above
plt.figure(figsize=(12,12))
Corr_Heatmap=sns.heatmap(train_data[Positive_Related].corr(),annot=True,cmap="GnBu")

*Checking for Outliers*

Here we check whether there are any outliers in the data columns OverallQual and GrLivArea, as these columns have a higher correlation with SalePrice than the rest of the dataset.

Box plots and scatter plots are the most commonly used tools for checking whether a data column contains outliers.

In [None]:
#Box plot of SalePrice for each OverallQual value, drawn on an explicit 12x12 figure
boxplot_figure,boxplot_axes=plt.subplots(figsize=(12,12))
sns.boxplot(x='OverallQual',y='SalePrice',data=train_data,ax=boxplot_axes)

In [None]:
#Scatter plot of SalePrice against GrLivArea, drawn on an explicit 12x12 figure
scatter_figure,scatter_axes=plt.subplots(figsize=(12,12))
sns.scatterplot(x='GrLivArea',y='SalePrice',data=train_data,ax=scatter_axes)

In [None]:
#Deleting Outliers: rows with GrLivArea above 4000 together with SalePrice above 300000
#NOTE(review): this removes the houses that are both very large AND expensive.
#If the intent was to remove very large houses that sold unusually cheaply,
#the SalePrice comparison should be '<' - confirm against the scatter plot above.
train_data=train_data.drop(train_data[(train_data['GrLivArea']>4000)&(train_data['SalePrice']>300000)].index)

**Converting Numerical Variables that are actually Categorical**

In [None]:
#Columns stored as numbers but treated as categories from here on:
#MSSubClass (the building class), OverallCond, and the year/month of sale.
#Each one is converted to its string representation.
for numeric_category in ['MSSubClass','OverallCond','YrSold','MoSold']:
    train_data[numeric_category]=train_data[numeric_category].astype(str)

**Label Encoding the Categorical Columns**

In [None]:
#Columns whose values are replaced by integer labels, listed in processing order
label_encoded_columns=[
    'MSSubClass','Street','LotShape','LandSlope','ExterQual','ExterCond',
    'BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2',
    'HeatingQC','CentralAir','KitchenQual','Functional','FireplaceQu',
    'GarageFinish','GarageQual','GarageCond','PavedDrive','MSZoning',
    'LandContour','Utilities','LotConfig','Neighborhood','Condition1',
    'Condition2','BldgType','HouseStyle','OverallCond','RoofStyle',
    'RoofMatl','Exterior1st','Exterior2nd','Foundation','Heating',
    'Electrical','GarageType','MoSold','YrSold','SaleType','SaleCondition',
]

#One LabelEncoder instance is re-fitted on each column in turn, so after the
#loop it only remembers the classes of the last column processed
encoder=preprocessing.LabelEncoder()
for column_name in label_encoded_columns:
    train_data[column_name]=encoder.fit_transform(train_data[column_name])

In [None]:
#Displaying the first rows of the Training Data after label encoding
train_data.head()

# Preparing Test Data

In [None]:
#Keeping the Id column aside (it is used later to label the predictions), then
#dropping the same columns that were dropped from the Training data
test_id=test_data['Id']
dropped_test_columns=['Id','Alley','PoolQC','MiscFeature','Fence']
test_data=test_data.drop(columns=dropped_test_columns)

In [None]:
#Converting every Test data column into integer codes with pd.factorize
#(distinct values are numbered in order of first appearance; missing values become -1)
#NOTE(review): this is applied to ALL columns, so numeric columns are replaced by
#codes as well, and the codes are derived from the Test data alone - confirm that
#they are meant to line up with the encoding used for the Training data.
test_data=test_data.apply(lambda x:pd.factorize(x)[0])

In [None]:
#Converting every Training data column into integer codes with pd.factorize
#(distinct values are numbered in order of first appearance)
new_train_data=train_data.apply(lambda x:pd.factorize(x)[0])

#test_data is deliberately NOT factorized again here: it was already converted
#to integer codes in the previous cell. A second pass would renumber those codes
#and turn the -1 that marks missing values into an ordinary category code.

# Preparing For Modelling

In [None]:
#Features: every factorized Training column except the target
X_train=new_train_data.drop(columns=['SalePrice'])

#Target: SalePrice taken from train_data, not from the factorized copy
Y_train=train_data['SalePrice']

#The prepared Test data is used as-is for prediction
X_test=test_data

#Displaying the shapes of the three sets
X_train.shape, Y_train.shape, X_test.shape

# Building and Training the Model, Predicting and Displaying the Output

**Random Forest Regressor**

In [None]:
#Building the pipeline: standardize -> polynomial feature expansion -> random forest.
#random_state is fixed so that re-running the notebook reproduces the same
#forest and therefore the same score and predictions.
Input=[('scale',StandardScaler()),('polynomial', PolynomialFeatures(include_bias=False)),('model',RandomForestRegressor(n_estimators=50,random_state=42))]
pipe=Pipeline(Input)

#Training the Model and predicting the Test set
pipe.fit(X_train,Y_train)
pipe_pred_RFF = pipe.predict(X_test)

#Displaying the Score of the Model. It is computed on the Training data
#itself, so it measures the fit on rows the model has already seen.
print(pipe.score(X_train,Y_train))

In [None]:
#Creating Dataframe that pairs each Test Id with its Random Forest prediction
output=pd.DataFrame({'Id':test_id,'SalePrice':pipe_pred_RFF})
print(output)

**Linear Regressor**

In [None]:
#Pipeline steps: standardize -> polynomial feature expansion -> linear regression
InputLR=[
    ('scale',StandardScaler()),
    ('polynomial',PolynomialFeatures(include_bias=False)),
    ('model',LinearRegression()),
]
pipeLR=Pipeline(steps=InputLR)

#Training the Model and predicting the Test set
pipeLR.fit(X_train,Y_train)
pipe_pred_LR=pipeLR.predict(X_test)

#Displaying the Score of the Model, computed on the Training data
training_score_LR=pipeLR.score(X_train,Y_train)
print(training_score_LR)

In [None]:
#Creating Dataframe that pairs each Test Id with its Linear Regression prediction
linear_output=pd.DataFrame({'Id':test_id,'SalePrice':pipe_pred_LR})
print(linear_output)

**Lasso regression**

In [None]:
#Pipeline steps: standardize -> polynomial feature expansion -> Lasso regression (alpha=1.0)
InputLassoR=[
    ('scale',StandardScaler()),
    ('polynomial',PolynomialFeatures(include_bias=False)),
    ('model',Lasso(alpha=1.0)),
]
pipeLassoR=Pipeline(steps=InputLassoR)

#Training the Model and predicting the Test set
pipeLassoR.fit(X_train,Y_train)
pipe_pred_LassoR=pipeLassoR.predict(X_test)

#Displaying the Score of the Model, computed on the Training data
training_score_LassoR=pipeLassoR.score(X_train,Y_train)
print(training_score_LassoR)

#Creating Dataframe that pairs each Test Id with its Lasso prediction
lasso_output=pd.DataFrame({'Id':test_id,'SalePrice':pipe_pred_LassoR})
print(lasso_output)

**Ridge Regression**

# Using Deep Learning to Predict 

**Building Model**

In [None]:
#Fully connected regression network: 128 -> 256 -> 256 -> 256 -> 1
NN_model = Sequential()

# Input layer: one input per column of X_train
NN_model.add(Dense(128, input_dim=X_train.shape[1], kernel_initializer='normal', activation='relu'))

# Three identical hidden layers
for hidden_layer_number in range(3):
    NN_model.add(Dense(256, kernel_initializer='normal', activation='relu'))

# Output layer: a single linear unit producing the predicted value
NN_model.add(Dense(1, kernel_initializer='normal', activation='linear'))

# Compiling with mean absolute error as both the loss and the reported metric
NN_model.compile(optimizer='adam', loss='mean_absolute_error', metrics=['mean_absolute_error'])
NN_model.summary()

**Define a checkpoint callback**

In [None]:
#File name pattern for saved weights: the epoch number and validation loss are filled in
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5'
#Saving a checkpoint only when the monitored val_loss improves (save_best_only=True)
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
callbacks_list = [checkpoint]

**Training Model**

In [None]:
#Training for 500 epochs in batches of 32, with 20% of the Training rows held out
#for validation; the checkpoint callback is invoked during training
NN_model.fit(X_train, Y_train, epochs=500, batch_size=32, validation_split = 0.2, callbacks=callbacks_list)

**Predicting the Sale Prices**

In [None]:
#Predicting the Test set with the trained network
DNN_predict=NN_model.predict(X_test)

In [None]:
#Creating dataframe that pairs each Test Id with its neural-network prediction
#(column 0 of the prediction array is taken to get a flat list of values)
DNN_output=pd.DataFrame({'Id':test_id,'SalePrice':(DNN_predict[:,0])})
print(DNN_output)

# Storing the Prediction into a CSV File

In [None]:
#Writing the Linear Regression predictions to a CSV file, without the index column
#NOTE(review): only linear_output is saved; the Random Forest, Lasso and
#neural-network predictions are not written anywhere - confirm this is the intended model.
linear_output.to_csv("House_Price_Prediction.csv",index=False)
print("Completed")