In [2]:
import pandas as pd

import numpy as np

import warnings

warnings.filterwarnings('ignore')

In [3]:
house_price_df = pd.read_csv('House_Price_Data.csv')

house_price_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [4]:
Y = house_price_df['SalePrice']

house_price_df.drop(columns = 'SalePrice', inplace=True)

In [5]:
na_df = pd.DataFrame({'Column_Name':house_price_df.columns,
                      'Missing_Val_Count':house_price_df.isnull().sum(),
                      'Missing_Val_Per':100*(house_price_df.isnull().sum()/house_price_df.shape[0])})

na_df.sort_values(by='Missing_Val_Per',ascending = False)

Unnamed: 0,Column_Name,Missing_Val_Count,Missing_Val_Per
PoolQC,PoolQC,1453,99.520548
MiscFeature,MiscFeature,1406,96.301370
Alley,Alley,1369,93.767123
Fence,Fence,1179,80.753425
FireplaceQu,FireplaceQu,690,47.260274
...,...,...,...
TotalBsmtSF,TotalBsmtSF,0,0.000000
Heating,Heating,0,0.000000
MSSubClass,MSSubClass,0,0.000000
CentralAir,CentralAir,0,0.000000


In [6]:
#Dropping unnecessary columns

cols_to_drop = list(na_df[na_df['Missing_Val_Per']>50].index)

for col in house_price_df.columns:
    if len(house_price_df[col].unique())==1 or len(house_price_df[col].unique())==house_price_df.shape[0]:
        cols_to_drop.append(col)
cols_to_drop

['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'Id']

In [7]:
house_price_df.drop(columns = cols_to_drop, inplace = True)

In [8]:
num_cols = [col for col in house_price_df.columns if house_price_df[col].dtype =='int64' or house_price_df[col].dtype=='float64']

cat_cols = [col for col in house_price_df.columns if house_price_df[col].dtype == 'object']

# Train Test Split

In [9]:
#Train Test Split

from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(house_price_df,Y,test_size=0.2,random_state=42)

In [10]:
for col in num_cols:
    x_train[col] = x_train[col].fillna(x_train[col].mean())
    x_test[col] = x_test[col].fillna(x_train[col].mean()) #train mean is used becasue you shouln't use test data to train your model/it should not be leaked in any way

# Scaling Continuous Variables

In [11]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler

In [12]:
minmaxscaler = MinMaxScaler()   #MinMaxScaler() has been replaced with StandardScaler as the model was not predicting appropriate numbers for sale proce (-ve's were being predicted), hence it is always a trial and error method.

#if minmaxscaler doesn't work properly, you can use standardscaler. In this case minmax is fine. Ignore the above comment

for col in num_cols:
    x_train[col] = minmaxscaler.fit_transform(np.array(x_train[col]).reshape(-1,1))
    x_test[col] = minmaxscaler.transform(np.array(x_test[col]).reshape(-1,1))

# One Hot Encoding Categorical Columns

In [13]:
oe_train_df = pd.get_dummies(x_train[cat_cols])
oe_test_df = pd.get_dummies(x_test[cat_cols])

In [14]:
#This is to have equal number of columns in X_train and x_test as you cannot feed unequal number of columns to the model.(Gives an error otherwise)

x_train_oe,x_test_oe = oe_train_df.align(oe_test_df,join='inner',axis=1,fill_value=0)

In [15]:
#Adding both con and cat columns so that we can feed it to the model

x_train_final = pd.concat([x_train_oe,x_train[num_cols]],axis=1)
x_test_final = pd.concat([x_test_oe,x_test[num_cols]],axis=1)

In [16]:
x_train_final

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,LotShape_IR1,LotShape_IR2,LotShape_IR3,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
254,0,0,0,1,0,0,1,0,0,0,...,0.207334,0.291715,0.000000,0.000000,0.0,0.0,0.0,0.0,0.454545,1.00
1066,0,0,0,1,0,0,1,1,0,0,...,0.267983,0.000000,0.073126,0.000000,0.0,0.0,0.0,0.0,0.363636,0.75
638,0,0,0,1,0,0,1,0,0,0,...,0.000000,0.382730,0.000000,0.297101,0.0,0.0,0.0,0.0,0.363636,0.50
799,0,0,0,1,0,0,1,0,0,0,...,0.169252,0.000000,0.000000,0.478261,0.0,0.0,0.0,0.0,0.454545,0.25
380,0,0,0,1,0,0,1,0,0,0,...,0.217207,0.000000,0.000000,0.438406,0.0,0.0,0.0,0.0,0.363636,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,0,0,0,1,0,0,1,1,0,0,...,0.310296,0.000000,0.040219,0.000000,0.0,0.0,0.0,0.0,0.181818,0.25
1130,0,0,0,1,0,0,1,0,0,0,...,0.406206,0.502917,0.080439,0.000000,0.0,0.0,0.0,0.0,1.000000,0.75
1294,0,0,0,1,0,0,1,0,0,0,...,0.403385,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.272727,0.00
860,0,0,0,1,0,0,1,0,0,0,...,0.152327,0.000000,0.438757,0.000000,0.0,0.0,0.0,0.0,0.454545,0.25


# Training a Linear Regression Model

In [17]:
from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score

linreg = LinearRegression()

linreg.fit(x_train_final,y_train)

predictions = linreg.predict(x_test_final)

In [18]:
r2_score(y_test,predictions)

0.8752850303337805