## Project: Linear Regression Model to Predict House Prices

##### by Sarthak Shukla

### 1. Preprocessing

Importing Required Libraries

In [1]:
# Importing all the packages:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.linear_model import LinearRegression

Importing the Datasets

In [2]:
# Load the training and test CSVs; the test Ids are kept aside because the
# 'Id' column is removed from the feature frame below.
dtrain = pd.read_csv('train.csv')
dtest = pd.read_csv('test.csv')
dtestID = dtest['Id']

# Stack the train features (target removed) on top of the test features under
# a two-level index, so each part can be recovered with data.loc['train'] and
# data.loc['test'], then discard the identifier and five excluded columns.
data = (
    pd.concat([dtrain.drop(columns='SalePrice'), dtest], keys=['train', 'test'])
    .drop(columns=['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature', 'Utilities'])
)

Grouping the features by type

In [3]:
# Columns that hold calendar years.
years = [
    'YearBuilt',
    'YearRemodAdd',
    'GarageYrBlt',
    'YrSold',
]

# Continuous measurement columns (areas, lengths and a monetary value).
metrics = [
    'LotFrontage', 'LotArea', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'GarageArea', 'WoodDeckSF',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal',
]

In [4]:
# Features handled as numerical. Every column of the combined frame that is
# not listed here is treated as categorical.
num_feats = [
    # lot and overall ratings
    'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
    # years, masonry and exterior grades
    'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'ExterQual', 'ExterCond',
    # basement
    'BsmtQual', 'BsmtCond', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    # heating grade and floor areas
    'HeatingQC', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    # bathrooms and rooms
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
    # fireplaces and garage
    'Fireplaces', 'FireplaceQu', 'GarageYrBlt',
    'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
    # porches, pool and miscellaneous value
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'MiscVal',
    # year of sale
    'YrSold',
]

# Quality/condition columns whose letter grades are mapped to numbers.
# OverallQual and OverallCond are already integers in the raw data, so the
# replacement below leaves them unchanged.
grades = [
    'OverallQual', 'OverallCond',
    'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond',
    'HeatingQC', 'KitchenQual', 'FireplaceQu',
    'GarageQual', 'GarageCond',
]
literal = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
num = [9, 7, 5, 3, 2]
# Letter grade -> numeric score: {'Ex': 9, 'Gd': 7, 'TA': 5, 'Fa': 3, 'Po': 2}
G = {grade: score for grade, score in zip(literal, num)}

data[grades] = data[grades].replace(to_replace=G)

# Categorical features: every remaining column that is not listed in num_feats.
cat_feats = data.drop(columns=num_feats).columns

Viewing the train & test datasets

In [5]:
# Column names, non-null counts and dtypes of the training set.
dtrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [6]:
# (rows, columns) of the training set.
dtrain.shape

(1460, 81)

In [7]:
# First rows of the training set.
dtrain.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [8]:
# Column names, non-null counts and dtypes of the test set.
dtest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
Id               1459 non-null int64
MSSubClass       1459 non-null int64
MSZoning         1455 non-null object
LotFrontage      1232 non-null float64
LotArea          1459 non-null int64
Street           1459 non-null object
Alley            107 non-null object
LotShape         1459 non-null object
LandContour      1459 non-null object
Utilities        1457 non-null object
LotConfig        1459 non-null object
LandSlope        1459 non-null object
Neighborhood     1459 non-null object
Condition1       1459 non-null object
Condition2       1459 non-null object
BldgType         1459 non-null object
HouseStyle       1459 non-null object
OverallQual      1459 non-null int64
OverallCond      1459 non-null int64
YearBuilt        1459 non-null int64
YearRemodAdd     1459 non-null int64
RoofStyle        1459 non-null object
RoofMatl         1459 non-null object
Exterior1st      1458 non-

In [9]:
# (rows, columns) of the test set; it has one column fewer than the training
# set because it carries no SalePrice.
dtest.shape

(1459, 80)

In [10]:
# First rows of the test set.
dtest.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [11]:
# The model is trained on log1p(SalePrice) = log(1 + SalePrice).
price = np.log1p(dtrain['SalePrice'])

# Skewness of each continuous metric is measured on the training rows only.
# Metrics whose skewness exceeds 0.75 are log1p-transformed in the combined
# (train + test) frame.
train_skew = data.loc['train'][metrics].skew(skipna=True)
skewed_feats = train_skew[train_skew > 0.75].index

data[skewed_feats] = np.log1p(data[skewed_feats])

  if __name__ == '__main__':


#### Missing Values

In [12]:
# Total number of missing cells across all columns of the combined frame.
data.isnull().sum().sum() #Checking the number of Missing Values

3171

Filling the missing values of each affected column (numerical and categorical alike) with that column's mode

In [13]:
# Columns whose missing cells are filled with the column's most frequent
# value. The mode is computed over the combined train + test frame.
mode_imputed_cols = [
    'MSZoning', 'LotFrontage', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'SaleType', 'MasVnrArea', 'TotalBsmtSF',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'BsmtFullBath', 'BsmtHalfBath',
    'GarageArea', 'GarageCars', 'GarageYrBlt',
]

# One loop replaces the 28 copy-pasted statements; each column is imputed
# independently, so the result is the same.
for col in mode_imputed_cols:
    # mode() returns a Series; [0] takes its first entry.
    data.loc[data[col].isnull(), col] = data[col].mode()[0]

In [14]:
# Columns that still contain missing values, with their missing-cell counts.
data.isnull().sum()[data.isnull().sum() > 0]

FireplaceQu    1420
dtype: int64

In [15]:
# FireplaceQu is still missing in 1420 rows, so the whole column is removed.
data.drop(columns=['FireplaceQu'], inplace=True)

Now, there are no missing values

In [16]:
# Total number of missing cells left in the combined frame.
data.isnull().sum().sum()

0

Correcting the datatypes of certain columns

In [17]:
# MSSubClass and MoSold are not in num_feats (they are handled as categorical),
# so they are stored as 'object'. The count columns, which became floats while
# holding NaN, and the year columns are stored as 64-bit integers.
dtype_fixes = {
    'MSSubClass': 'object',
    'MoSold': 'object',
    'BsmtFullBath': 'int64',
    'BsmtHalfBath': 'int64',
    'GarageCars': 'int64',
}
for col, dtype in dtype_fixes.items():
    data[col] = data[col].astype(dtype, copy=False)

data[years] = data[years].astype('int64', copy=False)

Grouping Categorical Data

In [18]:
# Categorical training columns placed side by side with the log-price target
# (SalePrice becomes the last column).
categorical_data = pd.concat([data.loc['train'][cat_feats], price], axis='columns')

In [19]:
# Show the categorical training columns together with the log-price target.
categorical_data

Unnamed: 0,MSSubClass,MSZoning,Street,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,CentralAir,Electrical,Functional,GarageType,GarageFinish,PavedDrive,MoSold,SaleType,SaleCondition,SalePrice
0,60,RL,Pave,Reg,Lvl,Inside,Gtl,CollgCr,Norm,Norm,...,Y,SBrkr,Typ,Attchd,RFn,Y,2,WD,Normal,12.247699
1,20,RL,Pave,Reg,Lvl,FR2,Gtl,Veenker,Feedr,Norm,...,Y,SBrkr,Typ,Attchd,RFn,Y,5,WD,Normal,12.109016
2,60,RL,Pave,IR1,Lvl,Inside,Gtl,CollgCr,Norm,Norm,...,Y,SBrkr,Typ,Attchd,RFn,Y,9,WD,Normal,12.317171
3,70,RL,Pave,IR1,Lvl,Corner,Gtl,Crawfor,Norm,Norm,...,Y,SBrkr,Typ,Detchd,Unf,Y,2,WD,Abnorml,11.849405
4,60,RL,Pave,IR1,Lvl,FR2,Gtl,NoRidge,Norm,Norm,...,Y,SBrkr,Typ,Attchd,RFn,Y,12,WD,Normal,12.429220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,Pave,Reg,Lvl,Inside,Gtl,Gilbert,Norm,Norm,...,Y,SBrkr,Typ,Attchd,RFn,Y,8,WD,Normal,12.072547
1456,20,RL,Pave,Reg,Lvl,Inside,Gtl,NWAmes,Norm,Norm,...,Y,SBrkr,Min1,Attchd,Unf,Y,2,WD,Normal,12.254868
1457,70,RL,Pave,Reg,Lvl,Inside,Gtl,Crawfor,Norm,Norm,...,Y,SBrkr,Typ,Attchd,RFn,Y,5,WD,Normal,12.493133
1458,20,RL,Pave,Reg,Lvl,Inside,Gtl,NAmes,Norm,Norm,...,Y,FuseA,Typ,Attchd,Unf,Y,4,WD,Normal,11.864469


In [20]:
# A category must cover at least 5% of the training rows to stand on its own.
low = 0.05 * data.loc['train'].shape[0]

for feat in cat_feats:
    # Categories ranked from highest to lowest mean log sale price.
    # 'SalePrice' is selected before aggregating so the mean is never taken
    # over the other, non-numeric columns (pandas >= 2.0 raises on that).
    order = (categorical_data.groupby(feat)['SalePrice'].mean()
             .sort_values(ascending=False).index.values.tolist())

    for i in range(len(order)):
        # Training rows currently carrying category order[i]. This is 0 when
        # the category has already been merged into another one.
        N = (categorical_data[feat] == order[i]).sum()
        j = i
        # Absorb the next categories in the price ranking until the group
        # reaches the minimum size.
        while N < low and N != 0:
            j += 1
            if j > len(order) - 1:
                # Ran off the end of the ranking: fold the tail into the
                # previous category instead.
                j = i - 1
                break
            N += (categorical_data[feat] == order[j]).sum()

        # Forward merge: relabel order[i:j] as order[j].
        # Tail merge (j < i): relabel order[i:] as order[i - 1].
        lim = len(order) if j < i else j
        for k in range(i, lim):
            categorical_data.replace({feat: {order[k]: order[j]}}, inplace=True)
            data.replace({feat: {order[k]: order[j]}}, inplace=True)

    # Categories present in the combined frame but absent from the training
    # rows (i.e. test-only values) are mapped onto a training category.
    seen = categorical_data[feat].unique()
    for value in data[feat].unique():
        if value in seen:
            continue
        try:
            # Numeric codes: pick the training category closest in value.
            # The absolute difference is required here; without it the
            # smallest category would always be chosen.
            target = seen[np.argmin(np.abs((seen - value).astype(float)))]
        except (TypeError, ValueError):
            # Non-numeric labels have no distance; fall back to the most
            # frequent training category.
            target = categorical_data[feat].mode()[0]
        data.replace({feat: {value: target}}, inplace=True)

In [21]:
# Column labels currently present in the combined frame.
data.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood',
       'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
       'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
       'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPo

In [22]:
# For every categorical feature (all columns of categorical_data except the
# last one, SalePrice):
#   * one remaining category  -> the column is dropped from `data`;
#   * two remaining categories -> encoded as 0/1, with 1 for the category with
#     the higher mean log sale price;
#   * three or more            -> converted to the 'category' dtype.
for feat in categorical_data.columns[:-1]:
    # Categories sorted by ascending mean log sale price. 'SalePrice' is
    # selected before aggregating so non-numeric columns are never averaged
    # (pandas >= 2.0 raises on that).
    uni = categorical_data.groupby(feat)['SalePrice'].mean().sort_values().index
    if len(uni) < 2:
        data.drop(feat, axis=1, inplace=True)
    elif len(uni) < 3:
        print("{}: {}".format(feat, uni))
        # Assign the result back: an in-place replace on the `data[feat]`
        # selection is not guaranteed to modify `data` itself.
        data[feat] = data[feat].replace({uni[0]: 0, uni[1]: 1}).astype('int8')
    else:
        data[feat] = data[feat].astype('category')
        

MSZoning: Index(['RM', 'RL'], dtype='object', name='MSZoning')
LotShape: Index(['Reg', 'IR1'], dtype='object', name='LotShape')
LandContour: Index(['Lvl', 'Low'], dtype='object', name='LandContour')
LandSlope: Index(['Gtl', 'Mod'], dtype='object', name='LandSlope')
Condition1: Index(['Feedr', 'Norm'], dtype='object', name='Condition1')
RoofStyle: Index(['Gable', 'Hip'], dtype='object', name='RoofStyle')
BsmtFinType2: Index(['Rec', 'Unf'], dtype='object', name='BsmtFinType2')
CentralAir: Index(['N', 'Y'], dtype='object', name='CentralAir')
Electrical: Index(['FuseA', 'SBrkr'], dtype='object', name='Electrical')
Functional: Index(['Min2', 'Typ'], dtype='object', name='Functional')
PavedDrive: Index(['N', 'Y'], dtype='object', name='PavedDrive')
SaleType: Index(['WD', 'New'], dtype='object', name='SaleType')


Changing Categorical Data to Numerical by One-Hot Encoding (get_dummies)

In [23]:
# One-hot encode the categorical columns of the combined frame; numeric
# columns are passed through unchanged.
final = pd.get_dummies(data)

In [24]:
# (rows, columns) of the encoded train + test frame.
final.shape

(2919, 128)

Creating final test and train datasets and rescaling them using Standardization

In [25]:
# Split the encoded frame back into its two parts via the outer index level
# created when train and test were concatenated.
X_train, X_test = final.loc['train'], final.loc['test']

# Target: the log-transformed sale price.
y_train = price

In [26]:
# Standardise both sets with the training-set statistics only, so no
# information from the test rows enters the scaling.
m = X_train.mean()
std = X_train.std()
# A column that is constant over the training rows has a standard deviation
# of 0, and dividing by it would produce NaN/inf features. Such columns are
# centred but left unscaled.
std = std.replace(0, 1)

X_train = (X_train - m) / std
X_test = (X_test - m) / std

### 2. Linear Regression

Training the model

In [27]:
# Create linear regression object (ordinary least squares, default settings)
LR = LinearRegression()

# Train the model using the training sets: standardised features against the
# log-transformed sale price.
LR.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

Checking the goodness of fit of the model on the training data

In [28]:
# R^2 computed on the same data the model was fitted on, so it does not
# measure performance on unseen houses.
LR.score(X_train,y_train)

0.9174721314964561

Saving the predicted values as "predicted.csv"

In [29]:
# The model was trained on log1p(SalePrice), so predictions are mapped back to
# prices with the exact inverse, expm1. Using np.exp here would make every
# predicted price too high by 1.
submit = pd.DataFrame({'Id': dtestID, 'SalePrice': np.expm1(LR.predict(X_test))})
submit.to_csv('predicted.csv', index=False)