In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import sklearn.linear_model as lm_model
from sklearn import metrics
from sklearn.model_selection  import train_test_split
from sklearn.model_selection  import cross_val_score
from sklearn.model_selection  import KFold
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Load the house-sale data from the CSV file into a DataFrame
House_Sale = pd.read_csv(filepath_or_buffer="..../ASS05_Data.csv")

In [3]:
# View the dataset: the bare expression makes the notebook render the DataFrame
House_Sale

Unnamed: 0,LotArea,TotalBsmtSF,GarageCars,SalePrice,AGE,TotalArea
0,12108,1440,0,118000,53,1440
1,9400,945,0,127500,31,980
2,1680,672,0,91500,35,1218
3,8430,1040,0,124000,31,1040
4,7500,1080,0,141000,4,1080
...,...,...,...,...,...,...
1455,8100,849,2,106000,111,2138
1456,9142,1020,1,137000,96,1928
1457,45600,907,2,240000,100,2358
1458,10320,756,1,135000,102,1469


In [4]:
House_Sale.shape  # (number of rows, number of columns) of the DataFrame

(1460, 6)

In [5]:
House_Sale.info() # Print column names, non-null counts, dtypes and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 6 columns):
LotArea        1460 non-null int64
TotalBsmtSF    1460 non-null int64
GarageCars     1460 non-null int64
SalePrice      1460 non-null int64
AGE            1460 non-null int64
TotalArea      1460 non-null int64
dtypes: int64(6)
memory usage: 68.6 KB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
House_Sale.describe()

Unnamed: 0,LotArea,TotalBsmtSF,GarageCars,SalePrice,AGE,TotalArea
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,10516.828082,1057.429452,1.767123,180921.19589,36.547945,1509.619178
std,9981.264932,438.705324,0.747315,79442.502883,30.250152,521.163523
min,1300.0,0.0,0.0,34900.0,0.0,334.0
25%,7553.5,795.75,1.0,129975.0,8.0,1123.75
50%,9478.5,991.5,2.0,163000.0,35.0,1458.0
75%,11601.5,1298.25,2.0,214000.0,54.0,1775.25
max,215245.0,6110.0,4.0,755000.0,136.0,5642.0


In [7]:
# Reorder the columns so that the response (SalePrice) is the last column;
# later cells rely on "last column = target" when slicing with iloc.
House_Sale = House_Sale[
    [
        'LotArea',
        'TotalBsmtSF',
        'GarageCars',
        'AGE',
        'TotalArea',
        'SalePrice',
    ]
]

In [8]:
# Rename the response column from SalePrice to TrueTarget (predictor names unchanged).
# A flat list is required here: assigning a nested list ([[...]]) makes pandas build a
# one-level MultiIndex, so every column label would become a tuple instead of a string.
House_Sale.columns = ['LotArea', 'TotalBsmtSF', 'GarageCars', 'AGE', 'TotalArea', 'TrueTarget']

In [9]:
# Display the frame again to check the new column order and the renamed target column
House_Sale

Unnamed: 0,LotArea,TotalBsmtSF,GarageCars,AGE,TotalArea,TrueTarget
0,12108,1440,0,53,1440,118000
1,9400,945,0,31,980,127500
2,1680,672,0,35,1218,91500
3,8430,1040,0,31,1040,124000
4,7500,1080,0,4,1080,141000
...,...,...,...,...,...,...
1455,8100,849,2,111,2138,106000
1456,9142,1020,1,96,1928,137000
1457,45600,907,2,100,2358,240000
1458,10320,756,1,102,1469,135000


# Modelling Assignment Part I: The Validation Set Approach

In [10]:
# Split data: 70% of the rows for training, the remaining 30% for validation.
# The seed makes this particular split reproducible.
train_House_Sale = House_Sale.sample(frac = 0.7, random_state = 1)
# The validation set is exactly the rows whose index was not drawn for training
test_House_Sale = House_Sale.drop(index = train_House_Sale.index)

# Divide train and test data into X (all predictors) and y (last column = target)
X_train_House_Sale = pd.DataFrame(train_House_Sale.iloc[:,:-1])
y_train_House_Sale = pd.DataFrame(train_House_Sale.iloc[:,-1])
X_test_House_Sale = pd.DataFrame(test_House_Sale.iloc[:,:-1])
y_test_House_Sale = pd.DataFrame(test_House_Sale.iloc[:,-1])

print("Split I")
print()

# Linear Regression Model on Data Split I
# NOTE(review): sm.OLS does not add an intercept by itself and no constant column is
# added here, so the summary reports the *uncentered* R-squared, while the sklearn
# models below do fit an intercept — confirm this difference is intended.
Linear_model = sm.OLS(y_train_House_Sale, X_train_House_Sale).fit()
print(Linear_model.summary())
print()
print()

# R-squared of the OLS fit, expressed as a percentage
print("Accuracy of Model = ", Linear_model.rsquared * 100)
print()

print("MSE of Training Model = ", metrics.mean_squared_error(y_train_House_Sale,Linear_model.predict(X_train_House_Sale)))
print("ASE of Validation Set = ", metrics.mean_squared_error(y_test_House_Sale, Linear_model.predict(X_test_House_Sale)))
print()

# Quadratic Model on Data Split I
# The feature expansion is fitted on the training data only and then applied to the
# validation data with transform(); each degree gets its own estimator so that
# Quadratic_model and Cubic_model do not end up being the same refitted object.
poly = PolynomialFeatures(degree=2)
X_train_House_Sale2 = poly.fit_transform(X_train_House_Sale)
X_test_House_Sale2 = poly.transform(X_test_House_Sale)
Quadratic_model = lm_model.LinearRegression().fit(X_train_House_Sale2, y_train_House_Sale)
print("MSE of Quadratic Model = ", metrics.mean_squared_error(y_test_House_Sale, Quadratic_model.predict(X_test_House_Sale2)))

# Cubic Model on Data Split I
poly = PolynomialFeatures(degree=3)
X_train_House_Sale3 = poly.fit_transform(X_train_House_Sale)
X_test_House_Sale3 = poly.transform(X_test_House_Sale)
Cubic_model = lm_model.LinearRegression().fit(X_train_House_Sale3, y_train_House_Sale)
print("MSE of Cubic Model = ", metrics.mean_squared_error(y_test_House_Sale, Cubic_model.predict(X_test_House_Sale3)))

Split I

                                 OLS Regression Results                                
Dep. Variable:             TrueTarget   R-squared (uncentered):                   0.964
Model:                            OLS   Adj. R-squared (uncentered):              0.964
Method:                 Least Squares   F-statistic:                              5518.
Date:                Wed, 11 Mar 2020   Prob (F-statistic):                        0.00
Time:                        08:44:30   Log-Likelihood:                         -12199.
No. Observations:                1022   AIC:                                  2.441e+04
Df Residuals:                    1017   BIC:                                  2.443e+04
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------

In [11]:
# Split data: 70% of the rows for training, the remaining 30% for validation.
# The seed makes this particular split reproducible.
train_House_Sale = House_Sale.sample(frac = 0.7, random_state = 2)
# The validation set is exactly the rows whose index was not drawn for training
test_House_Sale = House_Sale.drop(index = train_House_Sale.index)

# Divide train and test data into X (all predictors) and y (last column = target)
X_train_House_Sale = pd.DataFrame(train_House_Sale.iloc[:,:-1])
y_train_House_Sale = pd.DataFrame(train_House_Sale.iloc[:,-1])
# This must rebind X_test_House_Sale: assigning to a misspelled name would leave the
# previous split's validation features in place and pair them with this split's targets.
X_test_House_Sale = pd.DataFrame(test_House_Sale.iloc[:,:-1])
y_test_House_Sale = pd.DataFrame(test_House_Sale.iloc[:,-1])

print("Split II")
print()

# Linear Regression Model on Data Split II
# NOTE(review): sm.OLS does not add an intercept by itself and no constant column is
# added here, so the summary reports the *uncentered* R-squared, while the sklearn
# models below do fit an intercept — confirm this difference is intended.
Linear_model = sm.OLS(y_train_House_Sale, X_train_House_Sale).fit()
print(Linear_model.summary())
print()
print()

# R-squared of the OLS fit, expressed as a percentage
print("Accuracy of Model = ", Linear_model.rsquared * 100)
print()

print("MSE of Training Model = ", metrics.mean_squared_error(y_train_House_Sale,Linear_model.predict(X_train_House_Sale)))
print("ASE of Validation Set = ", metrics.mean_squared_error(y_test_House_Sale, Linear_model.predict(X_test_House_Sale)))
print()

# Quadratic Model on Data Split II
# The feature expansion is fitted on the training data only and then applied to the
# validation data with transform(); each degree gets its own estimator so that
# Quadratic_model and Cubic_model do not end up being the same refitted object.
poly = PolynomialFeatures(degree=2)
X_train_House_Sale2 = poly.fit_transform(X_train_House_Sale)
X_test_House_Sale2 = poly.transform(X_test_House_Sale)
Quadratic_model = lm_model.LinearRegression().fit(X_train_House_Sale2, y_train_House_Sale)
print("MSE of Quadratic Model = ", metrics.mean_squared_error(y_test_House_Sale, Quadratic_model.predict(X_test_House_Sale2)))

# Cubic Model on Data Split II
poly = PolynomialFeatures(degree=3)
X_train_House_Sale3 = poly.fit_transform(X_train_House_Sale)
X_test_House_Sale3 = poly.transform(X_test_House_Sale)
Cubic_model = lm_model.LinearRegression().fit(X_train_House_Sale3, y_train_House_Sale)
print("MSE of Cubic Model = ", metrics.mean_squared_error(y_test_House_Sale, Cubic_model.predict(X_test_House_Sale3)))

Split II

                                 OLS Regression Results                                
Dep. Variable:             TrueTarget   R-squared (uncentered):                   0.960
Model:                            OLS   Adj. R-squared (uncentered):              0.960
Method:                 Least Squares   F-statistic:                              4886.
Date:                Wed, 11 Mar 2020   Prob (F-statistic):                        0.00
Time:                        08:44:34   Log-Likelihood:                         -12273.
No. Observations:                1022   AIC:                                  2.456e+04
Df Residuals:                    1017   BIC:                                  2.458e+04
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
------------------------------

In [12]:
# Split data: 70% of the rows for training, the remaining 30% for validation.
# The seed makes this particular split reproducible.
train_House_Sale = House_Sale.sample(frac = 0.7, random_state = 3)
# The validation set is exactly the rows whose index was not drawn for training
test_House_Sale = House_Sale.drop(index = train_House_Sale.index)

# Divide train and test data into X (all predictors) and y (last column = target)
X_train_House_Sale = pd.DataFrame(train_House_Sale.iloc[:,:-1])
y_train_House_Sale = pd.DataFrame(train_House_Sale.iloc[:,-1])
X_test_House_Sale = pd.DataFrame(test_House_Sale.iloc[:,:-1])
y_test_House_Sale = pd.DataFrame(test_House_Sale.iloc[:,-1])

print("Split III")
print()

# Linear Regression Model on Data Split III
# NOTE(review): sm.OLS does not add an intercept by itself and no constant column is
# added here, so the summary reports the *uncentered* R-squared, while the sklearn
# models below do fit an intercept — confirm this difference is intended.
Linear_model = sm.OLS(y_train_House_Sale, X_train_House_Sale).fit()
print(Linear_model.summary())
print()
print()

# R-squared of the OLS fit, expressed as a percentage
print("Accuracy of Model = ", Linear_model.rsquared * 100)
print()

print("MSE of Training Model = ", metrics.mean_squared_error(y_train_House_Sale,Linear_model.predict(X_train_House_Sale)))
print("ASE of Validation Set = ", metrics.mean_squared_error(y_test_House_Sale, Linear_model.predict(X_test_House_Sale)))
print()

# Quadratic Model on Data Split III
# The feature expansion is fitted on the training data only and then applied to the
# validation data with transform(); each degree gets its own estimator so that
# Quadratic_model and Cubic_model do not end up being the same refitted object.
poly = PolynomialFeatures(degree=2)
X_train_House_Sale2 = poly.fit_transform(X_train_House_Sale)
X_test_House_Sale2 = poly.transform(X_test_House_Sale)
Quadratic_model = lm_model.LinearRegression().fit(X_train_House_Sale2, y_train_House_Sale)
print("MSE of Quadratic Model = ", metrics.mean_squared_error(y_test_House_Sale, Quadratic_model.predict(X_test_House_Sale2)))

# Cubic Model on Data Split III
poly = PolynomialFeatures(degree=3)
X_train_House_Sale3 = poly.fit_transform(X_train_House_Sale)
X_test_House_Sale3 = poly.transform(X_test_House_Sale)
Cubic_model = lm_model.LinearRegression().fit(X_train_House_Sale3, y_train_House_Sale)
print("MSE of Cubic Model = ", metrics.mean_squared_error(y_test_House_Sale, Cubic_model.predict(X_test_House_Sale3)))

Split III

                                 OLS Regression Results                                
Dep. Variable:             TrueTarget   R-squared (uncentered):                   0.953
Model:                            OLS   Adj. R-squared (uncentered):              0.953
Method:                 Least Squares   F-statistic:                              4140.
Date:                Wed, 11 Mar 2020   Prob (F-statistic):                        0.00
Time:                        08:44:38   Log-Likelihood:                         -12331.
No. Observations:                1022   AIC:                                  2.467e+04
Df Residuals:                    1017   BIC:                                  2.470e+04
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------

In [13]:
# Split data: 70% of the rows for training, the remaining 30% for validation.
# The seed makes this particular split reproducible.
train_House_Sale = House_Sale.sample(frac = 0.7, random_state = 4)
# The validation set is exactly the rows whose index was not drawn for training
test_House_Sale = House_Sale.drop(index = train_House_Sale.index)

# Divide train and test data into X (all predictors) and y (last column = target)
X_train_House_Sale = pd.DataFrame(train_House_Sale.iloc[:,:-1])
y_train_House_Sale = pd.DataFrame(train_House_Sale.iloc[:,-1])
X_test_House_Sale = pd.DataFrame(test_House_Sale.iloc[:,:-1])
y_test_House_Sale = pd.DataFrame(test_House_Sale.iloc[:,-1])

print("Split IV")
print()

# Linear Regression Model on Data Split IV
# NOTE(review): sm.OLS does not add an intercept by itself and no constant column is
# added here, so the summary reports the *uncentered* R-squared, while the sklearn
# models below do fit an intercept — confirm this difference is intended.
Linear_model = sm.OLS(y_train_House_Sale, X_train_House_Sale).fit()
print(Linear_model.summary())
print()
print()

# R-squared of the OLS fit, expressed as a percentage
print("Accuracy of Model = ", Linear_model.rsquared * 100)
print()

print("MSE of Training Model = ", metrics.mean_squared_error(y_train_House_Sale,Linear_model.predict(X_train_House_Sale)))
print("ASE of Validation Set = ", metrics.mean_squared_error(y_test_House_Sale, Linear_model.predict(X_test_House_Sale)))
print()

# Quadratic Model on Data Split IV
# The feature expansion is fitted on the training data only and then applied to the
# validation data with transform(); each degree gets its own estimator so that
# Quadratic_model and Cubic_model do not end up being the same refitted object.
poly = PolynomialFeatures(degree=2)
X_train_House_Sale2 = poly.fit_transform(X_train_House_Sale)
X_test_House_Sale2 = poly.transform(X_test_House_Sale)
Quadratic_model = lm_model.LinearRegression().fit(X_train_House_Sale2, y_train_House_Sale)
print("MSE of Quadratic Model = ", metrics.mean_squared_error(y_test_House_Sale, Quadratic_model.predict(X_test_House_Sale2)))

# Cubic Model on Data Split IV
poly = PolynomialFeatures(degree=3)
X_train_House_Sale3 = poly.fit_transform(X_train_House_Sale)
X_test_House_Sale3 = poly.transform(X_test_House_Sale)
Cubic_model = lm_model.LinearRegression().fit(X_train_House_Sale3, y_train_House_Sale)
print("MSE of Cubic Model = ", metrics.mean_squared_error(y_test_House_Sale, Cubic_model.predict(X_test_House_Sale3)))

Split IV

                                 OLS Regression Results                                
Dep. Variable:             TrueTarget   R-squared (uncentered):                   0.951
Model:                            OLS   Adj. R-squared (uncentered):              0.951
Method:                 Least Squares   F-statistic:                              3944.
Date:                Wed, 11 Mar 2020   Prob (F-statistic):                        0.00
Time:                        08:44:42   Log-Likelihood:                         -12365.
No. Observations:                1022   AIC:                                  2.474e+04
Df Residuals:                    1017   BIC:                                  2.477e+04
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
------------------------------

In [14]:
# Split data: 70% of the rows for training, the remaining 30% for validation.
# The seed makes this particular split reproducible.
train_House_Sale = House_Sale.sample(frac = 0.7, random_state = 5)
# The validation set is exactly the rows whose index was not drawn for training
test_House_Sale = House_Sale.drop(index = train_House_Sale.index)

# Divide train and test data into X (all predictors) and y (last column = target)
X_train_House_Sale = pd.DataFrame(train_House_Sale.iloc[:,:-1])
y_train_House_Sale = pd.DataFrame(train_House_Sale.iloc[:,-1])
X_test_House_Sale = pd.DataFrame(test_House_Sale.iloc[:,:-1])
y_test_House_Sale = pd.DataFrame(test_House_Sale.iloc[:,-1])

print("Split V")
print()

# Linear Regression Model on Data Split V
# NOTE(review): sm.OLS does not add an intercept by itself and no constant column is
# added here, so the summary reports the *uncentered* R-squared, while the sklearn
# models below do fit an intercept — confirm this difference is intended.
Linear_model = sm.OLS(y_train_House_Sale, X_train_House_Sale).fit()
print(Linear_model.summary())
print()
print()

# R-squared of the OLS fit, expressed as a percentage
print("Accuracy of Model = ", Linear_model.rsquared * 100)
print()

print("MSE of Training Model = ", metrics.mean_squared_error(y_train_House_Sale,Linear_model.predict(X_train_House_Sale)))
print("ASE of Validation Set = ", metrics.mean_squared_error(y_test_House_Sale, Linear_model.predict(X_test_House_Sale)))
print()

# Quadratic Model on Data Split V
# The feature expansion is fitted on the training data only and then applied to the
# validation data with transform(); each degree gets its own estimator so that
# Quadratic_model and Cubic_model do not end up being the same refitted object.
poly = PolynomialFeatures(degree=2)
X_train_House_Sale2 = poly.fit_transform(X_train_House_Sale)
X_test_House_Sale2 = poly.transform(X_test_House_Sale)
Quadratic_model = lm_model.LinearRegression().fit(X_train_House_Sale2, y_train_House_Sale)
print("MSE of Quadratic Model = ", metrics.mean_squared_error(y_test_House_Sale, Quadratic_model.predict(X_test_House_Sale2)))

# Cubic Model on Data Split V
poly = PolynomialFeatures(degree=3)
X_train_House_Sale3 = poly.fit_transform(X_train_House_Sale)
X_test_House_Sale3 = poly.transform(X_test_House_Sale)
Cubic_model = lm_model.LinearRegression().fit(X_train_House_Sale3, y_train_House_Sale)
print("MSE of Cubic Model = ", metrics.mean_squared_error(y_test_House_Sale, Cubic_model.predict(X_test_House_Sale3)))

Split V

                                 OLS Regression Results                                
Dep. Variable:             TrueTarget   R-squared (uncentered):                   0.950
Model:                            OLS   Adj. R-squared (uncentered):              0.950
Method:                 Least Squares   F-statistic:                              3873.
Date:                Wed, 11 Mar 2020   Prob (F-statistic):                        0.00
Time:                        08:44:45   Log-Likelihood:                         -12373.
No. Observations:                1022   AIC:                                  2.476e+04
Df Residuals:                    1017   BIC:                                  2.478e+04
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------

# Modelling Assignment Part II: 5-Fold Cross Validation

In [15]:
# Full-data response (last column) and predictors (every column before it),
# each kept as a DataFrame for the cross-validation cells below.
y_House_Sale = pd.DataFrame(House_Sale.iloc[:, -1])
X_House_Sale = pd.DataFrame(House_Sale.iloc[:, :-1])

In [16]:
# 5-fold cross-validation splitter: five consecutive folds, rows are not shuffled.
# NOTE(review): in the cells shown, Linear_kf is only used to print the fold indices;
# the cross_val_score calls pass cv=5 rather than this object — confirm that is intended.
Linear_kf = KFold(n_splits = 5,shuffle=False)

In [17]:
# Show the row positions of each of the 5 folds:
# four parts form the training indices, the remaining part the test indices.
for fold_indices in Linear_kf.split(X_House_Sale):
    train_set, test_set = fold_indices
    print(train_set, test_set)

[ 292  293  294 ... 1457 1458 1459] [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 23

In [18]:
# Train and test data in 80:20 ratio.
# random_state pins the shuffle so the split, and every score computed from it below,
# is identical on each Restart & Run All; without it each run yields different numbers.
X_train, X_test, y_train, y_test = train_test_split(
    X_House_Sale, y_House_Sale, test_size=0.2, random_state=1
)

In [19]:
# Create an ordinary least-squares regressor and fit it on the 80% training split.
# The fit call is the last expression so the notebook shows the fitted estimator.
LinearModel = lm_model.LinearRegression()
LinearModel.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [28]:
# 5-fold cross-validated MSE on the training split.
# The scorer returns *negated* MSE per fold, so MSE_train holds negative values
# and the sign is flipped only when printing.
MSE_train = cross_val_score(
    estimator=LinearModel,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
print("MSE for Training Data = ", -MSE_train)

MSE for Training Data =  [1717764616.177595 3028280583.978971 1898719949.220005 1842847614.143595
 1680097332.988712]


In [21]:
# Mean of the per-fold training MSE values (sign flipped because MSE_train is negated MSE)
print("Average MSE of Training Data = ", -np.mean(MSE_train))

Average MSE of Training Data =  2033542019.3017757


In [22]:
# 5-fold cross-validated R^2 on the training split, one value per fold, as a percentage
Rsquare_train = 100 * cross_val_score(
    estimator=LinearModel,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=5,
)
print("R^2 for Training Data = ", Rsquare_train)

R^2 for Training Data =  [72.8767979  51.64906636 70.04006121 68.95361301 76.67276422]


In [27]:
# 5-fold cross-validated (negated) MSE computed on the 20% hold-out split.
# NOTE(review): cross_val_score fits fresh copies of the estimator on folds of
# X_test/y_test, so this does not score the LinearModel that was fitted on X_train —
# confirm this is the intended "validation set" measure.
ASE_test = cross_val_score(
    estimator=LinearModel,
    X=X_test,
    y=y_test,
    scoring='neg_mean_squared_error',
    cv=5,
)
print("ASE for Validation Set = ", -ASE_test)

ASE for Validation Set =  [986270899.927222 690382923.504262 2213190858.045462 1004849684.505162
 706656521.663982]


In [24]:
# Mean of the per-fold hold-out ASE values (sign flipped because ASE_test is negated MSE)
print("Average ASE of Validation Set = ", -np.mean(ASE_test))

Average ASE of Validation Set =  1120270177.529218


In [25]:
# 5-fold cross-validated R^2 on the 20% hold-out split, one value per fold, as a percentage
Rsquare_test = 100 * cross_val_score(
    estimator=LinearModel,
    X=X_test,
    y=y_test,
    scoring='r2',
    cv=5,
)
print("R^2 for Testing Data = ", Rsquare_test)

R^2 for Testing Data =  [81.3209915  86.44093389 77.43077877 81.87174515 78.15202876]


In [None]:
#print("ASE of Training Model = ", np.sum(np.square(np.subtract(train_House_Sale.iloc[:,-1],Linear_model.predict(X_train_House_Sale))))/len(train_House_Sale.iloc[:,-1]))
#print("ASE of Validation Set = ", np.sum(np.square(np.subtract(test_House_Sale.iloc[:,-1],Linear_model.predict(X_test_House_Sale))))/len(test_House_Sale.iloc[:,-1]))
# MSE and ASE values are same for training or testing data

In [46]:
#MSE_train1 = cross_val_score(LinearModel, y_train, LinearModel.predict(X_train), cv=5, scoring='neg_mean_squared_error')
#print("MSE for Training Data = ", -MSE_train1)
#np.set_printoptions(formatter={'float_kind':'{:f}'.format})

MSE for Training Data =  [1026406955.979148 1009176738.126014 1200677612.855372 958986355.713669
 2310247293.670292]
