In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [52]:
# Load the housing dataset from a CSV in the working directory and preview
# the first rows to check the columns (PRICE is the prediction target below).
df_housing = pd.read_csv('housing_data.csv')
df_housing.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [53]:
# Separate the target (PRICE) from the predictors (every other column).
y = df_housing['PRICE']
X = df_housing.drop(columns='PRICE')

In [54]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)

In [55]:
# (rows, columns) of the training predictors
X_train.shape

(354, 13)

In [56]:
# (rows, columns) of the held-out test predictors
X_test.shape

(152, 13)

In [2]:
from sklearn.linear_model import LinearRegression

In [58]:
# Baseline model: ordinary least squares on every predictor column.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [59]:
# Predict PRICE for the held-out test rows with the fitted model
y_pred_test = lr.predict(X_test)  # Get predictions

In [3]:
from sklearn.metrics import r2_score, mean_squared_error, \
mean_absolute_error, mean_absolute_percentage_error 

In [61]:
# Evaluate the fitted model on the held-out test split.
test_r2 = r2_score(y_test, y_pred_test)
test_mae = mean_absolute_error(y_test, y_pred_test)
# MAPE is returned as a fraction (e.g. 0.17), not as a percentage.
test_mape = mean_absolute_percentage_error(y_test, y_pred_test)
test_mse = mean_squared_error(y_test, y_pred_test)
test_rmse = np.sqrt(test_mse)  # back in the units of PRICE

# Report in a fixed order: R2, MSE, RMSE, MAE, MAPE.
print('Test metrics')
for metric_label, metric_value in (
    ('R2:', test_r2),
    ('MSE:', test_mse),
    ('RMSE:', test_rmse),
    ('MAE:', test_mae),
    ('MAPE:', test_mape),
):
    print(metric_label, metric_value)

Test metrics
R2: 0.7057919873264542
MSE: 29.79884430147881
RMSE: 5.458831770761837
MAE: 3.471947848270165
MAPE: 0.17090351588701452


In [62]:
# Predict PRICE for the training rows, to compare train vs. test performance
y_pred_train = lr.predict(X_train)  # Get predictions

In [63]:
# Compute performance on the training data (for comparison with the test metrics).
train_r2 = r2_score(y_train, y_pred_train)
train_mae = mean_absolute_error(y_train, y_pred_train)
# MAPE is returned as a fraction, not as a percentage.
train_mape = mean_absolute_percentage_error(y_train, y_pred_train)
train_mse = mean_squared_error(y_train, y_pred_train)
train_rmse = np.sqrt(train_mse)  # back in the units of PRICE

# Report in a fixed order: R2, MSE, RMSE, MAE, MAPE.
print('train metrics')
for metric_label, metric_value in (
    ('R2:', train_r2),
    ('MSE:', train_mse),
    ('RMSE:', train_rmse),
    ('MAE:', train_mae),
    ('MAPE:', train_mape),
):
    print(metric_label, metric_value)

train metrics
R2: 0.752890983596846
MSE: 19.067391155385046
RMSE: 4.366622396702633
MAE: 3.1273498053306574
MAPE: 0.1569169678601374


In [64]:
#### End

In [65]:
### DAY 3
# Baseline: linear regression on every predictor. R2 and RMSE for both splits
# are stored in `result_lr` so this model can be compared with the
# feature-selected models.

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_train, y_pred_test = lr.predict(X_train), lr.predict(X_test)

train_r2, test_r2 = r2_score(y_train, y_pred_train), r2_score(y_test, y_pred_test)
train_mse, test_mse = (
    mean_squared_error(y_train, y_pred_train),
    mean_squared_error(y_test, y_pred_test),
)
train_rmse, test_rmse = np.sqrt(train_mse), np.sqrt(test_mse)

print('Train R2 ', train_r2)
print('Test R2  ', test_r2)
print("train RMSE ", train_rmse)
print("test RMSE ", test_rmse)

# Row layout: [method label, train R2, test R2, train RMSE, test RMSE]
result_lr = ['LR Full model', train_r2, test_r2, train_rmse, test_rmse]
result_lr

Train R2  0.752890983596846
Test R2   0.7057919873264542
train RMSE  4.366622396702633
test RMSE  5.458831770761837


['LR Full model',
 0.752890983596846,
 0.7057919873264542,
 4.366622396702633,
 5.458831770761837]

In [66]:
# Fitted intercept (constant term) of the full linear model
lr.intercept_

33.11584094298593

In [67]:
# Fitted coefficients, one per predictor column, in the column order of X_train
lr.coef_

array([-7.69175693e-02,  3.95527497e-02, -6.09889801e-03,  2.63034959e+00,
       -1.31416178e+01,  3.84418005e+00, -1.20624925e-02, -1.37626293e+00,
        2.83925319e-01, -1.40640197e-02, -9.18566330e-01,  1.05783116e-02,
       -4.37960459e-01])

In [68]:
# Pair each predictor name with its fitted coefficient for easier reading.
# X and X_train share the same columns, so the positions line up with lr.coef_.
LR_coefs = pd.DataFrame({'Col Name': X.columns, 'Coef': lr.coef_})
LR_coefs

Unnamed: 0,Col Name,Coef
0,CRIM,-0.076918
1,ZN,0.039553
2,INDUS,-0.006099
3,CHAS,2.63035
4,NOX,-13.141618
5,RM,3.84418
6,AGE,-0.012062
7,DIS,-1.376263
8,RAD,0.283925
9,TAX,-0.014064



### Feature Selection

In [69]:
#!pip3 install mlxtend

In [4]:
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

In [71]:
# Forward sequential selection: start with no features and add one per step.
# k_features='best' keeps the best-scoring subset found across all subset
# sizes; verbose=2 logs the score after every step.
lr = LinearRegression()
lr_sfs = sfs(
    estimator=lr,
    k_features='best',
    forward=True,
    verbose=2,
)

sfs_forward = lr_sfs.fit(X_train, y_train)
forward_features = [*sfs_forward.k_feature_names_]  # tuple of names -> list
forward_features

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    0.0s finished

[2023-04-06 17:25:34] Features: 1/13 -- score: 0.5261603029632435[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.0s finished

[2023-04-06 17:25:34] Features: 2/13 -- score: 0.6241817903347533[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.0s finished

[2023-04-06 17:25:34] Features: 3/13 -- score: 0.6706654496693151[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  

['CRIM',
 'ZN',
 'CHAS',
 'NOX',
 'RM',
 'DIS',
 'RAD',
 'TAX',
 'PTRATIO',
 'B',
 'LSTAT']

In [50]:
# Backward sequential selection: start with all features and drop one per step.
# k_features='best' keeps the best-scoring subset found across all subset sizes.
lr = LinearRegression()
lr_sfs = sfs(
    estimator=lr,
    k_features='best',
    forward=False,
    verbose=2,
)

sfs_back = lr_sfs.fit(X_train, y_train)
backward_features = [*sfs_back.k_feature_names_]  # tuple of names -> list
backward_features

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    0.0s finished

[2023-04-05 14:40:38] Features: 12/1 -- score: 0.7203749681521989[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.0s finished

[2023-04-05 14:40:38] Features: 11/1 -- score: 0.7230451718082934[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.0s finished

[2023-04-05 14:40:39] Features: 10/1 -- score: 0.721346872112844[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   

['CRIM',
 'ZN',
 'CHAS',
 'NOX',
 'RM',
 'DIS',
 'RAD',
 'TAX',
 'PTRATIO',
 'B',
 'LSTAT']

In [23]:
# RFE: Recursive Feature Elimination

In [5]:
from sklearn.feature_selection import RFE

In [25]:
# Recursive Feature Elimination: refit repeatedly, dropping the weakest
# feature each round until 5 remain. A rank of 1 in `ranking_` marks a
# selected feature.
# NOTE(review): RFE ranks by the estimator's coefficients and the predictors
# are not scaled here, so the ranking depends on each column's units — confirm
# this is intended.
lr = LinearRegression()
lr_rfe = RFE(
    estimator=lr,
    n_features_to_select=5,
)
rfe_model = lr_rfe.fit(X_train, y_train)
rfe_model.ranking_

array([3, 5, 9, 1, 1, 1, 7, 1, 4, 6, 1, 8, 2])

In [26]:
# Tabulate the RFE rank of every predictor, then keep the names ranked 1
# (the features RFE selected).
rfe_rank = pd.DataFrame({'rank': rfe_model.ranking_, 'features': X_train.columns})
rfe_list = rfe_rank.loc[rfe_rank['rank'] == 1, 'features'].tolist()
rfe_list

['CHAS', 'NOX', 'RM', 'DIS', 'PTRATIO']

In [27]:
# Now run linear regression with selected columns

In [28]:
# Backward-selection model


In [29]:
# Refit linear regression using only the columns kept by backward selection,
# then score both splits on that same column subset.
lr = LinearRegression()
lr.fit(X_train[backward_features], y_train)

y_pred_train, y_pred_test = (
    lr.predict(X_train[backward_features]),
    lr.predict(X_test[backward_features]),
)

train_r2, test_r2 = r2_score(y_train, y_pred_train), r2_score(y_test, y_pred_test)
train_mse, test_mse = (
    mean_squared_error(y_train, y_pred_train),
    mean_squared_error(y_test, y_pred_test),
)
train_rmse, test_rmse = np.sqrt(train_mse), np.sqrt(test_mse)

print('Train R2 ', train_r2)
print('Test R2  ', test_r2)
print("train RMSE ", train_rmse)
print("test rmse ", test_rmse)

# Row layout: [method label, train R2, test R2, train RMSE, test RMSE]
result_back = ['Backward', train_r2, test_r2, train_rmse, test_rmse]
result_back

Train R2  0.752434513097503
Test R2   0.7081692908935964
train RMSE  4.370653643087017
test rmse  5.436732403284537


['Backward',
 0.752434513097503,
 0.7081692908935964,
 4.370653643087017,
 5.436732403284537]

In [None]:
# Forward model


In [31]:
# Refit linear regression using only the columns kept by forward selection,
# then score both splits on that same column subset.
lr = LinearRegression()
lr.fit(X_train[forward_features], y_train)

y_pred_train, y_pred_test = (
    lr.predict(X_train[forward_features]),
    lr.predict(X_test[forward_features]),
)

train_r2, test_r2 = r2_score(y_train, y_pred_train), r2_score(y_test, y_pred_test)
train_mse, test_mse = (
    mean_squared_error(y_train, y_pred_train),
    mean_squared_error(y_test, y_pred_test),
)
train_rmse, test_rmse = np.sqrt(train_mse), np.sqrt(test_mse)

print('Train R2 ', train_r2)
print('Test R2  ', test_r2)
print("train RMSE ", train_rmse)
print("test rmse ", test_rmse)

# Row layout: [method label, train R2, test R2, train RMSE, test RMSE]
result_forward = ['Forward', train_r2, test_r2, train_rmse, test_rmse]
result_forward

Train R2  0.752434513097503
Test R2   0.7081692908935964
train RMSE  4.370653643087017
test rmse  5.436732403284537


['Forward',
 0.752434513097503,
 0.7081692908935964,
 4.370653643087017,
 5.436732403284537]

In [None]:
# RFE 

In [32]:
# Refit linear regression using only the 5 columns selected by RFE,
# then score both splits on that same column subset.
lr = LinearRegression()
lr.fit(X_train[rfe_list], y_train)

y_pred_train, y_pred_test = (
    lr.predict(X_train[rfe_list]),
    lr.predict(X_test[rfe_list]),
)

train_r2, test_r2 = r2_score(y_train, y_pred_train), r2_score(y_test, y_pred_test)
train_mse, test_mse = (
    mean_squared_error(y_train, y_pred_train),
    mean_squared_error(y_test, y_pred_test),
)
train_rmse, test_rmse = np.sqrt(train_mse), np.sqrt(test_mse)

print('Train R2 ', train_r2)
print('Test R2  ', test_r2)
print("train RMSE ", train_rmse)
print("test rmse ", test_rmse)

# Row layout: [method label, train R2, test R2, train RMSE, test RMSE]
result_rfe = ['RFE', train_r2, test_r2, train_rmse, test_rmse]
result_rfe

Train R2  0.6507796725615075
Test R2   0.6003438527836649
train RMSE  5.191000844769799
test rmse  6.362322963894048


['RFE',
 0.6507796725615075,
 0.6003438527836649,
 5.191000844769799,
 6.362322963894048]

In [33]:
# Gather the per-method results into one comparison table.
# The frame is built in a single constructor call rather than grown row by row
# with `.loc[len(df)]`, so column dtypes are inferred once from all rows and
# re-running the cell always yields the same four rows.
# Each result_* list is [method label, train R2, test R2, train RMSE, test RMSE].
results_df = pd.DataFrame(
    [result_lr, result_back, result_forward, result_rfe],
    columns=['Method', 'Train R2', 'Test R2', 'Train RMSE', 'Test RMSE'],
)
results_df

Unnamed: 0,Method,Train R2,Test R2,Train RMSE,Test RMSE
0,LR Full model,0.752891,0.705792,4.366622,5.458832
1,Backward,0.752435,0.708169,4.370654,5.436732
2,Forward,0.752435,0.708169,4.370654,5.436732
3,RFE,0.65078,0.600344,5.191001,6.362323


In [35]:
# Rank the methods by test-set R2, best first (displays a sorted copy;
# results_df itself is not reordered).
results_df.sort_values('Test R2', ascending=False)

Unnamed: 0,Method,Train R2,Test R2,Train RMSE,Test RMSE
1,Backward,0.752435,0.708169,4.370654,5.436732
2,Forward,0.752435,0.708169,4.370654,5.436732
0,LR Full model,0.752891,0.705792,4.366622,5.458832
3,RFE,0.65078,0.600344,5.191001,6.362323


In [None]:
# To understand how "K - Fold" data split works
# (to understand internal working of K-fold cross validation)

In [6]:
from sklearn.model_selection import KFold

In [38]:
# Ten dummy values (30..39) to show which positions each fold holds out.
temp_data = list(range(30, 40))
kf = KFold(n_splits=5)
# Each item is (train positions, validation positions) for one fold.
[*kf.split(temp_data)]

[(array([2, 3, 4, 5, 6, 7, 8, 9]), array([0, 1])),
 (array([0, 1, 4, 5, 6, 7, 8, 9]), array([2, 3])),
 (array([0, 1, 2, 3, 6, 7, 8, 9]), array([4, 5])),
 (array([0, 1, 2, 3, 4, 5, 8, 9]), array([6, 7])),
 (array([0, 1, 2, 3, 4, 5, 6, 7]), array([8, 9]))]

In [None]:
# Perform k-fold cross-validation by coding the loop manually

In [39]:
# 5-fold splitter with default settings (no shuffling), used by the manual
# cross-validation loop in the next cell.
kf = KFold(n_splits=5)


In [40]:
# Manual 5-fold cross-validation on the training split only (the real test
# split is never touched here).
# Note: in the code below, the '*_test_k' variables hold the validation fold.
for train_index, test_index in kf.split(X_train, y_train):   # one iteration per fold
    X_train_k = X_train.iloc[train_index]  # positional rows for fitting (4 of 5 folds, ~80% of X_train)
    X_test_k  = X_train.iloc[test_index]   # positional rows of the held-out validation fold (~20% of X_train)
    
    y_train_k = y_train.iloc[train_index]  # targets matching X_train_k
    y_test_k  = y_train.iloc[test_index]   # targets matching X_test_k
    
    # Fit a fresh model for each fold so nothing carries over between folds
    lr_k = LinearRegression()
    lr_k.fit(X_train_k, y_train_k)
    y_pred_k = lr_k.predict(X_test_k)
    r2_score_K = r2_score(y_test_k, y_pred_k)  # R2 on the validation fold
    print('R2 score ', r2_score_K )



R2 score  0.626395963002877
R2 score  0.726944647311314
R2 score  0.8244938890651446
R2 score  0.7328369559586241
R2 score  0.6687112545444382


In [41]:
# Use cross_val_score to perform the same 5-fold cross-validation in one call

from sklearn.model_selection import cross_val_score

# Pass a fresh, unfitted estimator instead of reusing the global `lr`:
# at this point `lr` is whatever an earlier cell left behind (last fitted on
# the RFE column subset), so depending on it hides state.
# cross_val_score fits a clone of the estimator on each fold. With an integer
# cv and a regressor it uses unshuffled K-fold splits, which is why these
# scores match the manual loop.
scores = cross_val_score(LinearRegression(), X_train, y_train, cv=5, scoring='r2')
print("Cross-validated scores:", scores )
print("Average score:" , np.average(scores))

Cross-validated scores: [0.62639596 0.72694465 0.82449389 0.73283696 0.66871125]
Average score: 0.7158765419764797


In [44]:
# Refit on the whole training split and report R2 on the untouched test split,
# for comparison with the cross-validated average.
# The estimator is created here explicitly, as in the other modelling cells,
# so this cell does not depend on whichever `lr` an earlier cell left behind.
lr = LinearRegression()
lr.fit(X_train, y_train)


lr.score(X_test, y_test)  # R2 for a regressor

0.7057919873264542

##### LOOCV

In [7]:
from sklearn.model_selection import LeaveOneOut

In [47]:
# Splitter that holds out exactly one sample per split (as many splits as rows)
loocv = LeaveOneOut()

In [48]:
# Leave-one-out cross-validation on the training split: one model per row,
# each fitted on all the other rows and evaluated on the single held-out row.
rmse_loocv = []

for train_index, test_index in loocv.split(X_train, y_train):
    X_train_k, y_train_k = X_train.iloc[train_index], y_train.iloc[train_index]
    X_test_k, y_test_k = X_train.iloc[test_index], y_train.iloc[test_index]

    lr_k = LinearRegression().fit(X_train_k, y_train_k)
    y_pred_k = lr_k.predict(X_test_k)

    # With a single held-out row, this "RMSE" is the absolute error of that
    # one prediction.
    rmse_K = np.sqrt(mean_squared_error(y_test_k, y_pred_k))
    rmse_loocv.append(rmse_K)

print(rmse_loocv)



[1.6807157524205678, 2.1388933273786606, 4.2646917399388915, 5.769632824475771, 1.0694538297129519, 5.953962797805428, 3.8615356452914646, 0.17650620698807273, 1.5826031308405781, 0.09211104521802582, 2.068571796811181, 3.659133414221092, 9.972600831049064, 3.588380780530425, 7.236475115030018, 2.252380191718043, 4.349711157190285, 1.0718007208568032, 0.7356827961499555, 5.226729013355126, 2.8608281993711238, 1.2484236020445394, 0.11020983085386149, 1.8328583635786373, 3.796355128847054, 2.114669746514455, 2.4063009342821573, 3.2081439977312485, 6.897209677229615, 0.4807499197400489, 1.6945131301235996, 6.109262100356791, 1.9752748694754239, 0.37818619404527, 0.8757945931065727, 3.7058174069928107, 4.744185439836862, 2.4071045294769604, 2.3928821006306222, 3.085788535878315, 12.172209209997707, 2.435455545156586, 0.8979615874352795, 2.556687782355789, 4.152164600015725, 7.4981051168159, 18.59157311488472, 2.4172134641901195, 2.7253876366459835, 4.481303140984469, 0.4070345226951595, 4.