## Importing libaries
---

Importing the libraries needed for this project.

In [1]:
# import libraries to be used in this notebook
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression,Lasso,LassoCV,Ridge,RidgeCV
from sklearn import metrics
from sklearn.model_selection import train_test_split,cross_val_score,KFold
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

# allow plots to appear directly in the notebook
%matplotlib inline

  import pandas.util.testing as tm


## Load the cleaned data

---

Reading the clean_train.csv and clean_test.csv data from datasets path.

In [2]:
#Read the training data
clean_train = pd.read_csv('../datasets/clean_train.csv')
clean_train.head()

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,...,sale_type_Oth,sale_type_WD,exterior_1st_PreCast,exterior_2nd_Other,exterior_2nd_PreCast,heating_GasA,mas_vnr_type_CBlock,roof_matl_Metal,roof_matl_Roll,sale_type_VWD
0,109,533352170,60,0,13517,1,3,3,6,8,...,0,1,0,0,0,0,0,0,0,0
1,544,531379050,60,43,11492,1,3,3,7,5,...,0,1,0,0,0,0,0,0,0,0
2,153,535304180,20,68,7922,4,3,3,5,7,...,0,1,0,0,0,0,0,0,0,0
3,318,916386060,60,73,9802,4,3,3,5,5,...,0,1,0,0,0,0,0,0,0,0
4,255,906425045,50,82,14235,1,3,3,6,8,...,0,1,0,0,0,0,0,0,0,0


In [3]:
#Read the testing data
clean_test = pd.read_csv('../datasets/clean_test.csv')
clean_test.head()

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,...,exterior_1st_CBlock,exterior_1st_ImStucc,exterior_1st_Stone,exterior_2nd_Stone,heating_OthW,heating_Wall,ms_zoning_C (all),neighborhood_GrnHill,neighborhood_Landmrk,roof_matl_Membran
0,2658,902301120,190,69,9142,4,3,3,6,8,...,0,0,0,0,0,0,0,0,0,0
1,2718,905108090,90,0,9662,1,3,3,5,4,...,0,0,0,0,0,0,0,0,0,0
2,2414,528218130,60,58,17104,1,3,3,7,5,...,0,0,0,0,0,0,0,0,0,0
3,1989,902207150,30,60,8520,4,3,3,5,6,...,0,0,0,0,0,0,0,0,0,0
4,625,535105100,20,0,9500,1,3,3,6,5,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Check the shape of train and test data
print('Train data dimension is: {}'.format(clean_train.shape))
print('Test data dimension is: {}'.format(clean_test.shape))

Train data dimension is: (2044, 191)
Test data dimension is: (879, 190)


In [5]:
#Defining the predictor columns

#Predictor columns will be each column in train data except the target variable: saleprice
predictor_columns = [col for col in clean_train.columns if col!='saleprice']


## Model Prep: Create our features matrix (`X`) and target vector (`y`)

Columns in predictor_columns list will be used as a feature.

The `saleprice` column is our target.

In [6]:
# Creating the features matrix(X) and target vector(y)
X = clean_train[predictor_columns]
y = clean_train['saleprice']

## Model Prep: Train/test split

Using the `train_test_split` function to split our `X` and `y` variables into a training set and testing set. <br>
The testing set will be used to test our model performance.

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y,shuffle=True)

In [8]:
# Check the shape of splitted data
print("Shape of X_train is :{}".format(X_train.shape))
print("Shape of X_train is :{}".format(X_test.shape))
print("Shape of X_train is :{}".format(y_train.shape))
print("Shape of X_train is :{}".format(y_test.shape))

Shape of X_train is :(1533, 190)
Shape of X_train is :(511, 190)
Shape of X_train is :(1533,)
Shape of X_train is :(511,)


## Model Prep: Scaling

Since, the features are in different scale, I will be scaling X_train and X_test using the StandardScaler.

In [9]:
#Check the first 2 rows of X_train data.
X_train.head(2)

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,...,sale_type_Oth,sale_type_WD,exterior_1st_PreCast,exterior_2nd_Other,exterior_2nd_PreCast,heating_GasA,mas_vnr_type_CBlock,roof_matl_Metal,roof_matl_Roll,sale_type_VWD
754,1015,527252080,120,60,8118,4,3,3,9,5,...,0,1,0,0,0,0,0,0,0,0
1461,2416,528221010,20,102,11660,1,3,3,6,5,...,0,0,0,0,0,0,0,0,0,0


In [10]:
#Check the first 2 rows of X_test data.
X_test.head(2)

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,...,sale_type_Oth,sale_type_WD,exterior_1st_PreCast,exterior_2nd_Other,exterior_2nd_PreCast,heating_GasA,mas_vnr_type_CBlock,roof_matl_Metal,roof_matl_Roll,sale_type_VWD
1192,1582,916402215,20,93,18265,4,3,3,6,5,...,0,1,0,0,0,0,0,0,0,0
106,2513,533223080,160,0,2651,4,3,3,7,5,...,0,1,0,0,0,0,0,0,0,0


In [11]:
#Instantiate a StandardScaler object
ss = StandardScaler()

#Fit the X_train data into StandardScaler object
ss.fit(X_train)

#Apply the fit method to X_train and X_test data
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [12]:
# Check the scaled X_train
X_train[:1]

array([[-0.5337429 , -0.99419885,  1.51204157,  0.07464835, -0.35286271,
         0.74182915,  0.03428627,  0.21232686,  2.00566409, -0.50242615,
         1.175328  ,  1.08571292,  0.49532207,  2.66739119, -0.20654068,
         1.63229649,  0.14162878,  2.21793901,  1.15139306, -0.33413956,
         1.53706469,  0.87924237,  0.27529888,  0.28595565, -0.76318271,
        -0.10406774,  0.45969202,  1.13830408, -0.26179453,  0.77389466,
        -0.73461417, -1.03621045, -0.20048856,  2.19787873,  0.23964557,
         0.64340781,  1.21631769,  0.31114882,  0.3072278 ,  0.37372577,
         0.27597398,  0.2767261 ,  0.31544799,  0.51525529,  0.024131  ,
        -0.38915648, -0.09781913, -0.28722068, -0.06412177, -0.10403697,
         0.28224585,  0.15657827, -0.09940535, -0.21873932, -0.02554881,
        -0.08103079,  0.51767758, -0.41583058,  0.06268391,  4.64156414,
        -0.14128015, -2.92937707, -0.26698095, -0.1657706 , -0.06268391,
         0.59417591, -0.05114772, -0.09248044, -0.1

In [13]:
# Check the scaled X_test
X_test[:2]

array([[ 0.13716651,  1.06812893, -0.85129421,  1.10532115,  1.52675964,
         0.74182915,  0.03428627,  0.21232686, -0.07229247, -0.50242615,
         0.47252822,  0.09860919,  0.79552082,  0.99426898,  2.47673781,
         0.56430224,  1.78645395,  1.26739372,  1.15139306,  3.17432586,
         0.48404052,  0.87924237,  0.27529888,  0.28595565, -0.76318271,
        -0.10406774, -0.48274717, -0.80492549, -0.26179453,  0.77389466,
        -0.73461417,  0.19710525, -0.20048856,  0.71227552,  0.23964557,
         0.64340781,  0.11332152,  0.26629976, -0.79577036,  0.4691612 ,
         0.27597398,  0.2767261 ,  0.31544799,  1.53066803, -0.70599059,
        -0.38915648, -0.09781913, -0.28722068, -0.06412177, -0.10403697,
        -1.16485411,  0.15657827, -0.09940535, -0.21873932, -0.02554881,
        -0.08103079,  0.51767758, -0.41583058,  0.06268391, -0.21544462,
        -0.14128015,  0.34136951, -0.26698095, -0.1657706 , -0.06268391,
        -1.68300327, -0.05114772, -0.09248044, -0.1

## Model Prep: Baseline model
---

In [14]:
# Instantiate 3 models, LinearRegression, LassoCV and RidgeCV.
lrCV = LinearRegression()
lassoCV = LassoCV()
ridgeCV = RidgeCV()

> Evaluating `cross_val_score` R2-score for all `LinearRegression` model.

In [15]:
# cross_val_score for LinearRegression() model

lrCV_score = cross_val_score(lrCV, X_train, y_train, cv=5)

print('The R2 score for each fold in LinearRegression model for training data is: \n{}'.format(lrCV_score))
print('\nThe mean R2 score for LinearRegression model for training data is: {}'. format(lrCV_score.mean()))

The R2 score for each fold in LinearRegression model for training data is: 
[-1.11582932e+22 -9.70678462e+20 -7.11205283e+23 -5.10936477e+21
 -6.81987118e+20]

The mean R2 score for LinearRegression model for training data is: -1.4582512137737934e+23


> Evaluating `cross_val_score` R2-score for all `LassoCV` model.

In [16]:
base_lasso_score = cross_val_score(lasso, X_train, y_train,cv=5);
print("The R2 score for lasso model is : {}".format(base_lasso_score.mean()))

NameError: name 'lasso' is not defined

> Evaluating `cross_val_score` R2-score for all `RidgeCV` model.

In [None]:
base_ridge_score = > Evaluating `cross_val_score` R2-score for all `LinearRegression` model.cross_val_score(ridge, X_train, y_train,cv=5);
print("The R2 score for ridge model is : {}".format(base_ridge_score.mean()))

_R2 score for linear regression indicates that model complexity is too high. Therefore, its best to use regularization using Lasso & Ridge._

When comparing lasso and ridge, lasso seems to be performing slightly better than ridge.

## Model Prep: building models with optimum parameters.
---

Models I will be evaluating `LassoCV` and `RidgeCV` and `ElasticNetCV`.

In [None]:
# Instantiate 2 models, LassoCV and RidgeCV.
lassoCV = LassoCV()
ridgeCV = RidgeCV()

## Cross validation
---

Evaluating `cross_val_score` R2-score for 3 models with optimum parameters:<br>
1. Lasso
2. Ridge
3. ElasticNet

_R2 score for linear regression indicates that model complexity is too high. Therefore, its best to use regularization using Lasso & Ridge._

> Finding the optimum alpha for `Lasso` model **and** <br>
Evaluating `cross_val_score` R2-score with optimum alpha value in `Lasso` model.<br>


In [None]:
# Find optimum alpha for Lasso model.
lassoCV = LassoCV(cv=10,n_alphas=200).fit(X_train, y_train)

print('The optimum alpha value for LassoCV model is: {}'.format(lassoCV.alpha_))

In [None]:
# Instantiate the Lasso() model with optimum alpha value
lasso = Lasso(alpha=lassoCV.alpha_)

In [None]:
# Calculate the cross_val_score R2 score for Lasso model

lasso_score = cross_val_score(lasso, X_train, y_train,cv=5);

print('The R2 score for each fold in Lasso model for training data is: \n{}'.format(lasso_score))
print('\nThe mean R2 score for Lasso model for training data is: {}'. format(lasso_score.mean()))

> Finding the optimum alpha for `Ridge` model **and** <br>
Evaluating `cross_val_score` R2-score with optimum alpha value in `Ridge` model.<br>

In [None]:
# calculate the optimum alpha for Ridge model

r_alphas = list(np.logspace(-5, 5, 200))

#Iterate through the r_alphas and find the optimal alpha
ridgeCV = RidgeCV(alphas=[index for index in r_alphas]).fit(X_train, y_train)
print('The optimum alpha value for Ridge model is: {}'.format(ridgeCV.alpha_))

In [None]:
# Instantiate the Ridge() model with optimum alpha value
ridge = Ridge(alpha = ridgeCV.alpha_)

# Calculate the cross_val_score R2 score for Lasso model
ridge_score = cross_val_score(ridge, X_train, y_train, cv=5)

print('The R2 score for each fold in Ridge model for training data is: \n{}'.format(ridge_score))
print('\nThe mean R2 score for Ridge model for training data is: {}'. format(ridge_score.mean()))

> Finding the optimum alpha&ratio for `ElasticNet` model **and** <br>
Evaluating `cross_val_score` R2-score with optimum alpha and ratio value in `ElasticNet` model.<br>

In [None]:
# Import ElasticNet,ElasticNetCV models from sklearn
from sklearn.linear_model import ElasticNet, ElasticNetCV
enet_ratio = [0.1,0.3,0.5,0.8,0.9]

#Instantiate the model
enet_model = ElasticNetCV(n_alphas=100, l1_ratio=enet_ratio, cv=5)

#Fit the model
enet_model = enet_model.fit(X_train, y_train)

# Here is the optimal value of alpha & l1-ratio
enet_alpha = enet_model.alpha_
enet_l1ratio = enet_model.l1_ratio_
print('Optimum alpha and l1ratio are : {}, {}'.format(enet_alpha,enet_l1ratio))

#Instantiate the ElasticNet model with optimum values and calculate the R2 score
enet_model = ElasticNet(alpha=enet_alpha,l1_ratio=enet_l1ratio)

enet_score = cross_val_score(enet_model, X_test, y_test,cv=10);

print('\nThe R2 score for each fold in ElasticNet model for training data is: \n{}'.format(enet_score))
print('\nThe mean R2 score for ElasticNet model for training data is: {}'. format(enet_score.mean()))

------
#### Plotting the coefficients for both Lasso model <br>
This will help to understand what features dominate the `salesprice` the most.


In [None]:
# Create a dataframe for the lasso coefficients
coef_lasso = pd.DataFrame((lassoCV.coef_), index=predictor_columns)
coef_lasso.columns=['coefficients']

print("Total number of features in X_train were: {}".format(len(predictor_columns)))
print("-------------------------")
print("Number of regression coefficients shrinked to zero by lasso are: {}".format(coef_lasso[coef_lasso['coefficients']==0.000000].count()[0]))

In [None]:
# Dropping the features with coefficients of zero.
coef_lasso.drop(coef_lasso[coef_lasso['coefficients']==0.000000].index, inplace=True)

# Sort the coefficients in descending order
coef_lasso.sort_values(['coefficients'], ascending=False, inplace=True)

#Reset the index
coef_lasso.reset_index

# Plot the coefficients
fig = plt.figure(figsize=(15,25))
ax = sns.barplot(y=coef_lasso.index,x='coefficients', data=coef_lasso, orient="h");

In [None]:
# Print top 10 features with their coefficients (lasso model)

coef_lasso[:10]

In [None]:
# Print top 10 negatively correlated features with their coefficients (lasso model)

coef_lasso[-10:]

#### Plotting the coefficients for  Ridge model <br>

In [None]:
# Create a dataframe for the ridge coefficients
coef = pd.DataFrame((ridgeCV.coef_), index=predictor_columns)
coef.columns=['coefficients']

print("Total number of features in X_train were: {}".format(len(predictor_columns)))
print("-------------------------")
print("Number of regression coefficients shrinked to zero by ridge are: {}".format(coef[coef['coefficients']==0.000000].count()[0]))

In [None]:
# Dropping the features with coefficients of zero.
coef.drop(coef[coef['coefficients']==0.000000].index, inplace=True)

# Sort the coefficients in descending order
coef.sort_values(['coefficients'], ascending=False, inplace=True)

# Reset the index of dataframe
coef.reset_index

#Plot the coefficients
fig = plt.figure(figsize=(15,45))
ax = sns.barplot(y=coef.index,x='coefficients', data=coef, orient="h");

In [None]:
# Print top 10 features with their coefficients (ridge model)

coef[:10]

In [None]:
# Print top 10 negatively correlated features with their coefficients (ridge model)

coef[-10:]

On comparing the R2 score for all 4 models: `Lasso` seems to perform better than others. <br>
So, I will be using `Lasso` model for further training and testing.

## Model Fitting and Evaluation
---

`LassoCV` seems to be doing the best. <br>

- change to lasso and then optimum alpha for predicted ones.
predicted - split
scale
fit n predict

So, I will be fitting the model to the training data, evaluate the training and test scores and predicting the `salesprice`.

In [None]:
# Instantiate the Lasso model with already calculated optimum alpha value
lasso = Lasso(alpha = lassoCV.alpha_)

# Fit the model with X_train and y_train data
lasso.fit(X_train, y_train)

In [None]:
# Calculate the R2 score on training data
r2_score_train = lasso.score(X_train, y_train)
print('The R2 score for train data is: {:.4f}'.format(r2_score_train))

In [None]:
# Predicting the salesprice for X_test
pred = lasso.predict(X_test)

# Print first 5 predictions done.
pred[:5]

In [None]:
# Calculate the R2 score for predictions

r2_score_pred = r2_score(y_test, pred)
print('The predictions R2 score is : {}'.format(r2_score_pred))

In [None]:
# Calculate the residuals
residuals = y_test - pred

# Print first 5 residuals
residuals[:5]

In [None]:
# Plot the histogram for residuals
residuals.hist(bins=50)
plt.title('Histogram for residuals');

In [None]:
# Residual plot
fig = plt.figure(figsize=(10,7))

ax = plt.scatter(x=pred,y=residuals)
plt.plot([0,700000], [0,0], color='k', linestyle='--', linewidth=2);
plt.ylabel('Residuals')
plt.xlabel('Predicted values')
plt.title('Residual plot',fontsize=20);

#plt.ylim(-100000,)
#plt.xlim(-100000,)

**Residual plot interpretation**<br>
_The x-axis represents our predicted values and y-axis represnts the residual for each predicted value._

_The distance from the line at 0 is how bad the prediction was for that value._

_Since, in this residual plot, values are not substantially not far from line at 0, we can say that accuracy of model is good._


In [None]:
# Scatter plot to understand the relationship between predicted and actual values.
fig = plt.figure(figsize=(10,7))
ax = sns.regplot(y_test, pred)
ax.set_title('Actual vs predicted values', fontsize=20)
ax.set_xlabel('Actual values', fontsize=15)
ax.set_ylabel('Predicted values', fontsize=15);

**Predicted vs Actual values plot interpretation**<br>
Predictions have close correlation with the Actual values. Therefore, we can say that accuracy of model is good.

In [None]:
# Distribution plot for predicted and actual values
fig = plt.figure(figsize=(15,7))

ax1 = sns.distplot(y_test,hist=False, label='Actual test values', color='b')
sns.distplot(pred,hist=False,label='Predicted values', color='r',ax=ax1)

ax1.set_title('Distribution plot for actual and predicted values',fontsize=20);

### Feature selection based on lasso.

It makes sense to drop the features whose coefficients became zero by lasso from our initial predictors list.

In [None]:
# Take the features from lasso coefficients
final_predictor_features = list(coef_lasso.index)

print("Features used are: {}".format(len(final_predictor_features)))

## Model fitting on entire X_train dataset.

In this step, I will be training my model on entire training data.<br>

So, X_train_final will be entire dataset minus the target variable.<br>
y_train_final will be `saleprice` column in clean_train data.

X_test_final will be predictor columns, like in X_train_final.

In [None]:
# To see all columns in jupyter notebook
pd.set_option('display.max_columns', 200)

In [None]:
# Creating the feature and target variables.
X_train_final = clean_train[final_predictor_features]
y_train_final = clean_train['saleprice']

In [None]:
# Creating the X_test_final data
X_test_final = clean_test[final_predictor_features]

### Scaling the X_train_final and X_test_final data

In [None]:
# Check the first 2 rows of X_train_final
X_train_final.head(2)

In [None]:
# Check the first 2 rows of X_train_final
X_test_final.head(2)

In [None]:
#Instantiate a StandardScaler object
ss = StandardScaler()

#Fit the X_train_final data into StandardScaler object
ss.fit(X_train_final)

#Apply the fit method to X_train_final
X_train_final = ss.transform(X_train_final)
X_test_final = ss.transform(X_test_final)

In [None]:
# Check the scaled X_train_final
X_train_final[:2]

In [None]:
# Check the scaled X_test_final
X_test_final[:2]

In [None]:
#Instantiate the LassoCV() model
lasso_final = LassoCV()

In [None]:
# Find optimum alpha for Lasso model.
lasso_final = LassoCV(cv=10,n_alphas=200).fit(X_train_final, y_train_final)

print('The optimum alpha value for LassoCV model is: {}'.format(lassoCV.alpha_))

In [None]:
# Instantiate the Lasso() model with optimum alpha value
lasso = Lasso(alpha=lasso_final.alpha_)

# Calculate the cross_val_score R2 score for Lasso model
lasso_score = cross_val_score(lasso, X_train_final, y_train_final,cv=5);

print('The R2 score for each fold in Lasso model for training data is: \n{}'.format(lasso_score))
print('\nThe mean R2 score for Lasso model for training data is: {}'. format(lasso_score.mean()))

In [None]:
# Fit the model with X_train_final and y_train_final data

lasso.fit(X_train_final, y_train_final)

In [None]:
# Calculate the R2 score on training data
r2_score_train = lasso.score(X_train_final, y_train_final)
print('The R2 score for train data is: {:.4f}'.format(r2_score_train))

# Prediction

Predict the values of `saleprice` for the X_test_final data.

In [None]:
# Predicting the salesprice for X_test_final
predictions_final1 = lasso.predict(X_test_final)

# Print first 5 predictions done.
predictions_final1[:5]

In [None]:
clean_test['id'].head()

In [None]:
# Creating the dataframe for submissions
predicted_saleprice1 = pd.DataFrame(clean_test['id'])
predicted_saleprice1['SalePrice'] = predictions_final1

In [None]:
# Check the dataframe created
predicted_saleprice1.head()

In [None]:
# Save the predictions in csv format
predicted_saleprice1.to_csv('../datasets/predictions1.csv',index=False)