In [None]:
#Importing the libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statistics as stat

# Raise the pandas display limits so wide and long frames are shown without truncation.
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 200)



from sklearn import linear_model, metrics
from sklearn.linear_model import Ridge,Lasso

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler



In [None]:
# Hide warnings.
# NOTE(review): this installs a blanket 'ignore' filter, so deprecation and
# chained-assignment warnings raised by later cells are hidden as well.
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Read the training CSV into a DataFrame and preview the first rows.
train_csv_path = '../input/house-price-prediction/train.csv'
housing = pd.read_csv(train_csv_path)
housing.head()

In [None]:
# (rows, columns) of the raw frame
housing.shape

In [None]:
# Per-column dtype and non-null count for the raw frame
housing.info()

In [None]:
# Summary statistics of the numeric columns at several extra percentiles
# (transposed so each column becomes a row); used to spot outliers that the
# EDA section deals with.
percentile_cuts = [0.05, .25, .5, .75, .90, .95, .99]
housing.describe(percentiles=percentile_cuts).T


# EDA - Exploratory Data Analysis
#### Let's understand and examine the data in order to get a clean dataset to work on further 

- We will deal with the null values and impute them.
- We will look for outliers and treat them to make the data clean.
- We will make some **visualizations** to get a clear picture of how the variables behave before proceeding to **model building**.

For visualisation we are using `Matplotlib ` and `seaborn` 


In [None]:
# Null-value summary: absolute count and percentage of missing entries per
# column, each sorted from most to least affected, shown side by side.
null_counts = housing.isnull().sum()
null_share = round(100 * (null_counts / housing.shape[0]), 2)

total = null_counts.sort_values(ascending=False).to_frame('Total_null')
percentage = null_share.sort_values(ascending=False).to_frame('Null_Percentage')
pd.concat([total, percentage], axis=1)

In [None]:
# Bar chart of the percentage of missing values per column; columns without
# any missing value are left out.
null_data = round(housing.isnull().sum() / len(housing) * 100, 2).sort_values(ascending=False)
null_data = null_data[null_data > 0]

plt.rc('xtick', labelsize=8)
plt.rc('ytick', labelsize=7)
plt.figure(figsize=[12, 5], dpi=150)
ax = null_data.plot.bar()
# Label the chart so it can be read on its own when the notebook is skimmed.
ax.set_title('Percentage of missing values per column')
ax.set_xlabel('Column')
ax.set_ylabel('Missing values (%)')
plt.show()


### Insights from Null Values 

- From both the table and the visualization it is clear that a few columns contain a very high share of null values, i.e. more than 80%. They are as follows:


                   `PoolQC`
                   `MiscFeature`
                   `Alley`
                   `Fence`
- We will drop these columns.
- The remaining columns, which have null values under 20 percent, will be examined and imputed further below.
    
### Now let's delve into the dataset and understand it separately for `Numerical Columns` and `Categorical Columns`.



In [None]:
# Collect all numeric columns (integer and float) in one frame.
# An explicit copy is taken because later cells modify housing_num (imputation,
# log transform, dropped columns); the copy keeps those writes off `housing`
# and avoids chained-assignment ambiguity.
housing_num = housing.select_dtypes(include=['float64', 'int64']).copy()
print(housing_num.shape)
housing_num.head()



In [None]:
# Collect all non-numeric (object-type) columns in one frame.
# An explicit copy is taken because later cells modify housing_obj (imputation,
# dropped columns); the copy keeps those writes off `housing`.
housing_obj = housing.select_dtypes(exclude=['float64', 'int64']).copy()
print(housing_obj.shape)
housing_obj.head()


- There are 38 numerical columns and 43 non-numeric or object type columns present in the dataset 

## Understanding Numerical Columns 

### Imputation of Numerical Columns 

In [None]:
# Before imputing: print every numeric column that has missing values,
# together with the fraction (0-1) of rows that are missing.
null_fraction = housing_num.isnull().sum() / len(housing_num)
for column, missing_percentage in null_fraction[null_fraction != 0].items():
    print(column, missing_percentage)

In [None]:
# Distribution of the three numeric columns that contain nulls, to pick an imputation value
housing_num[['LotFrontage','MasVnrArea','GarageYrBlt']].describe()

In [None]:
# Impute `LotFrontage`, `MasVnrArea` and `GarageYrBlt` with their column means.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a column selection acts on a temporary object and is not guaranteed to
# update the frame (it never does under pandas copy-on-write).
for mean_imputed_col in ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']:
    column_mean = housing_num[mean_imputed_col].mean()
    housing_num[mean_imputed_col] = housing_num[mean_imputed_col].fillna(column_mean)

In [None]:
# Re-check the null counts in the numeric frame after imputation

housing_num.isnull().sum()


- there are no more null values in the numeric dataset 

### Let's check the value counts in each column 

In [None]:
# Preview the numeric frame; only the first rows are shown instead of dumping the whole frame.
housing_num.head()

In [None]:
# Print the value counts (NaN included) of every numeric column, one block per column.
block_separator = '\n--------------------------\n\n'
for column_name in housing_num.columns:
    counts = housing_num[column_name].value_counts(dropna=False)
    print(column_name, ':\n', counts, sep='', end=block_separator)

1. From the above observation we can conclude that `Id` contains only unique values and is not essential for the modeling, so we can drop it.

2. There are a few columns which take only a handful of discrete values, so they can be treated as categorical; their data type would need to change from `int` to `object`.

In [None]:
# Dtypes and non-null counts of the numeric frame after imputation
housing_num.info()

In [None]:
# `Id` is a unique row identifier with no predictive value, so it is removed.
housing_num = housing_num.drop(columns=['Id'])


### Descriptive analysis of target variable " SalePrice"

- `SalePrice` is our target variable and we need to check certain things about it before proceeding further.
- We need to check whether the distribution of the dependent variable is approximately a `normal distribution`, which helps to satisfy the assumptions of `Linear Regression`.
- We can check this by visualizing it in a `dist plot`.

In [None]:
# Distribution of the raw target variable.
plt.rc('xtick', labelsize=6)
plt.rc('ytick', labelsize=6)
plt.figure(figsize=(6, 3.2), dpi=150)

raw_sale_price = housing_num['SalePrice']
sns.distplot(raw_sale_price)
plt.show()



   - We can see that `SalePrice` is right-skewed.
   - We need to `log`-transform it to bring it closer to a normal distribution before performing `linear regression`.

In [None]:
# Log-transform the target variable (log1p, i.e. log(1 + x)) to reduce its right skew.
# NOTE(review): SalePrice is overwritten in place, so re-running this cell applies the transform again.
housing_num['SalePrice'] = np.log1p(housing_num['SalePrice'])

In [None]:
# Distribution of the target after the log transform.
plt.rc('xtick', labelsize=6)
plt.rc('ytick', labelsize=6)
plt.figure(figsize=(6, 3.2), dpi=150)

log_sale_price = housing_num['SalePrice']
sns.distplot(log_sale_price)
plt.show()


- Now the target variable is approximately normally distributed (the right skew has been removed).

### Now let's check outliers in the dataframe 
- we will visualize with boxplot 


In [None]:
# One boxplot per numeric column to spot outliers.
# The tick-size settings do not change between iterations, so they are set once.
plt.rc('xtick', labelsize=11)
plt.rc('ytick', labelsize=11)

for values in housing_num:
    plt.figure(figsize=(8, 6))
    sns.boxplot(x=values, data=housing_num)
    plt.title('Boxplot of ' + str(values))
    # Show inside the loop so each figure is rendered and released before the
    # next one is created, instead of keeping every figure open at once.
    plt.show()

### Let's first check visually, with scatter plots, whether there is any relationship with the target variable

In [None]:
# Scatter plot of every numeric column against the (log-transformed) target.
for features in housing_num.columns.values:
    plt.figure(figsize=(5, 4))
    plt.scatter(housing_num[features], housing_num['SalePrice'], alpha=0.3)
    plt.title("SalePrice vs " + str(features))
    plt.xlabel(str(features))
    plt.ylabel('SalePrice')
    # Show inside the loop so each figure is rendered and released before the
    # next one is created, instead of keeping every figure open at once.
    plt.show()

###  There are some derived Metric present in the dataset lets treat them 


In [None]:
# Replace each year column by an "age" column: the distance of every value
# from the latest year found in that same column.
year_columns = ['YearBuilt', 'YearRemodAdd', 'YrSold', 'GarageYrBlt']
for year_column in year_columns:
    latest_year = housing_num[year_column].max()
    housing_num[year_column + '_Age'] = latest_year - housing_num[year_column]

# The original year columns are no longer needed once the ages exist.
housing_num = housing_num.drop(year_columns, axis=1)

In [None]:
# Preview the four derived age columns
housing_num[['YearBuilt_Age','YearRemodAdd_Age','YrSold_Age','GarageYrBlt_Age']].head(10)

In [None]:
# Annotated correlation heatmap over all numeric columns (target included).
plt.rc('xtick', labelsize=16)
plt.rc('ytick', labelsize=16)
fig = plt.figure(figsize=(25, 23), dpi=150)

correlation_matrix = housing_num.corr()
sns.heatmap(correlation_matrix, annot=True, cmap="Greens", fmt='.1f')
plt.show()

### Insights 


- From the above correlation heatmap we can see that several features are strongly correlated with the target variable `SalePrice`.
- They are as follows:

                   1.OverallQual-0.8
                   2.TotalBsmtSF-0.6       
                   3.1stFlrSF   -0.6
                   4.GrLivArea  -0.6       
                   5.FullBath   -0.7 
                   6.GarageCars -0.7
                   7.GarageArea -0.7
                   

        
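- All of the above variables have a correlation of more than 0.5 with the target, so they are strong candidate predictors; where two of them are also highly correlated with each other, only one of the pair is kept.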
        
- There are a few other variables which show multicollinearity among themselves.
- They are as follows:

                     1.TotalBsmtSF & 1stFlrSF
                     2.GarageCars & GarageArea
                     3.YearBuilt_Age & YearRemodAdd_Age
                     4.YearBuilt_Age   & GarageYrBlt_Age  


  - For each such pair we need to drop at least one of the variables as well.
                   
                     
                                


In [None]:
# Remove the features singled out in the heatmap discussion.
collinear_features = [
    'GarageArea', 'GarageYrBlt_Age', 'YearRemodAdd_Age', '1stFlrSF',
    'TotRmsAbvGrd', 'YearBuilt_Age', 'YrSold_Age',
]
housing_num = housing_num.drop(columns=collinear_features)
     

##### There are a few columns which are nearly uncorrelated with the target variable, as concluded from both the scatter plots and the heatmap
They are:
`'YrSold','MoSold','PoolArea','LowQualFinSF','MSSubClass','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','MiscVal'`

In [None]:
# Drop the variables that show almost no relationship with the target.
uncorrelated_features = [
    'PoolArea', 'LowQualFinSF', 'MSSubClass', 'OpenPorchSF', 'EnclosedPorch',
    '3SsnPorch', 'ScreenPorch', 'MiscVal', 'BsmtFinSF2', 'BsmtHalfBath',
]
housing_num = housing_num.drop(columns=uncorrelated_features)
     


In [None]:
# Correlation heatmap again, after the manual feature elimination above.
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
fig = plt.figure(figsize=(15, 12), dpi=150)

reduced_correlation = housing_num.corr()
sns.heatmap(reduced_correlation, annot=True, cmap="Greens", fmt='.1f')
plt.show()

## understanding categorical variable 

### checking Null values and impute them 


In [None]:
# Before imputing: print every categorical column that has missing values,
# together with the fraction (0-1) of rows that are missing.
# The denominator is the categorical frame's own length rather than housing_num's.
for column in housing_obj.columns.values:
    null_count = housing_obj[column].isnull().values.sum()
    if null_count != 0:
        missing_percentage = null_count / len(housing_obj)
        print(column, missing_percentage)

In [None]:
# Print the value counts (NaN included) of every categorical column, one block per column.
block_separator = '\n--------------------------\n\n'
for column_name in housing_obj.columns:
    counts = housing_obj[column_name].value_counts(dropna=False)
    print(column_name, ':\n', counts, sep='', end=block_separator)

### we observe that 


1. `Alley`, `PoolQC`, `Fence`, `MiscFeature` - these variables have a very high share of null values, i.e. more than 80 percent, so we can drop them from the dataset.

2. Some columns are dominated by a single value, so we can drop these columns as well. They are:
                                `'Street','Utilities','Condition2','RoofMatl','Heating','Functional','PavedDrive','GarageCond','Electrical','LandSlope'`

3. For some columns a null does not mean "missing" but "feature not present" (e.g. no fireplace, no basement, no garage), so we impute those with an explicit label.

In [None]:
# Remove the columns in which the majority of values are NaN.
housing_obj = housing_obj.drop(columns=['Alley', 'PoolQC', 'Fence', 'MiscFeature'])

# In these columns a null is filled with an explicit "feature absent" label.
absent_feature_labels = {
    'FireplaceQu': 'No Fireplace',
    'BsmtQual': 'No Basement',
    'BsmtCond': 'No Basement',
    'BsmtExposure': 'No Basement',
    'BsmtFinType1': 'No Basement',
    'BsmtFinType2': 'No Basement',
    'GarageType': 'No Garage',
    'GarageFinish': 'No Garage',
    'GarageQual': 'No Garage',
    'GarageCond': 'No Garage',
}
# A dict passed to DataFrame.fillna fills each listed column with its own
# value and returns a new frame, so no chained inplace call on a column
# selection is needed.
housing_obj = housing_obj.fillna(value=absent_feature_labels)

# The remaining nulls are filled with the most frequent category.
# Series.mode() ignores NaN, whereas statistics.mode() counts NaN as a
# candidate value and can return it when nulls are the most common entry.
for mode_imputed_col in ['MasVnrType', 'Electrical']:
    most_frequent = housing_obj[mode_imputed_col].mode()[0]
    housing_obj[mode_imputed_col] = housing_obj[mode_imputed_col].fillna(most_frequent)

In [None]:
# Drop the categorical columns that are dominated by a single value.
dominant_value_columns = [
    'Street', 'Utilities', 'Condition2', 'RoofMatl', 'Heating', 'Functional',
    'PavedDrive', 'GarageCond', 'Electrical', 'LandSlope', 'BsmtFinType2',
]
housing_obj = housing_obj.drop(columns=dominant_value_columns)

In [None]:
# Re-check the null counts in the categorical frame after imputation
housing_obj.isnull().sum()

## 3. Data Preparation 


#### Data Preparation

Let's now prepare the data and build the model.

In [None]:
# One-hot encode the categorical variables; drop_first removes one level per
# variable so the dummies are not perfectly collinear.
# The dtype is pinned to uint8: pandas >= 2.0 returns bool dummies by default,
# and a frame that mixes bool and float columns is cast to an object array by
# the statsmodels OLS fit further down, which then fails.
housing_dummies = pd.get_dummies(housing_obj, drop_first=True, dtype=np.uint8)
print(housing_dummies.shape)
housing_dummies.head()

In [None]:
# Build the final modelling frame: the numeric columns (target included) side by side with the dummy-encoded categoricals
df=pd.concat([housing_num,housing_dummies],axis=1)

### Splitting the Data into Training and Testing Sets

In [None]:
# Separate the target column from the feature matrix.
target_column = 'SalePrice'
y = df[target_column]
X = df.drop(target_column, axis=1)

In [None]:
# (rows, features) of the feature matrix
X.shape

In [None]:
# Preview the feature matrix
X.head()

In [None]:
# Number of target values; should match the row count of X
y.shape

In [None]:
# Preview the (log-transformed) target
y.head()

In [None]:
# Split rows into 70% train / 30% test; random_state=100 is passed to train_test_split to fix the shuffle.
np.random.seed(0)
X_train, X_test,y_train,y_test = train_test_split(X,y, train_size = 0.7, test_size = 0.3, random_state = 100)

In [None]:
# Shapes of the four pieces produced by the split.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

## Data scaling

In [None]:
# Standardise the numeric feature columns of the training set; the dummy
# columns are left untouched.
scaler = StandardScaler()

# Every numeric column except the target is scaled.
cols = housing_num.columns.values.tolist()
cols.remove('SalePrice')

scaled_train_values = scaler.fit_transform(X_train[cols])
X_train[cols] = scaled_train_values
X_train.head()

In [None]:
# Preview the training target
y_train.head()

In [None]:
# Scale the test set with the statistics learned from the training set.
# Only transform() is used here: fit_transform() would re-fit the scaler on the
# test data, so train and test would be standardised with different means and
# variances and test-set information would leak into preprocessing.
X_test[cols] = scaler.transform(X_test[cols])
X_test.head()

## Model Building

### RFE

`First we will go with RFE for feature selection of the top 40 features and build a basic regression model`


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# Recursive feature elimination down to 40 features, driven by a plain linear regression.
# n_features_to_select is passed by keyword: scikit-learn 1.0 made it
# keyword-only, so the positional form RFE(linreg, 40) raises a TypeError there.
linreg = LinearRegression()
rfe = RFE(estimator=linreg, n_features_to_select=40)
rfe = rfe.fit(X_train, y_train)
print(rfe.support_)

In [None]:
# Names of the columns kept by RFE (rfe.support_ is used as a mask over the training columns)
col=X_train.columns[rfe.support_]
col

In [None]:
# Restrict the training matrix to the RFE-selected columns and run it through sm.add_constant
import statsmodels.api as sm
X_train_rfe = sm.add_constant(X_train[list(col)])

In [None]:
# Fit an OLS model on the RFE-selected features and print its summary
lm = sm.OLS(y_train, X_train_rfe).fit()
print(lm.summary())

## Ridge Regression - L2

In [None]:
# Candidate regularisation strengths to tune over.
params = {
    'alpha': [
        0.0001, 0.001, 0.01, 0.05, 0.1,
        0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0,
        4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000,
    ]
}

ridge = Ridge()

# 5-fold cross-validated grid search, scored by negative mean absolute error.
folds = 5
ridge_search_settings = dict(
    estimator=ridge,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=folds,
    return_train_score=True,
    verbose=1,
)
model_cv = GridSearchCV(**ridge_search_settings)
model_cv.fit(X_train, y_train)

In [None]:
# Cross-validation results as a frame, keeping only the alphas up to 200.
all_cv_results = pd.DataFrame(model_cv.cv_results_)
alpha_in_range = all_cv_results['param_alpha'] <= 200
cv_results = all_cv_results[alpha_in_range]

score_columns = ['param_alpha', 'params', 'mean_test_score', 'mean_train_score']
cv_results[score_columns]

In [None]:
# Alpha with the best mean cross-validated score for Ridge
model_cv.best_params_

In [None]:
# Best mean cross-validated score (negative MAE) for Ridge
model_cv.best_score_

In [None]:
# Plot the mean train and test scores against alpha.
# The alphas are cast to float, not int32: an integer cast truncates every
# alpha below 1 (0.0001 ... 0.9) to 0 and collapses those points onto x = 0.
# assign() builds a new frame, so nothing is written into the filtered slice
# of the results.
cv_results = cv_results.assign(param_alpha=cv_results['param_alpha'].astype('float32'))

plt.figure(figsize=(10, 8))
plt.plot(cv_results['param_alpha'], cv_results['mean_train_score'])
plt.plot(cv_results['param_alpha'], cv_results['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')
plt.title("Negative Mean Absolute Error and alpha")
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()

In [None]:
# Fit Ridge with the chosen alpha and report R2 on train and test.
alpha = 10.0
ridge = Ridge(alpha=alpha)
ridge.fit(X_train, y_train)

y_pred_ridge_train = ridge.predict(X_train)
train_r2 = r2_score(y_train, y_pred_ridge_train)
print('Train R2 Square : ', round(train_r2, 2))

y_pred_ridge_test = ridge.predict(X_test)
test_r2 = r2_score(y_test, y_pred_ridge_test)
print('Test R2 Square : ', round(test_r2, 2))

In [None]:
# Distribution of the Ridge training residuals (actual minus predicted)
sns.distplot((y_train-y_pred_ridge_train))

In [None]:
# Ridge model parameters: intercept first, then one coefficient per feature,
# rounded to 2 decimals.
model_parameters = [round(x, 2) for x in [ridge.intercept_] + list(ridge.coef_)]

# Labels come from the frame the model was fitted on. A separate name is used
# so the global `cols` (the list of scaled numeric columns) is not overwritten.
coef_labels = X_train.columns.insert(0, "constant")

# Keep only parameters that are non-zero after rounding. The original test
# `abs(x[-1] != 0)` took abs() of the comparison result rather than of the
# coefficient; the plain comparison states the intent directly.
var_coef = [(label, value) for label, value in zip(coef_labels, model_parameters) if value != 0]
var_coef

In [None]:
# Turn the (feature, coefficient) pairs into a two-column frame.
ridge_feature_names, ridge_coef_values = zip(*var_coef)
df1 = {'Feature': list(ridge_feature_names), 'Coeff': list(ridge_coef_values)}
ridge_params = pd.DataFrame(data=df1)


In [None]:
# Ridge parameters ordered by absolute coefficient size, largest first
ridge_params.reindex(ridge_params.Coeff.abs().sort_values(ascending = False).index)

## Lasso Regularization - L1

In [None]:
# 5-fold cross-validated grid search for Lasso over the same alpha grid,
# scored by negative mean absolute error.
lasso = Lasso()
lasso_search_settings = dict(
    estimator=lasso,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=folds,
    return_train_score=True,
    verbose=1,
)
model_cv = GridSearchCV(**lasso_search_settings)

model_cv.fit(X_train, y_train)

In [None]:
# Cross-validation results of the Lasso search: alpha with mean test and train scores
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results[['param_alpha','params','mean_test_score','mean_train_score']]

In [None]:
# Plot the mean train and test scores against alpha for the Lasso search.
cv_results['param_alpha'] = cv_results['param_alpha'].astype('float32')

plt.figure(figsize=(10, 8))
for score_column in ('mean_train_score', 'mean_test_score'):
    plt.plot(cv_results['param_alpha'], cv_results[score_column])

plt.title("Negative Mean Absolute Error and alpha")
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()

In [None]:
# Alpha with the best mean cross-validated score for Lasso
model_cv.best_params_

In [None]:
# Best mean cross-validated score (negative MAE) for Lasso
model_cv.best_score_

In [None]:
# Fit Lasso with the chosen alpha and report R2 on train and test.
# NOTE(review): the write-up below states the optimal Lasso alpha as 0.001 and
# doubles it to 0.002, while this cell uses 0.0001 - confirm against
# model_cv.best_params_.
alpha = 0.0001
lasso = Lasso(alpha=alpha)
lasso.fit(X_train, y_train)

y_pred_lasso_train = lasso.predict(X_train)
train_r2 = r2_score(y_train, y_pred_lasso_train)
print('Train R2 Square : ', round(train_r2, 2))

y_pred_lasso_test = lasso.predict(X_test)
test_r2 = r2_score(y_test, y_pred_lasso_test)
print('Test R2 Square : ', round(test_r2, 2))

In [None]:
# Distribution of the Lasso training residuals (actual minus predicted)
sns.distplot((y_train-y_pred_lasso_train))

In [None]:
# Lasso model parameters: intercept first, then one coefficient per feature,
# rounded to 3 decimals.
model_parameters = [round(x, 3) for x in [lasso.intercept_] + list(lasso.coef_)]

# Labels come from the frame the model was fitted on. A separate name is used
# so the global `cols` is not overwritten.
coef_labels = X_train.columns.insert(0, "constant")

# Keep only parameters that are non-zero after rounding. The original test
# `abs(x[-1] != 0)` took abs() of the comparison result rather than of the
# coefficient; the plain comparison states the intent directly.
var_coeff = [(label, value) for label, value in zip(coef_labels, model_parameters) if value != 0]
var_coeff

In [None]:
# Turn the Lasso (feature, coefficient) pairs into a two-column frame.
# The pairs are read from `var_coeff` (the Lasso list built in the previous
# cell). The original read `var_coef`, which is the Ridge list, so the
# "lasso" table actually showed the Ridge coefficients.
lasso_feature_names, lasso_coef_values = zip(*var_coeff)
df2 = {'Feature': list(lasso_feature_names), 'Coeff': list(lasso_coef_values)}
lasso_params = pd.DataFrame(data=df2)


In [None]:
# Lasso parameters ordered by absolute coefficient size, largest first
lasso_params.reindex(lasso_params.Coeff.abs().sort_values(ascending = False).index)

In [None]:
# Side-by-side R2 of Ridge and Lasso, first on the train split, then on the test split.
r2_report = (
    ('r2_score in train dataset:', y_train, y_pred_ridge_train, y_pred_lasso_train),
    ('r2_score in test dataset:', y_test, y_pred_ridge_test, y_pred_lasso_test),
)
for header, y_true, ridge_pred, lasso_pred in r2_report:
    print(header)
    print('r2_score for ridge:', round(r2_score(y_true, ridge_pred), 2))
    print('r2_score for lasso:', round(r2_score(y_true, lasso_pred), 2))

# Inferences


`We have applied both Ridge and Lasso regression to the dataset.
We have identified the top 10 variables which influence the price of houses and can best be used for our business purpose.`

`so the predictor variables we got from Ridge Regularization ` :

                            1.MSZoning(RH,RM,FV,RL)
                            2.SaleType_ConLD
                            3.Neighborhood_(Crawfor,MeadowV,StoneBr,Somerst)
                            4.GrLivArea
                            6.LandContour_Low
                            7.Exterior1st_BrkFace
                            8.KitchenQual_Fa
                            9.OverallQual`
                           

                            
  `The predictor variables from Lasso Regularization :`
  
                           
                            1.MSZoning(RH,RM,FV,RL)
                            2.SaleType_ConLD
                            3.Neighborhood_(Crawfor,MeadowV,StoneBr,Somerst)
                            4.GrLivArea
                            5.BldgType_Twnhs
                            6.LandContour_Low
                            7.GarageQual_Gd
                            8.KitchenQual_Fa
                            9.OverallQual
                            10.Exterior1st_BrkFace
                           `
         

# Assignment Questions and answers 

`What is the optimal value of alpha for ridge and lasso regression? What will be the changes in the model if you choose double the value of alpha for both ridge and lasso? What will be the most important predictor variables after the change is implemented?`

`What is the optimal value of alpha for ridge and lasso regression?`


`The optimal values of lambda i.e alpha for Ridge =10
The optimal values of lambda i.e alpha for Lasso=0.001`

In [None]:
# Double the Ridge alpha (10 -> 20), refit, and print R2 on train and then test.
alpha = 20
ridge = Ridge(alpha=alpha)
ridge.fit(X_train, y_train)

y_pred_ridge_train = ridge.predict(X_train)
train_r2 = r2_score(y_train, y_pred_ridge_train)
print(round(train_r2, 2))

y_pred_ridge_test = ridge.predict(X_test)
test_r2 = r2_score(y_test, y_pred_ridge_test)
print(round(test_r2, 2))

In [None]:
# Coefficients of the Ridge model refitted with the doubled alpha, ranked by
# absolute size (intercept included under the label "constant").
model_parameters = [round(x, 3) for x in [ridge.intercept_] + list(ridge.coef_)]

# Labels come from the frame the model was fitted on. A separate name is used
# so the global `cols` is not overwritten.
coef_labels = X_train.columns.insert(0, "constant")

# Keep only parameters that are non-zero after rounding. The original test
# `abs(x[-1] != 0)` took abs() of the comparison result rather than of the
# coefficient.
var_coeff = [(label, value) for label, value in zip(coef_labels, model_parameters) if value != 0]

df3 = pd.DataFrame.from_records(var_coeff, columns=['Features', 'B-Coeff'])
df3['B-Coeff'] = df3['B-Coeff'].abs()
df3 = df3.sort_values(by=['B-Coeff'], ascending=False)
df3


In [None]:
# Drop the intercept row by name, then take the five largest coefficients.
# The original slice df3[1:6] assumed the intercept always sorts first.
top5_ridge_doubled = df3[df3['Features'] != 'constant'].head(5)
print('Top 5 Predictor Variables using Ridge after Doubling the Alpha :', top5_ridge_doubled.values)

In [None]:
# Refit Lasso with alpha = 0.002 and print R2 on train and then test.
# NOTE(review): 0.002 is double the 0.001 quoted in the write-up, whereas the
# Lasso model fitted earlier used alpha = 0.0001 - confirm which is intended.
alpha = 0.002
lasso = Lasso(alpha=alpha)
lasso.fit(X_train, y_train)

y_pred_lasso_train = lasso.predict(X_train)
train_r2 = r2_score(y_train, y_pred_lasso_train)
print(round(train_r2, 2))

y_pred_lasso_test = lasso.predict(X_test)
test_r2 = r2_score(y_test, y_pred_lasso_test)
print(round(test_r2, 2))

In [None]:
# Coefficients of the Lasso model refitted with the doubled alpha, ranked by
# absolute size.
model_parameters = [round(x, 3) for x in [lasso.intercept_] + list(lasso.coef_)]

# Labels come from the frame the model was fitted on. A separate name is used
# so the global `cols` is not overwritten.
coef_labels = X_train.columns.insert(0, "constant")

# Keep only parameters that are non-zero after rounding. The original test
# `abs(x[-1] != 0)` took abs() of the comparison result rather than of the
# coefficient.
var_coeff = [(label, value) for label, value in zip(coef_labels, model_parameters) if value != 0]

# The table gets its own name. The original stored it in `df`, overwriting the
# modelling frame that X and y are built from, which breaks any re-run of the
# earlier `X = df.drop('SalePrice', ...)` cell.
lasso_doubled_coefs = pd.DataFrame.from_records(var_coeff, columns=['Features', 'B-Coeff'])
lasso_doubled_coefs['B-Coeff'] = lasso_doubled_coefs['B-Coeff'].abs()
lasso_doubled_coefs = lasso_doubled_coefs.sort_values(by=['B-Coeff'], ascending=False)

# Drop the intercept row by name rather than assuming it sorts first.
top5_lasso_doubled = lasso_doubled_coefs[lasso_doubled_coefs['Features'] != 'constant'].head(5)
print('Top 5 Predictor Variables using Lasso after Doubling the Alpha :', top5_lasso_doubled.values)


`Question 2`

 `You have determined the optimal value of lambda for ridge and lasso regression during the assignment. Now, which one will you choose to apply and why?`

`We have performed both Lasso and Ridge, and it is evident that we should use Lasso over Ridge because:`
      - Ridge does not do feature elimination and takes all the variables into consideration, whereas Lasso does feature elimination: its L1 penalty can shrink coefficients to exactly zero, while Ridge only shrinks coefficients towards zero.
      
      - So Lasso makes the model simpler and removes unnecessary columns, which helps for large datasets.
      
      - So here we are going to use Lasso over Ridge.
      
      

`Question 3`

`After building the model, you realised that the five most important predictor variables in the lasso model are not available in the incoming data. You will now have to create another model excluding the five most important predictor variables. Which are the five most important predictor variables now?`

In [None]:
# Remove the five predictors treated as unavailable from both the train and test feature sets.
unavailable_predictors = [
    'Neighborhood_Crawfor', 'GrLivArea', 'Neighborhood_NridgHt',
    'OverallQual', 'Neighborhood_Somerst',
]
X_train = X_train.drop(unavailable_predictors, axis=1)
X_test = X_test.drop(unavailable_predictors, axis=1)

In [None]:
# Repeat the Lasso grid search on the reduced feature set (same alpha grid,
# 5 folds, negative mean absolute error).
lasso = Lasso()
reduced_search_settings = dict(
    estimator=lasso,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=folds,
    return_train_score=True,
    verbose=1,
)
model_cv = GridSearchCV(**reduced_search_settings)

model_cv.fit(X_train, y_train)


In [None]:
# Cross-validation results of the Lasso search on the reduced feature set
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results[['param_alpha','params','mean_test_score','mean_train_score']]

In [None]:
# Plot the mean train and test scores against alpha for the reduced-feature Lasso search.
cv_results['param_alpha'] = cv_results['param_alpha'].astype('float32')

plt.figure(figsize=(10, 8))
for score_column in ('mean_train_score', 'mean_test_score'):
    plt.plot(cv_results['param_alpha'], cv_results[score_column])

plt.title("Negative Mean Absolute Error and alpha")
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()

In [None]:
# Alpha with the best mean cross-validated score on the reduced feature set
model_cv.best_params_

In [None]:
# Fit Lasso on the reduced feature set and report R2 on train and test.
alpha = 0.0001
lasso = Lasso(alpha=alpha)
lasso.fit(X_train, y_train)

y_pred_lasso_train = lasso.predict(X_train)
train_r2 = r2_score(y_train, y_pred_lasso_train)
print('Train R2 Square : ', round(train_r2, 2))

y_pred_lasso_test = lasso.predict(X_test)
test_r2 = r2_score(y_test, y_pred_lasso_test)
print('Test R2 Square : ', round(test_r2, 2))

In [None]:
# Coefficients of the Lasso model fitted on the reduced feature set, ranked by
# absolute size.
model_parameters = [round(x, 3) for x in [lasso.intercept_] + list(lasso.coef_)]

# Labels MUST come from X_train. The five dropped predictors are still present
# in X, so zipping X.columns with lasso.coef_ shifts every name after the first
# dropped column and pairs coefficients with the wrong features.
coef_labels = X_train.columns.insert(0, "constant")

# Keep only parameters that are non-zero after rounding. The original test
# `abs(x[-1] != 0)` took abs() of the comparison result rather than of the
# coefficient.
var_coeff = [(label, value) for label, value in zip(coef_labels, model_parameters) if value != 0]

# NOTE(review): `df` is kept as the result name because the next cell reads
# it, but it shadows the modelling frame built earlier in the notebook.
df = pd.DataFrame.from_records(var_coeff, columns=['Features', 'B-Coeff'])
df['B-Coeff'] = df['B-Coeff'].abs()
df = df.sort_values(by=['B-Coeff'], ascending=False)

In [None]:
# Drop the intercept row by name, then take the five largest coefficients.
# The original slice df[1:6] assumed the intercept always sorts first.
top5_after_drop = df[df['Features'] != 'constant'].head(5)
print('Top 5 Predictor Variables using Lasso after Droping 5 Important Predictor Variables :', top5_after_drop.values)