### Importing the basic libraries

In [None]:
#importing basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression

### loading the dataset

In [None]:
# Load the training data into the working frame `house`, which later cells modify.
house= pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')

In [None]:
house.head()

### Data understanding

In [None]:
house.shape

In [None]:
house.info()

In [None]:
house.describe()

In [None]:
# Percentage of missing values per column, largest first.
missing_pct = house.isnull().sum().mul(100).div(len(house.index)).round(2)
missing_pct.sort_values(ascending=False)

In [None]:
# Columns in which more than 45% of the values are missing.
null_share_pct = house.isnull().sum() * 100 / len(house.index)
house.columns[null_share_pct > 45]

### Missing value treatment

In [None]:
# Per the data dictionary, NA in these columns means the feature is absent,
# so each NA is replaced by an explicit category label.
for column, label in [('PoolQC', 'No Pool'),
                      ('Fence', 'No Fence'),
                      ('MiscFeature', 'none')]:
    house[column] = house[column].fillna(label)

In [None]:
# Per the data dictionary, NA means "no alley access" / "no fireplace".
house['Alley'] = house['Alley'].fillna('No alley access')
house['FireplaceQu'] = house['FireplaceQu'].fillna('No Fireplace')

In [None]:
# Per the data dictionary, NA in any of the basement descriptors means 'No Basement'.
for column in ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']:
    house[column] = house[column].fillna('No Basement')

In [None]:
# Missing masonry veneer type is treated as 'none' ...
house['MasVnrType'] = house['MasVnrType'].fillna('none')
# ... and a missing veneer area is set to 0.
house['MasVnrArea'] = house['MasVnrArea'].fillna(0)

In [None]:
# Share of missing LotFrontage values (in percent) before imputation.
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
lot_frontage_missing_pct = 100 * house['LotFrontage'].isnull().sum() / len(house.index)
print(lot_frontage_missing_pct)

# Replace the missing values with the column mean.
# The result is assigned back to the column instead of calling
# `house["LotFrontage"].replace(..., inplace=True)`: an in-place call on a column
# selection is not guaranteed to modify `house` itself (under pandas
# Copy-on-Write it leaves the frame untouched).
house['LotFrontage'] = house['LotFrontage'].fillna(house['LotFrontage'].mean())

In [None]:
# Per the data dictionary, NA in the garage descriptors means 'No Garage'.
for column in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
    house[column] = house[column].fillna('No Garage')

In [None]:
# Fill missing Electrical with the hard-coded value "SBrkr"
# (stated by the author to be the mode of the column).
house['Electrical'] = house['Electrical'].fillna("SBrkr")

In [None]:
# Missing GarageYrBlt is set to 2019 so that the garage age derived later
# (2019 - GarageYrBlt) comes out as 0 for those rows.
house['GarageYrBlt'] = house['GarageYrBlt'].fillna(2019)

In [None]:
# Check whether any column still contains missing values.
remaining_null_pct = 100 * house.isnull().sum() / len(house.index)
house.columns[remaining_null_pct > 0]

In [None]:
house.shape

In [None]:
# Drop exact duplicate rows, if any, and show the resulting shape.
house=house.drop_duplicates()
house.shape

In [None]:
house['SalePrice'].describe()

In [None]:
# The Id column is only a row identifier, so it is removed before model building.
house = house.drop(columns=['Id'])

In [None]:
# Derived variables.
# Ages are measured against a fixed reference year of 2019.
reference_year = 2019
house = house.assign(
    # total square footage = basement + first floor + second floor
    TotalSF=house['TotalBsmtSF'] + house['1stFlrSF'] + house['2ndFlrSF'],
    # house age = 2019 - year built
    house_age=reference_year - house['YearBuilt'],
    # garage age = 2019 - year the garage was built
    garage_age=reference_year - house['GarageYrBlt'],
    # years between construction and remodelling
    gap_between_build_remodel=house['YearRemodAdd'] - house['YearBuilt'],
)

In [None]:
# These columns hold numeric codes but are treated as categorical from here on,
# so they are cast to object dtype in one step.
house = house.astype({
    'MSSubClass': 'object',
    'OverallCond': 'object',
    'YrSold': 'object',
    'MoSold': 'object',
})

In [None]:
# Split the variables into numeric and categorical groups.
# Only float64/int64 columns are selected here, so the columns cast to object
# in the previous cell are not part of the numeric subset.
house_numeric=house.select_dtypes(include=['float64','int64'])
house_numeric.head()

In [None]:
house_numeric.columns

In [None]:
house_categorical=house.select_dtypes(include=['object'])
house_categorical.columns

In [None]:
print(len(house_categorical.columns))
print(len(house_numeric.columns))

In [None]:
# correlation matrix
corr = house_numeric.corr()
corr

In [None]:
# Annotated heatmap of the numeric correlation matrix.
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(corr, cmap="coolwarm", annot=True, ax=ax)
plt.show()

In [None]:
# Pairwise scatter plots for a hand-picked set of numeric variables.
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt',
        'KitchenAbvGr','Fireplaces','WoodDeckSF','PoolArea','TotalSF']
# `pairplot` builds its own figure, so no plt.figure() call is made beforehand
# (it would only leave an empty extra figure). The facet size is given via
# `height`, which replaced the former `size` argument in seaborn.
sns.pairplot(house[cols], height=2.5)
plt.show()

### Univariate analysis

In [None]:
# Distribution plot of the target variable SalePrice.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; histplot/displot
# are the documented replacements. Confirm the installed seaborn version before switching.
sns.distplot(house['SalePrice'])

In [None]:
# Candidate columns noted by the author as not following a normal distribution:
# 'GarageYrBlt','YearBuilt','YrSold','YearRemodAdd','MoSold','2ndFlrSF','BsmtFullBath',
# 'FullBath','HalfBath','Fireplaces','GarageCars'.
# Only three of them are removed from the numeric subset here.
house_numeric = house_numeric.drop(columns=['YearBuilt', 'YearRemodAdd', 'Fireplaces'])
house_numeric.head()

In [None]:
house_numeric.shape

### Outlier Analysis

In [None]:
# Violin plots (3 x 3 grid) for a first group of numeric variables.
violin_cols = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtUnfSF', 'TotalSF',
               '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea']
plt.figure(figsize=(24, 12))
for position, column in enumerate(violin_cols, start=1):
    plt.subplot(3, 3, position)
    # The series is passed explicitly as `x`: the meaning of the first positional
    # argument differs between seaborn versions. The former fill='#A4A4A4' argument
    # is dropped because a hex colour is not a valid violinplot `fill` value;
    # `color` alone sets the violin colour.
    sns.violinplot(x=house[column], color="red")
plt.show()

In [None]:
# Violin plots (3 x 3 grid) for a second group of numeric variables, including the target.
violin_cols = ['GrLivArea', 'TotRmsAbvGrd', 'house_age', 'garage_age', 'PoolArea',
               'MiscVal', 'EnclosedPorch', 'GarageArea', 'SalePrice']
plt.figure(figsize=(24, 12))
for position, column in enumerate(violin_cols, start=1):
    plt.subplot(3, 3, position)
    # The series is passed explicitly as `x`: the meaning of the first positional
    # argument differs between seaborn versions. The former fill='#A4A4A4' argument
    # is dropped because a hex colour is not a valid violinplot `fill` value;
    # `color` alone sets the violin colour.
    sns.violinplot(x=house[column], color="red")
plt.show()

In [None]:
# The data set is small, so outliers are treated for a few variables only.
# LotFrontage: keep rows within [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (bounds inclusive).
lot_frontage = house['LotFrontage']
q1 = lot_frontage.quantile(0.25)
q3 = lot_frontage.quantile(0.75)
whisker = 1.5 * (q3 - q1)
house = house[lot_frontage.between(q1 - whisker, q3 + whisker)]

In [None]:
# LotArea: keep rows within [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (bounds inclusive).
lot_area = house['LotArea']
q1 = lot_area.quantile(0.25)
q3 = lot_area.quantile(0.75)
whisker = 1.5 * (q3 - q1)
house = house[lot_area.between(q1 - whisker, q3 + whisker)]

In [None]:
# PoolArea: keep rows within [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (bounds inclusive).
# NOTE(review): if Q1 equals Q3 the IQR is 0 and only rows equal to that single
# value survive, which leaves the column constant. Confirm this is intended.
pool_area = house['PoolArea']
q1 = pool_area.quantile(0.25)
q3 = pool_area.quantile(0.75)
whisker = 1.5 * (q3 - q1)
house = house[pool_area.between(q1 - whisker, q3 + whisker)]

In [None]:
# MiscVal: keep rows within [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (bounds inclusive).
# NOTE(review): if Q1 equals Q3 the IQR is 0 and only rows equal to that single
# value survive, which leaves the column constant. Confirm this is intended.
misc_val = house['MiscVal']
q1 = misc_val.quantile(0.25)
q3 = misc_val.quantile(0.75)
whisker = 1.5 * (q3 - q1)
house = house[misc_val.between(q1 - whisker, q3 + whisker)]

In [None]:
house.shape

In [None]:
# Separate the target (SalePrice) from the predictors.
y = house['SalePrice']
X = house.drop(columns=['SalePrice'])

### Converting the categorical columns by creating dummy variables

In [None]:
# Encode CentralAir as a binary value (Y -> 1, N -> 0).
# The mapping is applied to X, the feature frame that feeds the model. X was
# already derived from `house` in the previous cell, so mapping the column on
# `house` at this point would never reach the model features.
X["CentralAir"] = X["CentralAir"].map({'Y': 1, "N": 0})

In [None]:
#lets include the categorical columns
house_categorical_df=X.select_dtypes(include=['object'])

In [None]:
house_categorical_df.columns

In [None]:
# Create dummy (one-hot) variables for the categorical columns.
# drop_first=True omits the first level of each variable.
house_df_dummies = pd.get_dummies(house_categorical_df, drop_first=True)
house_df_dummies.head()

In [None]:
# The original categorical columns are replaced by their dummies, so remove them from X.
X = X.drop(columns=house_categorical_df.columns)

In [None]:
# Append the dummy columns to the remaining columns of X.
X = pd.concat((X, house_df_dummies), axis='columns')

In [None]:
X.shape

In [None]:
# Standardise every column of X (zero mean, unit variance).
# NOTE(review): the scaler is fitted on all rows here, before the train/test split
# made in a later cell, so the test rows influence the scaling statistics.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scale_var = X.columns
scaled_values = scaler.fit_transform(X[scale_var])
X[scale_var] = scaled_values

In [None]:
X.describe()

### Model Building

In [None]:
# 70/30 train/test split with a fixed seed so the split is reproducible.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Baseline model: ordinary linear regression on all features.
from sklearn import metrics

lm = LinearRegression().fit(X_train, y_train)

# R2 on the training split
y_train_pred = lm.predict(X_train)
metrics.r2_score(y_train, y_train_pred)

In [None]:
# R2 of the baseline model on the held-out test split
y_test_pred = lm.predict(X_test)
metrics.r2_score(y_test, y_test_pred)

In [None]:
lm.intercept_

In [None]:
# Pair every feature name with its rounded coefficient; the intercept is listed
# first under the label "constant".
model_parameters = [round(value, 3) for value in [lm.intercept_, *lm.coef_]]
cols = X.columns.insert(0, "constant")
list(zip(cols, model_parameters))

It is clearly visible that the model is overfit, since the training accuracy is very high compared to the accuracy on the test data.

In [None]:
#Importing the lasso and ridge regressions
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

In [None]:
# Lasso with a fixed, untuned alpha of 0.001.
lm = Lasso(alpha=0.001).fit(X_train, y_train)

# R2 on the train split, then on the test split
y_train_pred = lm.predict(X_train)
y_test_pred = lm.predict(X_test)
print(metrics.r2_score(y_train, y_train_pred))
print(metrics.r2_score(y_test, y_test_pred))

In [None]:
# Coefficients of the untuned lasso model, intercept first under the label "constant".
model_parameters = [round(value, 3) for value in [lm.intercept_, *lm.coef_]]
cols = X.columns.insert(0, "constant")
list(zip(cols, model_parameters))

Let's tune the hyperparameter.

In [None]:
# Tune the lasso alpha with 5-fold cross-validation, scored by R2.
from sklearn.model_selection import KFold

folds = KFold(n_splits=5, shuffle=True, random_state=4)

# candidate alpha values
alpha_grid = (
    [0.0001, 0.001, 0.01, 0.05]
    + [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    + [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
    + [20, 50, 100, 200]
)
params = {'alpha': alpha_grid}

model = Lasso()
search_options = dict(param_grid=params, scoring='r2', cv=folds,
                      return_train_score=True, verbose=1)
model_cv = GridSearchCV(estimator=model, **search_options)
model_cv.fit(X_train, y_train)

In [None]:
# results data frame 
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results.head()

In [None]:
model_cv.best_params_

In [None]:
# Mean train/test R2 against alpha, used to choose the optimal alpha.
cv_results['param_alpha'] = cv_results['param_alpha'].astype('float32')
alphas = cv_results['param_alpha']
plt.plot(alphas, cv_results['mean_train_score'], label='train score')
plt.plot(alphas, cv_results['mean_test_score'], label='test score')
plt.xlabel('alpha')
plt.ylabel('r2 score')
plt.title("r2 score and alpha")
plt.legend(loc='upper left')
plt.show()

In [None]:
# Tune the lasso alpha again, this time scored by negative mean absolute error.
alpha_grid = (
    [0.0001, 0.001, 0.01, 0.05]
    + [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    + [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
    + [20, 50, 100, 200]
)
params = {'alpha': alpha_grid}
lasso = Lasso()

search_options = dict(param_grid=params, scoring='neg_mean_absolute_error', cv=folds,
                      return_train_score=True, verbose=1)
model_cv = GridSearchCV(estimator=lasso, **search_options)

model_cv.fit(X_train, y_train)

In [None]:
# results data frame
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results.head()

In [None]:
model_cv.best_params_

In [None]:
# Mean train/test negative MAE against alpha.
cv_results['param_alpha'] = cv_results['param_alpha'].astype('float32')
alphas = cv_results['param_alpha']
plt.plot(alphas, cv_results['mean_train_score'], label='train score')
plt.plot(alphas, cv_results['mean_test_score'], label='test score')
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')
plt.title("Negative Mean Absolute Error and alpha")
plt.legend(loc='upper left')
plt.show()

In [None]:
# Lasso refitted with the alpha chosen as optimal (200).
lm = Lasso(alpha=200).fit(X_train, y_train)

# R2 on the train split, then on the test split
y_train_pred = lm.predict(X_train)
y_test_pred = lm.predict(X_test)
print(metrics.r2_score(y_train, y_train_pred))
print(metrics.r2_score(y_test, y_test_pred))

In [None]:
# Coefficients of the lasso model with alpha = 200, keyed by feature name
# (the intercept is stored under "constant") and displayed sorted by value.
model_parameters = [round(value, 3) for value in [lm.intercept_, *lm.coef_]]
cols = X.columns.insert(0, "constant")
final_var = dict(zip(cols, model_parameters))
dict(sorted(final_var.items(), key=lambda pair: pair[1]))

With the optimal value of alpha = 200 we got R2 values of 0.94 and 0.91 for the training and test data respectively, which is a good indication that the model is not overfit.

### Ridge regression

In [None]:
# Ridge with a fixed, untuned alpha of 0.001.
lm = Ridge(alpha=0.001).fit(X_train, y_train)

# R2 on the train split, then on the test split
y_train_pred = lm.predict(X_train)
y_test_pred = lm.predict(X_test)
print(metrics.r2_score(y_train, y_train_pred))
print(metrics.r2_score(y_test, y_test_pred))

In [None]:
# Tune the ridge alpha with 5-fold cross-validation, scored by R2.
params = {'alpha': [0.001, 0.01, 1.0, 5.0, 10.0]}
folds = 5
ridge = Ridge()

search_options = dict(param_grid=params, scoring='r2', cv=folds,
                      return_train_score=True, verbose=1)
model_cv = GridSearchCV(estimator=ridge, **search_options)
model_cv.fit(X_train, y_train)

In [None]:
# results data frame
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results.head()

In [None]:
model_cv.best_params_

In [None]:
# Mean train/test R2 against alpha, used to tune the ridge hyperparameter.
# The alphas are cast to float, not int: an integer cast truncates 0.001 and
# 0.01 to 0, so distinct grid points would be drawn at the same x position.
cv_results['param_alpha'] = cv_results['param_alpha'].astype('float64')
plt.plot(cv_results['param_alpha'], cv_results['mean_train_score'])
plt.plot(cv_results['param_alpha'], cv_results['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('r2 score')
plt.title("r2 score and alpha")
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()

In [None]:
# Tune the ridge alpha with 5-fold cross-validation, scored by negative mean absolute error.
params = {'alpha': [0.001, 0.01, 1.0, 5.0, 10.0]}
folds = 5
ridge = Ridge()

search_options = dict(param_grid=params, scoring='neg_mean_absolute_error', cv=folds,
                      return_train_score=True, verbose=1)
model_cv = GridSearchCV(estimator=ridge, **search_options)
model_cv.fit(X_train, y_train)

In [None]:
# results data frame
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results.head()

In [None]:
model_cv.best_params_

In [None]:
# Mean train/test negative MAE against alpha, used to tune the ridge hyperparameter.
# The alphas are cast to float, not int: an integer cast truncates 0.001 and
# 0.01 to 0, so distinct grid points would be drawn at the same x position.
cv_results['param_alpha'] = cv_results['param_alpha'].astype('float64')
plt.plot(cv_results['param_alpha'], cv_results['mean_train_score'])
plt.plot(cv_results['param_alpha'], cv_results['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')
plt.title("Negative Mean Absolute Error and alpha")
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()

In [None]:
# Ridge refitted with the alpha chosen as optimal (10).
lm = Ridge(alpha=10).fit(X_train, y_train)

# R2 on the train split, then on the test split
y_train_pred = lm.predict(X_train)
y_test_pred = lm.predict(X_test)
print(metrics.r2_score(y_train, y_train_pred))
print(metrics.r2_score(y_test, y_test_pred))

Building the ridge regression after the variable shrinkage done by lasso

In [None]:
# Keep only the variables whose lasso coefficient is non-zero.
# `cols` and `final_var` come from the lasso (alpha = 200) coefficient cell:
# `cols` holds the feature names with "constant" first, and `final_var` maps each
# of those names to its coefficient rounded to 3 decimals.
# The selection is derived from those coefficients instead of a hand-copied list
# of column names: such a list goes stale whenever the data or the fit changes,
# and Index.drop raises KeyError as soon as one listed dummy column is absent.
# "constant" is always kept here; a later cell removes it explicitly.
X_new = pd.Index([name for name in cols
                  if name == 'constant' or final_var[name] != 0])

In [None]:
#lets look at the length of the variables after elimination by lasso regression
len(X_new)

In [None]:
# Turn the selected names into a plain list and remove the "constant" entry,
# which is not a column of X.
reduced_cols = list(X_new)
del reduced_cols[reduced_cols.index('constant')]

In [None]:
#creating the dataframe of the varibles extracted above
X_NEW=X[reduced_cols]

In [None]:
X_NEW.head()

In [None]:
# Repeat the 70/30 split (same seed) on the reduced feature set.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X_NEW, y, train_size=0.7, test_size=0.3, random_state=100)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Tune the ridge alpha on the reduced feature set (5-fold CV, scored by R2).
params = {'alpha': [0.001, 0.01, 1.0, 5.0, 10.0]}
folds = 5
ridge = Ridge()

search_options = dict(param_grid=params, scoring='r2', cv=folds,
                      return_train_score=True, verbose=1)
model_cv = GridSearchCV(estimator=ridge, **search_options)
model_cv.fit(X_train, y_train)

In [None]:
# results data frame
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results.head()

In [None]:
model_cv.best_params_

In [None]:
# Mean train/test R2 against alpha for the ridge model on the reduced feature set.
# The alphas are cast to float, not int: an integer cast truncates 0.001 and
# 0.01 to 0, so distinct grid points would be drawn at the same x position.
cv_results['param_alpha'] = cv_results['param_alpha'].astype('float64')
plt.plot(cv_results['param_alpha'], cv_results['mean_train_score'])
plt.plot(cv_results['param_alpha'], cv_results['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('r2 score')
plt.title("r2 score and alpha")
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()

In [None]:
# Tune the ridge alpha on the reduced feature set (5-fold CV, scored by negative MAE).
params = {'alpha': [0.001, 0.01, 1.0, 5.0, 10.0]}
folds = 5
ridge = Ridge()

search_options = dict(param_grid=params, scoring='neg_mean_absolute_error', cv=folds,
                      return_train_score=True, verbose=1)
model_cv = GridSearchCV(estimator=ridge, **search_options)
model_cv.fit(X_train, y_train)

In [None]:
# results data frame
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results.head()

In [None]:
model_cv.best_params_

In [None]:
# Mean train/test negative MAE against alpha for the ridge model on the reduced feature set.
# The alphas are cast to float, not int: an integer cast truncates 0.001 and
# 0.01 to 0, so distinct grid points would be drawn at the same x position.
cv_results['param_alpha'] = cv_results['param_alpha'].astype('float64')
plt.plot(cv_results['param_alpha'], cv_results['mean_train_score'])
plt.plot(cv_results['param_alpha'], cv_results['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')
plt.title("Negative Mean Absolute Error and alpha")
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()

In [None]:
# Final ridge model on the reduced feature set, with the alpha chosen as optimal (10).
lm = Ridge(alpha=10).fit(X_train, y_train)

# R2 on the train split, then on the test split
y_train_pred = lm.predict(X_train)
y_test_pred = lm.predict(X_test)
print(metrics.r2_score(y_train, y_train_pred))
print(metrics.r2_score(y_test, y_test_pred))

In [None]:
# Build a dictionary {feature name: coefficient} for the final ridge model,
# with the intercept stored under "constant".
model_parameters = list(lm.coef_)
model_parameters.insert(0, lm.intercept_)
model_parameters = [round(x, 3) for x in model_parameters]
# The names must come from the columns the model was fitted on (X_train, i.e.
# the reduced feature set). Zipping with the full X.columns would pair the
# coefficients with the wrong feature names, because X still contains the
# columns that were removed after the lasso step.
cols = X_train.columns
cols = cols.insert(0, "constant")
final_var=dict(zip(cols, model_parameters))

In [None]:
# Show the coefficients ordered from the smallest to the largest value.
dict(sorted(final_var.items(), key=lambda pair: pair[1]))

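#### These are the top features in the ridge regression after using the non-zero coefficient variables from the lasso regression.

Note: LotFrontage, GarageArea and 1stFlrSF appear in this list although they are among the variables removed after the lasso step, so the list should be regenerated and re-checked against the final model's coefficients.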

GrLivArea,
LotFrontage,
OverallQual,
LotArea,
GarageArea,
Neighborhood_Crawfor,
MSSubClass_45,
MasVnrArea,
1stFlrSF,
ExterQual,
Exterior1st_CBlock,
Exterior1st_CemntBd,
Foundation_Stone,
Foundation_Wood,
Exterior1st_WdShing,
WoodDeckSF,
Exterior2nd_AsphShn,
GarageType_Basment,
Exterior2nd_BrkFace,


With the optimal value of alpha = 10 for ridge regression on the variables selected by lasso regression, we got the following scores for the train and test data sets. R2 for train: 0.9445110204167058. R2 for test: 0.9132677230470666.

This score looks good and we can conclude that model is not overfitting. And this score is pretty close to previous ridge regression model.

Optimal value of alpha of lasso regression is 200 and r2 score for optimal value of alpha is given below R2 score for train : 0.941828259003052 R2 score for test : 0.9194833870877548

Optimal value of alpha of ridge regression is 10 and r2 score for optimal value of alpha is given below R2 score for train : 0.9464820405083926 R2 score for test : 0.9075797172829855 

The optimal value of alpha is 10 for ridge regression on the variables selected by lasso regression, and the R2 scores for this value of alpha are given below. R2 score for train: 0.9445110204167058. R2 score for test: 0.9132677230470666. Lasso has successfully reduced the number of variables by shrinking some variable coefficients to 0.


In [None]:
# Load the competition test set. The submission must be built for these rows;
# the previous path pointed at train.csv.
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
def build_model_features(raw_df, feature_columns, lot_frontage_fill):
    """Apply the notebook's cleaning and encoding steps to a raw data frame.

    Parameters
    ----------
    raw_df : DataFrame with the raw competition columns.
    feature_columns : column index of the training feature matrix; the result
        has exactly these columns, in this order.
    lot_frontage_fill : value used for missing LotFrontage entries.

    Returns
    -------
    DataFrame of unscaled model features aligned to `feature_columns`.
    """
    df = raw_df.copy()

    # Same "feature absent" labels as used for the training data.
    absent_labels = {
        'PoolQC': 'No Pool', 'Fence': 'No Fence', 'MiscFeature': 'none',
        'Alley': 'No alley access', 'FireplaceQu': 'No Fireplace',
        'BsmtQual': 'No Basement', 'BsmtCond': 'No Basement',
        'BsmtExposure': 'No Basement', 'BsmtFinType1': 'No Basement',
        'BsmtFinType2': 'No Basement', 'MasVnrType': 'none',
        'GarageType': 'No Garage', 'GarageFinish': 'No Garage',
        'GarageQual': 'No Garage', 'GarageCond': 'No Garage',
        'Electrical': 'SBrkr',
    }
    df = df.fillna(value=absent_labels)
    df['MasVnrArea'] = df['MasVnrArea'].fillna(0)
    df['GarageYrBlt'] = df['GarageYrBlt'].fillna(2019)
    df['LotFrontage'] = df['LotFrontage'].fillna(lot_frontage_fill)

    # Any numeric value that is still missing is set to 0 so the model receives no NaN.
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_cols] = df[numeric_cols].fillna(0)

    # Same derived variables as for the training data.
    df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
    df['house_age'] = 2019 - df['YearBuilt']
    df['garage_age'] = 2019 - df['GarageYrBlt']
    df['gap_between_build_remodel'] = df['YearRemodAdd'] - df['YearBuilt']

    # Same categorical casts as for the training data.
    for column in ['MSSubClass', 'OverallCond', 'YrSold', 'MoSold']:
        df[column] = df[column].astype('object')

    # CentralAir is numeric only if the training matrix holds it as a plain column;
    # otherwise it is dummy-encoded together with the other categorical columns.
    if 'CentralAir' in feature_columns:
        df['CentralAir'] = df['CentralAir'].map({'Y': 1, 'N': 0})

    # Encode every level, then align to the training columns: levels not seen in
    # training are discarded and training levels absent here become all-zero columns.
    categorical = df.select_dtypes(include=['object'])
    encoded = pd.concat([df.drop(columns=categorical.columns),
                         pd.get_dummies(categorical)], axis=1)
    return encoded.reindex(columns=feature_columns, fill_value=0)


# The submission needs one predicted SalePrice per row of `test`. `final_var` is
# the dictionary of model coefficients and cannot be used as the prediction column.
test_features = build_model_features(test, X.columns, house['LotFrontage'].mean())

# Scale with the scaler fitted earlier on the training features.
test_scaled = pd.DataFrame(scaler.transform(test_features),
                           columns=X.columns, index=test.index)

# Predict with the final ridge model, which was fitted on the reduced column set.
predictions = lm.predict(test_scaled[reduced_cols])

my_submission = pd.DataFrame({'Id': test['Id'], 'SalePrice': predictions})
my_submission.to_csv('submission.csv', index=False)