In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import boxcox 
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, Lasso
from scipy.special import inv_boxcox
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Split the raw frame by dtype: numeric columns in one frame, string (object) columns in another.
numeric_kinds = ['int64', 'float64']
numeric_df = df.select_dtypes(include=numeric_kinds)
categorical_df = df.select_dtypes(include=['object'])

In [None]:
numeric_df.columns

In [None]:
numeric_df.shape

In [None]:
categorical_df.columns

### EDA on Numerical features

#### HeatMap

In [None]:
# Annotated correlation matrix of all numeric columns.
corr_matrix = numeric_df.corr()
fig, ax = plt.subplots(figsize=(25, 16))
sns.heatmap(corr_matrix, annot=True, ax=ax)
plt.show()

Information:
* Some features have high correlation with each other:
1. GarageCars and GarageArea (0.88)
2. GarageYrBlt and YearBuilt (0.83)
3. TotRmsAbvGrd and GrLivArea (0.83)
4. TotalBsmtSF and 1stflrSF (0.82)
5. 1stflrSF and GrLivArea (0.69)
6. TotRmsAbvGrd and BedroomAbvGrd (0.68)
* One feature from each of these pairs will be dropped after data visualization.

#### Univariate Analysis

In [None]:
# Distribution of every numeric feature except the identifier and the target,
# one panel per feature on a 13 x 3 grid.
plot_features = [c for c in numeric_df.columns if c not in ['Id', 'SalePrice']]
# plt.figure (not plt.subplots) so no extra full-size axes is created behind the grid.
fig = plt.figure(figsize=(12, 21))
for i, feature in enumerate(plot_features, start=1):
    ax = fig.add_subplot(13, 3, i)
    sns.distplot(df[feature], ax=ax)
# Lay the panels out once, after all of them exist.
fig.tight_layout()

Information:
1. 'YearBuilt', 'OverallCond', 'OverallQual', 'BsmtFullBath', 'BsmtHalfBath', 'HalfBath', 'FullBath', 'MoSold', 'Yrsold', 'BsmtQual', 'ExterCond','BsmtExposure', 'BsmtFinType2', 'TotRmsAbvGrd', 'GarageCond', 'GarageQual', 'KitchenAbvGr', 'LowQualFinSF', 'Fireplaces' are categorical.
2. Some features are dominated by a single value: BsmtFinSF2, LowQualFinSF, KitchenAbvGr, EnclosedPorch, PoolArea, ScreenPorch, 3SsnPorch, MiscVal. Such near-constant features carry very little information, so it is better to remove them after EDA.

In [None]:
# Scatter of every numeric feature (identifier and target excluded) against SalePrice.
plot_features = [c for c in numeric_df.columns if c not in ['Id', 'SalePrice']]
# plt.figure (not plt.subplots) so no extra full-size axes is created behind the grid.
fig = plt.figure(figsize=(12, 21))
for i, feature in enumerate(plot_features, start=1):
    ax = fig.add_subplot(13, 3, i)
    # x / y must be passed by keyword: newer seaborn rejects positional data arguments.
    sns.scatterplot(x=df[feature], y=df['SalePrice'], ax=ax)
# Lay the panels out once, after all of them exist.
fig.tight_layout()

Information:
1. The features are a mix of continuous and discrete variables.
2. Features like 'LotFrontage', 'LotArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea' show a positive correlation with SalePrice.
3. MSSubClass, OverallQual, OverallCond, the bathroom-related features, TotRmsAbvGrd, MoSold, YrSold, GarageCars and KitchenAbvGr have discrete values.
4. Year-related features also show a positive correlation with SalePrice.

In [None]:
# Bar plot of SalePrice for each level of the discrete numeric features.
discrete_features = ['MSSubClass','OverallQual','OverallCond','MoSold','YrSold','BsmtFullBath',
                     'FullBath','HalfBath','BedroomAbvGr','TotRmsAbvGrd','Fireplaces']
# plt.figure (not plt.subplots) so no extra full-size axes is created behind the grid.
fig = plt.figure(figsize=(12, 12))
for i, feature in enumerate(discrete_features):
    ax = fig.add_subplot(6, 3, i + 1)
    # x / y must be passed by keyword: newer seaborn rejects positional data arguments.
    sns.barplot(x=df[feature], y=df['SalePrice'], ax=ax)
# Lay the panels out once, after all of them exist.
fig.tight_layout()

Information:
1. 'OverallQual' : More the rating of this feature, more the SalePrice (target variable)
2. 'OverallCond' : SalePrice is highest for rating 5
3. 'MoSold' and 'YrSold': SalePrice does not show a strong trend depending on month and year on which realty is sold
4. 'FullBath' = 3 and 'HalfBath' = 1 have highest SalePrice
5. 'TotRmsAbvGrd' : More the number of total rooms above grade more the Sale Price but after a certain value it decreases.

In [None]:
# Replace each year column by (2022 - year), i.e. a number of years counted back from 2022.
# NOTE: this overwrites the columns in place, so running the cell a second time
# turns the values back into calendar years.
year_features = ['GarageYrBlt', 'YearBuilt', 'YearRemodAdd', 'YrSold']
df[year_features] = 2022 - df[year_features]

In [None]:
# Scatter of the converted year features against SalePrice.
# plt.figure (not plt.subplots) so no extra full-size axes is created behind the grid.
fig = plt.figure(figsize=(12, 12))
for i, feature in enumerate(['GarageYrBlt','YearBuilt','YearRemodAdd', 'YrSold']):
    ax = fig.add_subplot(4, 2, i + 1)
    # x / y must be passed by keyword: newer seaborn rejects positional data arguments.
    sns.scatterplot(x=df[feature], y=df['SalePrice'], ax=ax)
# Lay the panels out once, after all of them exist.
fig.tight_layout()

Information:
1. For most properties the garage was built within the last 20-30 years; SalePrice is higher for recently built garages.
2. SalePrice is higher for recently built houses.
3. Recently remodelled houses (lower value of YearRemodAdd) have higher SalePrice
4. YrSold still does not show any significant trend

### EDA on Categorical columns

In [None]:
categorical_df.columns.sort_values()

In [None]:
# Group the categorical features by theme; each group becomes its own frame for plotting.
bsmt_cols = ['BsmtCond', 'BsmtExposure', 'BsmtFinType1','BsmtFinType2', 'BsmtQual']
garage_cols = ['GarageCond','GarageFinish', 'GarageQual', 'GarageType']
sale_cols = ['SaleCondition', 'SaleType']
exterior_cols = ['ExterCond', 'ExterQual', 'Exterior1st', 'Exterior2nd']
heating_cols = ['Heating', 'HeatingQC','Electrical','FireplaceQu']
land_cols = ['LandContour', 'LandSlope', 'Street', 'Alley','PavedDrive', 'Fence']

bsmt_df, garage_df, sale_df, exterior_df, heating_df, land_df = (
    categorical_df[group]
    for group in (bsmt_cols, garage_cols, sale_cols, exterior_cols, heating_cols, land_cols)
)

#### Basement features

In [None]:
# Frequency of each level of the basement features.
# plt.figure (not plt.subplots) so no extra full-size axes is created behind the grid.
fig = plt.figure(figsize=(10, 10))
for i, col in enumerate(bsmt_df.columns):
    ax = fig.add_subplot(3, 2, i + 1)
    # Passed as x= by keyword: newer seaborn rejects positional data arguments.
    sns.countplot(x=bsmt_df[col], ax=ax)
# Lay the panels out once, after all of them exist.
fig.tight_layout()

Information:
1. In most houses the basement condition (BsmtCond) is average and there is no basement exposure.
2. In most houses the first basement finished area is either good living quarters or unfinished, and the second one is unfinished.
3. Basement quality is mostly average or above.

In [None]:
# SalePrice distribution for each level of the basement features.
# plt.figure (not plt.subplots) so no extra full-size axes is created behind the grid.
fig = plt.figure(figsize=(10, 10))
for i, col in enumerate(bsmt_df.columns):
    ax = fig.add_subplot(3, 2, i + 1)
    # x / y must be passed by keyword: newer seaborn rejects positional data arguments.
    sns.boxplot(x=bsmt_df[col], y=df['SalePrice'], ax=ax)
# Lay the panels out once, after all of them exist.
fig.tight_layout()

Information:
1. Houses with good quality Bsmt Condition, Bsmt exposure, Bsmt Furnishing have high sale price . 
2. Houses with Excellent Basement height have high sale price.  

#### Garage features

In [None]:
# Frequency of each level of the garage features.
# plt.figure (not plt.subplots) so no extra full-size axes is created behind the grid.
fig = plt.figure(figsize=(10, 10))
for i, col in enumerate(garage_df.columns):
    ax = fig.add_subplot(3, 2, i + 1)
    # Passed as x= by keyword: newer seaborn rejects positional data arguments.
    sns.countplot(x=garage_df[col], ax=ax)
# Lay the panels out once, after all of them exist.
fig.tight_layout()

Information:
1. Garage condition and garage quality are average in most houses, and most garages are unfinished.
2. Most houses have an attached garage, followed by a detached one.

In [None]:
# SalePrice distribution for each level of the garage features.
# plt.figure (not plt.subplots) so no extra full-size axes is created behind the grid.
fig = plt.figure(figsize=(10, 10))
for i, col in enumerate(garage_df.columns):
    ax = fig.add_subplot(3, 2, i + 1)
    # x / y must be passed by keyword: newer seaborn rejects positional data arguments.
    sns.boxplot(x=garage_df[col], y=df['SalePrice'], ax=ax)
# Lay the panels out once, after all of them exist.
fig.tight_layout()

Information:
1. Houses with better garage condition, garage quality and garage finish generally have a higher sale price.
2. Sale price also differs by garage type; houses without a garage are at the low end.

#### Sale features

In [None]:
# Frequency of each level of the sale features.
# plt.figure (not plt.subplots) so no extra full-size axes is created behind the grid.
fig = plt.figure(figsize=(10, 10))
for i, col in enumerate(sale_df.columns):
    ax = fig.add_subplot(3, 2, i + 1)
    # Passed as x= by keyword: newer seaborn rejects positional data arguments.
    sns.countplot(x=sale_df[col], ax=ax)
# Lay the panels out once, after all of them exist.
fig.tight_layout()

Information:
1. Mostly houses have a normal sale condition, Sale type as WD:Warranty Deed - Conventional.

In [None]:
# SalePrice distribution for each level of the sale features.
# plt.figure (not plt.subplots) so no extra full-size axes is created behind the grid.
fig = plt.figure(figsize=(10, 10))
for i, col in enumerate(sale_df.columns):
    ax = fig.add_subplot(3, 2, i + 1)
    # x / y must be passed by keyword: newer seaborn rejects positional data arguments.
    sns.boxplot(x=sale_df[col], y=df['SalePrice'], ax=ax)
# Lay the panels out once, after all of them exist.
fig.tight_layout()

Information:
1. Houses with a Partial sale condition (new houses) and the New sale type have a high sale price.

#### Exterior features

In [None]:
# Frequency of each level of the exterior features (horizontal bars).
# plt.figure (not plt.subplots) so no extra full-size axes is created behind the grid.
fig = plt.figure(figsize=(10, 10))
for i, col in enumerate(exterior_df.columns):
    ax = fig.add_subplot(3, 2, i + 1)
    sns.countplot(y=exterior_df[col], ax=ax)
# Lay the panels out once, after all of them exist.
fig.tight_layout()

Information:
1. Most houses have average exterior conditions and quality.
2. Vinyl Siding is mostly used in houses

In [None]:
# SalePrice distribution (horizontal boxes) for each level of the exterior features.
# plt.figure (not plt.subplots) so no extra full-size axes is created behind the grid.
fig = plt.figure(figsize=(12, 12))
for i, col in enumerate(exterior_df.columns):
    ax = fig.add_subplot(3, 2, i + 1)
    # x / y must be passed by keyword: newer seaborn rejects positional data arguments.
    sns.boxplot(x=df['SalePrice'], y=exterior_df[col], ax=ax)
# Lay the panels out once, after all of them exist.
fig.tight_layout()

Information:
1. Houses having excellent exterior conditions and quality have generally higher selling price.
2. Houses with Exterior covering made of Cement Board have high selling price.

#### Interior features

In [None]:
# Frequency of each level of the heating / electrical / fireplace features (horizontal bars).
# plt.figure (not plt.subplots) so no extra full-size axes is created behind the grid.
fig = plt.figure(figsize=(12, 12))
for i, col in enumerate(heating_df.columns):
    ax = fig.add_subplot(3, 2, i + 1)
    sns.countplot(y=heating_df[col], ax=ax)
# Lay the panels out once, after all of them exist.
fig.tight_layout()

Information:
1. Most houses have Gas forced warm air furnace and excellent heating quality with good fireplace.
2. Most houses have Standard Circuit Breakers.

In [None]:
# SalePrice distribution (horizontal boxes) for each level of the heating / electrical / fireplace features.
# plt.figure (not plt.subplots) so no extra full-size axes is created behind the grid.
fig = plt.figure(figsize=(12, 12))
for i, col in enumerate(heating_df.columns):
    ax = fig.add_subplot(3, 2, i + 1)
    # x / y must be passed by keyword: newer seaborn rejects positional data arguments.
    sns.boxplot(x=df['SalePrice'], y=heating_df[col], ax=ax)
# Lay the panels out once, after all of them exist.
fig.tight_layout()

Information:
1. Houses having Gas hot water or steam heat and excellent heating quality have generally higher selling price.
2. Houses having Standard Circuit Breakers, good fireplace have higher selling price.

#### Land features

In [None]:
# Frequency of each level of the land features (horizontal bars).
# plt.figure (not plt.subplots) so no extra full-size axes is created behind the grid.
fig = plt.figure(figsize=(10, 10))
for i, col in enumerate(land_df.columns):
    ax = fig.add_subplot(3, 2, i + 1)
    sns.countplot(y=land_df[col], ax=ax)
# Lay the panels out once, after all of them exist.
fig.tight_layout()

Information:
1. Most houses have level land contours with gentle land slope. Streets having pave are highly common.
2. Paved drive are very common. Fences with min privacy are mostly there in houses.

In [None]:
# SalePrice distribution for each level of the land features.
# plt.figure (not plt.subplots) so no extra full-size axes is created behind the grid.
fig = plt.figure(figsize=(10, 10))
for i, col in enumerate(land_df.columns):
    ax = fig.add_subplot(3, 2, i + 1)
    # x / y must be passed by keyword: newer seaborn rejects positional data arguments.
    sns.boxplot(x=land_df[col], y=df['SalePrice'], ax=ax)
# Lay the panels out once, after all of them exist.
fig.tight_layout()

Information:
1. Houses having level land contours and moderate land slope have generally higher selling price.
2. Houses with street having pave and alley with pave have high selling price.
3. Houses having paved drive with good privacy have higher selling price.

#### Rest Features

In [None]:
# Frequency of each level of the remaining categorical features (horizontal bars).
feats = ['Condition1','Condition2','Utilities','HouseStyle','Functional','Neighborhood', 'CentralAir', 'RoofStyle', 'RoofMatl','MiscFeature','Foundation','BldgType','MSZoning']
# plt.figure (not plt.subplots) so no extra full-size axes is created behind the grid.
fig = plt.figure(figsize=(12, 20))
for i, col in enumerate(feats):
    ax = fig.add_subplot(5, 3, i + 1)
    sns.countplot(y=df[col], ax=ax)
# Lay the panels out once, after all of them exist.
fig.tight_layout()

Information:
1. Most houses have normal conditions with All public Utilities.
2. 1 story house style are most common . Most of them have central AC. 
3. Shed is the most common misc feature in houses. Gable and CompShg are the most common roof style and roof material respectively.
4. Most homes are located near Northwest Ames and College creek.

In [None]:
# SalePrice distribution (horizontal boxes) for each level of the features listed in `feats`.
# plt.figure (not plt.subplots) so no extra full-size axes is created behind the grid.
fig = plt.figure(figsize=(12, 20))
for i, col in enumerate(feats):
    ax = fig.add_subplot(5, 3, i + 1)
    # x / y must be passed by keyword: newer seaborn rejects positional data arguments.
    sns.boxplot(x=df['SalePrice'], y=df[col], ax=ax)
# Lay the panels out once, after all of them exist.
fig.tight_layout()

Information:
1. Two-story houses generally have a higher selling price.
2. Houses with central AC, hip style of roof with Wood Shingles have high selling price.
3. Houses with StoneBr, NridgHt as neighbourhood have high selling price.

### Feature selection and handling missing values

#### Removing one feature from each highly correlated pair identified on the heatmap

In [None]:
# Drop one member of each highly correlated pair.
correlated_features = ['GarageCars', 'GarageYrBlt', 'GrLivArea', 'TotalBsmtSF', 'BedroomAbvGr']
df = df.drop(columns=correlated_features)
print(df.columns.shape)

#### Removing numerical features in which a single value occurs in at least 99% of the rows

In [None]:
# From EDA: among the candidate features below, remove those in which a single
# value accounts for at least 99% of the rows.
toDrop= ['BsmtFinSF2', 'LowQualFinSF', 'KitchenAbvGr', 'EnclosedPorch', '3SsnPorch','ScreenPorch','PoolArea','MiscVal']
for c in toDrop:
    # value_counts sorts by frequency, so the first entry is the share of the most common value.
    top_share = numeric_df[c].value_counts(normalize=True).iloc[0]
    # The membership check keeps the cell re-runnable once a column has already been dropped.
    if top_share >= 0.99 and c in df.columns:
        df.drop(c, axis=1, inplace=True)
print(df.columns.shape)

#### Checking % of missing values

In [None]:
# Percentage of null values for every column that has at least one.
for feat in df.columns:
    if df[feat].isnull().any():
        # Scale to percent before rounding: rounding the fraction first prints float
        # artefacts (e.g. 28.999999999999996) and shows anything below 0.5% as 0.0.
        print(feat, ' : ', round(df[feat].isnull().mean() * 100, 2))

In [None]:
# MasVnrArea is missing in only about 1% of rows, so those rows are dropped.
# Id is dropped as well because it does not contribute towards predicting SalePrice.
df = df.dropna(subset=['MasVnrArea']).drop(columns=['Id'])

In [None]:
electrical_missing = df['Electrical'].isnull().sum()
print(electrical_missing)

# The missing-value count in 'Electrical' is very low, so the affected rows are dropped.
df = df.dropna(subset=['Electrical'])

In [None]:
poolqc_counts = df['PoolQC'].value_counts()
print(poolqc_counts)

# 'PoolQC' is dropped: a very high percentage of its values is missing, and
# imputing them would leave a highly imbalanced feature.
df = df.drop(columns=['PoolQC'])

In [None]:
# In these categorical features NaN denotes a category of its own,
# so it is replaced by an explicit 'NA_<feature>' label.
impute_cat_features = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'FireplaceQu' ,'Fence' ,'MiscFeature']
print(df[impute_cat_features].isnull().sum())
for feat in impute_cat_features:
    # Assign the result back: an in-place fillna on df[feat] acts on a temporary
    # object and leaves df unchanged under pandas Copy-on-Write.
    df[feat] = df[feat].fillna('NA_' + feat)

* 2 types of categorical features are present:
1. Ordered features, to be label encoded: 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond'. ('LotShape', 'Utilities', 'LandSlope', 'HouseStyle', 'Functional' and 'CentralAir' are also ordered, but in this notebook they are one hot encoded together with the unordered features.)

2. Unordered features, to be one hot encoded: 'MSZoning', 'Street', 'Alley', 'LandContour', 'LotConfig', 'Neighborhood', 'Condition1' , 'Condition2', 'BldgType', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'Electrical', 'GarageType','PavedDrive', 'Fence','MiscFeature', 'SaleType','SaleCondition'

#### Encoding Ordinal Categorical Features 

In [None]:
# Level lists in ascending code order: a level's position in its list is its integer code.
quality_levels = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
exposure_levels = ['No', 'Mn', 'Av', 'Gd']
finish_type_levels = ['Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
garage_finish_levels = ['Unf', 'RFn', 'Fin']

ordinal_levels = {
    'ExterQual': quality_levels,
    'ExterCond': quality_levels,
    'BsmtQual': ['NA_BsmtQual'] + quality_levels,
    'BsmtCond': ['NA_BsmtCond'] + quality_levels,
    'BsmtExposure': ['NA_BsmtExposure'] + exposure_levels,
    'BsmtFinType1': ['NA_BsmtFinType1'] + finish_type_levels,
    'BsmtFinType2': ['NA_BsmtFinType2'] + finish_type_levels,
    'HeatingQC': quality_levels,
    'KitchenQual': quality_levels,
    'GarageFinish': ['NA_GarageFinish'] + garage_finish_levels,
    'GarageQual': ['NA_GarageQual'] + quality_levels,
    'GarageCond': ['NA_GarageCond'] + quality_levels,
    'FireplaceQu': ['NA_FireplaceQu'] + quality_levels,
}

# NOTE: values that are not keys of the mapping become NaN, so running this cell
# a second time (when the columns already hold integer codes) wipes the columns.
for feature, levels in ordinal_levels.items():
    codes = {level: code for code, level in enumerate(levels)}
    df[feature] = df[feature].map(codes)

* Information:
1. For 'Alley', Nan means 'No access to alley'
2. For 'BsmtQual', 'BsmtCond', BsmtExposure, BsmtFinType1, BsmtFinType2 Nan means 'No basement'
3. For GarageType, GarageFinish, GarageQual, GarageCond Nan means 'No garage'
4. For 'FireplaceQu' and 'Fence' Nan means 'No Fire place' and 'No fence' respectively
5. MiscFeature - Nan means no additional features mentioned.
* All these features can be imputed by making them into one category in place of missing data.

In [None]:
df.head()

#### One Hot Encoding on Nominal features

In [None]:
# Features to one hot encode; the first level of each feature is dropped (drop_first=True).
unordered_features = [
    'MSZoning', 'Street', 'Alley', 'LandContour', 'LotConfig',
    'Neighborhood', 'Condition1' , 'Condition2', 'BldgType', 'RoofStyle',
    'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation',
    'Heating', 'Electrical', 'GarageType','PavedDrive', 'Fence',
    'MiscFeature', 'SaleType','SaleCondition','LotShape', 'Utilities',
    'LandSlope', 'HouseStyle', 'CentralAir', 'Functional',
]
nominal_frame = df[unordered_features]
dummy_df = pd.get_dummies(nominal_frame, drop_first=True)

In [None]:
dummy_df.shape

In [None]:
dummy_df.head()

In [None]:
# Collect near-constant dummy columns: those in which a single value (0 or 1)
# accounts for at least 98% of the observations.
dummy_cols_drop = []
for feat in dummy_df.columns:
    # Shares are taken over the number of rows. Comparing with == works for
    # integer and boolean dummies alike and cannot fail when a value is absent.
    zero_share = (dummy_df[feat] == 0).mean()
    one_share = (dummy_df[feat] == 1).mean()
    if zero_share >= 0.98 or one_share >= 0.98:
        dummy_cols_drop.append(feat)

print(dummy_cols_drop)
print(len(dummy_cols_drop))

In [None]:
# Remove the dummy columns collected in dummy_cols_drop and show the remaining shape.
dummy_df = dummy_df.drop(columns=dummy_cols_drop)
dummy_df.shape

In [None]:
# Append the dummy variables to the frame, then remove the original
# categorical columns they were derived from.
df = pd.concat([df, dummy_df], axis=1).drop(columns=unordered_features)

In [None]:
df.shape

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns='SalePrice')

In [None]:
X.head()

In [None]:
y = df['SalePrice']
y.head()

In [None]:
sns.distplot(y)
print(y.skew())

#### Log transformation to remove skewness

In [None]:
# Natural log of the target. Despite the "_bc" suffix this is a plain log
# transform, not a Box-Cox transform; predictions are mapped back with np.exp.
y_bc = np.log(y)

In [None]:
sns.distplot(np.log(y))

#### Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Shuffled 80/20 split of the features and the log-transformed target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_bc,
    test_size=0.2,
    shuffle=True,
    random_state=100,
)

In [None]:
X_train.shape

In [None]:
X_train.head()

In [None]:
X_train.head()

#### Outlier Treatment

In [None]:
# Percentage of IQR outliers per numeric feature of the training split.
outliers_percentage={}
numeric_df = X_train.select_dtypes(include=['int64', 'float64'])
# Features skipped by the check (target, discrete and label-encoded ones).
# 'YrSold' was previously misspelled 'Yrsold' and therefore never matched.
cols = ['SalePrice','YearBuilt','OverallCond', 'OverallQual', 'BsmtFullBath', 'BsmtHalfBath', 'HalfBath',
           'FullBath', 'MoSold', 'YrSold', 'BsmtQual', 'ExterCond','BsmtExposure', 'BsmtFinType2','BsmtCond',
            'TotRmsAbvGrd','GarageCond','GarageQual', 'KitchenAbvGr', 'LowQualFinSF','Fireplaces']
for feature in numeric_df.columns:
    if feature in cols:
        continue
    values = numeric_df[feature]
    # Quartiles are computed once per feature and reused for both fences.
    q1 = values.quantile(.25)
    q3 = values.quantile(.75)
    IQR = q3 - q1
    is_outlier = (values > (q3 + 1.5*IQR)) | (values < (q1 - 1.5*IQR))
    outliers_count = is_outlier.sum()
    outliers_percentage[feature]=round(outliers_count/numeric_df.shape[0]*100,2)

outlier_df=pd.DataFrame({'Features':list(outliers_percentage.keys()),'Percentage':list(outliers_percentage.values())})
outlier_df.sort_values(by="Percentage", ascending=False)


In [None]:
# Outlier treatment: cap every feature that has outliers at the IQR fences.
# The fences are computed on the training split only and then applied to both splits.
for feature, percent in outliers_percentage.items():
    if percent > 0:
        q1 = X_train[feature].quantile(.25)
        q3 = X_train[feature].quantile(.75)
        IQR = q3 - q1
        max_value = q3 + 1.5*IQR
        min_value = q1 - 1.5*IQR
        print(feature, IQR, min_value, max_value)
        # NOTE: when IQR is 0 both fences coincide and the feature becomes constant.
        # clip + whole-column assignment replaces the chained indexing
        # (X_train[feature][mask] = ...), which may write to a temporary copy.
        # NaN values are left untouched by clip.
        X_train[feature] = X_train[feature].clip(lower=min_value, upper=max_value)
        X_test[feature] = X_test[feature].clip(lower=min_value, upper=max_value)

Information:
1. Many features have outliers
2. Dropping all the outliers will cause loss of information.
3. Hence reassigning fixed minimum and maximum values to those rows where feature value is outside the range of [25th percentile - 1.5 IQR, 75th percentile + 1.5 IQR]
4. IQR or Inter Quartile Range = Difference between 75th percentile and 25th percentile values of a feature.
5. Target column 'SalePrice' is excluded in this. Some other features are also excluded since those are ordered categorical type which are labelled encoded to numeric form.

In [None]:
#  Checking Null values
def null_values(dataf):
    """Print "<column> : <count>" for every column of ``dataf`` that contains nulls.

    Columns without any null value are skipped. Nothing is returned.
    """
    for feat in dataf.columns:
        missing = dataf[feat].isnull()
        if not missing.any():
            continue
        print(feat + " : " + str(missing.sum()))

In [None]:
null_values(X_train)
print("-----------")
null_values(X_test)

In [None]:
X_train['LotFrontage'].dtype

In [None]:
# Impute missing LotFrontage values with the mean of the training split
# (computed after the outliers were capped). The same training mean is used
# for the test split, so nothing is learned from the test data.
imputed_value = X_train['LotFrontage'].mean()
print("Mean : ",imputed_value)
# Assign the result back: an in-place fillna on X_train['LotFrontage'] acts on a
# temporary object and leaves the frame unchanged under pandas Copy-on-Write.
X_train['LotFrontage'] = X_train['LotFrontage'].fillna(imputed_value)
X_test['LotFrontage'] = X_test['LotFrontage'].fillna(imputed_value)

In [None]:
X_train.head()

In [None]:
X_test.head()

#### Scaling

In [None]:
# Fit the scaler on the training split only, then apply it to both splits.
sscaler = StandardScaler()
train_scaled = sscaler.fit_transform(X_train)
test_scaled = sscaler.transform(X_test)
# The scaled frames keep the column names but get a fresh default index.
X_train_sc = pd.DataFrame(data=train_scaled, columns=X_train.columns)
X_test_sc = pd.DataFrame(data=test_scaled, columns=X_test.columns)

In [None]:
# Folds for GridSearchCV: a 4-fold splitter, passed as `cv` to the grid search
# inside ModelTrainAndEvaluaion.
folds = KFold(n_splits = 4)

### Modelling, hyperparameter tuning for Ridge and Lasso, evaluation

In [None]:
def ModelTrainAndEvaluaion(model, params):
    """Tune ``alpha`` by grid search, then report and plot the fit of the best model.

    Reads the notebook-level ``X_train_sc``, ``X_test_sc``, ``y_train``,
    ``y_test`` and ``folds``.

    Parameters
    ----------
    model : estimator
        Unfitted regressor with an ``alpha`` hyper-parameter (the notebook
        passes ``Ridge()`` and ``Lasso()``).
    params : dict
        Grid handed to ``GridSearchCV`` as ``param_grid``; must contain the
        key ``"alpha"``.

    Returns
    -------
    estimator
        The best estimator found by the search, fitted on the full training split.

    Side effects
    ------------
    Prints the selected alpha and the train/test R2 and RMSE, and draws two
    figures: residual diagnostics and the CV score as a function of alpha.
    """
    model_cv = GridSearchCV(estimator = model,
                              param_grid = params,
                              scoring= 'r2',
                              cv = folds,
                              return_train_score=True,
                              verbose = 1)
    model_cv.fit(X_train_sc, y_train)
    alpha = model_cv.best_params_["alpha"]
    print("Optimum alpha for %s is %f" %(model, alpha))
    # GridSearchCV (refit=True by default) has already refitted the best
    # estimator on the whole training split, so no further fit is needed.
    final_model = model_cv.best_estimator_

    y_train_pred = final_model.predict(X_train_sc)
    y_test_pred = final_model.predict(X_test_sc)
    train_residuals = y_train - y_train_pred

    # Model Evaluation
    print('R2 score (train) : ',round(r2_score(y_train,y_train_pred),4))
    print('R2 score (test) : ',round(r2_score(y_test,y_test_pred),4))
    print('RMSE (train) : ', round(np.sqrt(mean_squared_error(y_train, y_train_pred)),4))
    print('RMSE (test) : ', round(np.sqrt(mean_squared_error(y_test, y_test_pred)),4))

    # Residual diagnostics on the training split.
    fig, axes = plt.subplots(1, 3, figsize=(15,3))
    fig.suptitle('Assumtions of Linear Regression')
    sns.distplot(train_residuals, ax = axes[0])
    axes[0].set_title('Distribution of Residuals')
    # x / y must be passed by keyword: newer seaborn rejects positional data arguments.
    sns.scatterplot(x=y_train_pred, y=y_train, ax = axes[1])
    axes[1].set_title('Plot of y vs y_predicted')
    # y_train carries the row labels of the training split, so the global
    # X_train is not needed for the x axis.
    sns.scatterplot(x=y_train.index, y=train_residuals, ax = axes[2])
    axes[2].set_title('Homoscedasticity/ Variance')
    fig.tight_layout()

    cv_results = pd.DataFrame(model_cv.cv_results_)
    # param_alpha is stored as objects; cast it so it can be used as a numeric x axis.
    cv_results['param_alpha'] = cv_results['param_alpha'].astype('float32')

    # Mean cross-validation train and test scores against alpha.
    fig = plt.figure(figsize=(7,3))
    plt.plot(cv_results['param_alpha'], cv_results['mean_train_score'])
    plt.plot(cv_results['param_alpha'], cv_results['mean_test_score'])
    plt.xlabel('alpha')
    plt.ylabel('r2')

    plt.title("r2 and alpha")
    plt.legend(['train score', 'test score'], loc='upper left')
    plt.show()
    return final_model

In [None]:
# Ridge alpha grid, built from three ranges of increasing step size.
fine_alphas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
unit_alphas = [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9, 10]
coarse_alphas = [20, 30, 40, 50, 60, 70, 80, 90, 100, 200]
params = {'alpha': fine_alphas + unit_alphas + coarse_alphas}
ridge = ModelTrainAndEvaluaion(Ridge(), params)

In [None]:
# Satisfying all the assumption of Linear Regression

In [None]:
# Lasso alpha grid, from 0.0001 up to 1.
lasso_alphas = [0.0001, 0.00025, 0.0005, 0.00065, 0.001, 0.00125, 0.0025, 0.005, 0.01, 0.1, 1]
params = dict(alpha=lasso_alphas)
lasso = ModelTrainAndEvaluaion(Lasso(), params)

In [None]:
# Satisfying all the assumption of Linear Regression

### Comparing Model Coefficients

In [None]:
# Coefficients of both tuned models side by side, one row per feature.
# (The former `model_coefficients.rows = ...` line only attached an unused
# attribute to the frame and has been removed; the index already holds the names.)
model_coefficients = pd.DataFrame(
    {'Ridge': ridge.coef_, 'Lasso': lasso.coef_},
    index=X_test_sc.columns,
)
# Show every row of the frames displayed from here on.
pd.set_option('display.max_rows', None)
print(len(model_coefficients))
model_coefficients

### Final Model 

In [None]:
# Features for final model. Choosing Lasso since it gives better r2 score and less RMSE.
# Keep only the features with a non-zero Lasso coefficient, largest magnitude first.
lasso_selected = model_coefficients.loc[model_coefficients['Lasso'] != 0, ['Lasso']]
print("Total no of features : ", lasso_selected.shape[0])
model_coef = lasso_selected.sort_values(by='Lasso', ascending=False, key=abs)
model_coef

In [None]:
# Top 10 features: the first ten rows of model_coef.
TOP_10 = model_coef.head(10)
TOP_10

In [None]:
# Predicted values of Dependent feature (Sale Price)
# The model was trained on log(SalePrice), so predictions come out on the log scale.
y_prediction = lasso.predict(X_test_sc)
# np.exp inverts the log transform (the "invbc" suffix notwithstanding, no Box-Cox is involved).
y_test_pred_invbc = np.exp(y_prediction)
y_test_pred_invbc[:5]