In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib.gridspec as gridspec
from scipy import stats
import matplotlib.style as style
import math

%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from sklearn.linear_model import LinearRegression
from sklearn import ensemble, tree, linear_model
from scipy.stats import skew
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
import missingno as msno

#Model Train
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
from lightgbm import LGBMRegressor

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
os.listdir('/kaggle/input/house-prices-advanced-regression-techniques')

# 📖 Read Files

In [None]:
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# **📊 A. Exploratory Data Analysis**

## 1. General Data Statistics

In [None]:
train.shape, test.shape

In [None]:
train.info()

In [None]:
train.isnull().sum().sum()

In [None]:
train.describe()

In [None]:
train.describe(include='O')

## 2. Value counts for discrete and categorical features

In [None]:
def column_unique(col_list, df=None):
    """Print a one-line dtype/cardinality summary for each requested column.

    Parameters
    ----------
    col_list : iterable of str
        Names of the columns to describe. Only these columns are reported.
    df : pandas.DataFrame, optional
        Frame to inspect. Defaults to the notebook-level ``train`` frame so
        existing ``column_unique(train.columns)`` calls keep working.

    Returns
    -------
    None
        The summary is printed, one line per column.
    """
    if df is None:
        df = train
    for column_name in col_list:
        column = df[column_name]
        if column.nunique() < 35 and column.dtypes == 'int64':
            # Low-cardinality integer columns are treated as discrete features.
            # len(unique()) counts NaN as a category, unlike nunique().
            unique_category = len(column.unique())
            print(f'Feature {column_name} with dtype discrete has {unique_category} unique categories')
        elif column.dtypes == 'object':
            unique_category = len(column.unique())
            print(f'Feature {column_name} with dtype object has {unique_category} unique categories')
        else:
            dtype = column.dtypes
            print(f'Feature {column_name} is of dtype {dtype}')

In [None]:
column_unique(train.columns)

In [None]:
# for column_name in train.columns:
#     if train[column_name].nunique() < 35 and train[column_name].dtypes == 'int64':
#         unique_category = len(train[column_name].unique())
#         print("Feature '{column_name}' has '{unique_category}' unique categories".format(column_name = column_name,
#                                                                                          unique_category=unique_category))
# for column_name in train.columns:
#     if train[column_name].dtypes == 'object':
#         unique_category = len(train[column_name].unique())
#         print("Feature '{column_name}' has '{unique_category}' unique categories".format(column_name = column_name,
#                                                                                          unique_category=unique_category))
# for column_name in test.columns:
#     if test[column_name].dtypes == 'object':
#         unique_category = len(test[column_name].unique())
#         print("Features in test set '{column_name}' has '{unique_category}' unique categories".format(column_name = column_name, unique_category=unique_category))

In [None]:
# Making Lists of different dataframe types, categorising by categorical, discrete and numerical 
# functions takes dataframe as an input and returns three lists of each type
# access using indexes
# made it just for fun

# def feature_type_identifier(df):
#     cat_cols = train.select_dtypes('object').columns
#     dis_cols = [feature for feature in train.columns if train[feature].nunique() < 25 and train[feature].dtypes == 'int64' ]
#     num_cols = [feature for feature in train.columns if train[feature].nunique() > 25]
#     return cat_cols, dis_cols, num_cols

## 3. Create dtype lists

In [None]:
# Split the training columns into three working lists:
#   cat_cols - object (string) columns
#   dis_cols - int64 columns with fewer than 25 distinct values
#   num_cols - columns with more than 25 distinct values
unique_counts = train.nunique()
cat_cols = train.select_dtypes('object').columns.tolist()
dis_cols = [
    feature
    for feature in train.columns
    if unique_counts[feature] < 25 and train[feature].dtypes == 'int64'
]
num_cols = [feature for feature in train.columns if unique_counts[feature] > 25]

In [None]:
train[cat_cols].isnull().sum()

## 4. Missing Data Assessment

In [None]:
def missing_data(df):
    """Summarise missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, indexed by
        column name, with ``Total`` (count of nulls) and ``Percent`` (share of
        rows that are null, rounded to two decimals). Rows are ordered from
        most to least missing.
    """
    total = df.isnull().sum()
    # Two decimals keep sparse columns (e.g. 1 null in 1000 rows) from being
    # reported as 0 %, which whole-number rounding would do.
    percent = (total / df.shape[0] * 100).round(2)
    missing_info = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    missing_info = missing_info[missing_info['Total'] > 0]
    # Sort on the exact count so rounding ties in Percent cannot scramble the order.
    return missing_info.sort_values(by='Total', ascending=False)

In [None]:
missing_data(train)

In [None]:
msno.matrix(train)

In [None]:
msno.bar(train)

In [None]:
msno.heatmap(train)

## 5. Univariate Analysis

Focusing on SalePrice: we need to check whether it follows a normal distribution, because a roughly normal target is easier to fit with a multiple linear regression model.

In [None]:
def plotting_charts_hist_qq_boxp(df, feature):
    """Draw a histogram, a QQ plot and a box plot of one column in a single figure.

    The figure uses a 3x3 grid: the histogram sits in the first row and the QQ
    plot in the second row (both spanning the first two columns), while the box
    plot takes the whole third column.

    Parameters
    ----------
    df : DataFrame-like
        Object indexable by ``feature``.
    feature : str
        Name of the column to plot.

    Side effects: switches the global matplotlib style to 'fivethirtyeight'.
    """
    style.use('fivethirtyeight')
    fig = plt.figure(constrained_layout=True, figsize= (12,8))
    grid = gridspec.GridSpec(ncols=3, nrows=3, figure=fig)
    ax1 = fig.add_subplot(grid[0,:2])
    # Histogram
    ax1.set_title('Histogram')
    # NOTE(review): distplot is deprecated in recent seaborn releases — confirm the target version.
    sns.distplot(df[feature], norm_hist=True, ax=ax1)
    # QQ Plot
    ax2 = fig.add_subplot(grid[1, :2])
    ax2.set_title('QQ_PLOT')
    stats.probplot(df[feature], plot= ax2)
    ## Boxplot
    ax3 = fig.add_subplot(grid[:,2])
    ax3.set_title('Box Plot')
    sns.boxplot(df[feature], orient='v', ax =ax3)
# Inspect the distribution of the raw target.
plotting_charts_hist_qq_boxp(train, 'SalePrice')

### Observations
1. Target variable is not normally distributed
2. Target Variable is highly right-skewed.
3. There are outliers too.

Note to self: SalePrice needs a transformation.

In [None]:
# Skewness
print('Skewness: {}'.format(train['SalePrice'].skew()))
print('Kurtosis: {}'.format(train['SalePrice'].kurt()))

## 6. Bivariate Analysis

In [None]:
# Rank the numeric features by their Pearson correlation with SalePrice.
# Numeric columns are selected explicitly because newer pandas versions raise
# on object columns instead of silently dropping them.
train.select_dtypes(include='number').corr()['SalePrice'].sort_values(ascending = False)

### 6A. Numerical Features Bivariate Analysis

In [None]:
# Visualizing Outliers
# Scatter every high-cardinality feature against SalePrice, three plots per row.
# plot_list is a real list (not a one-shot generator) so it can be inspected or
# reused after the loop.
plot_list = [x for x in num_cols if x not in ['Id' , 'SalePrice']]
n_plot_cols = 3
# Only as many rows as needed; at least one so the figure height stays positive.
n_plot_rows = max(1, math.ceil(len(plot_list) / n_plot_cols))
fig = plt.figure(figsize=(18, 5 * n_plot_rows))
for i, feature in enumerate(plot_list, 1):
    ax = fig.add_subplot(n_plot_rows, n_plot_cols, i)
    sns.scatterplot(x = 'SalePrice', y = feature, data=train, ax=ax)
    ax.set_ylabel('{}'.format(feature), size=15)
    ax.set_xlabel('SalePrice', size=15)
fig.tight_layout()
plt.show()

In [None]:
# Correlation between SalePrice and numerical features
# Restrict to numeric columns first: newer pandas versions raise when corr()
# meets object columns.
train.select_dtypes(include='number').corr()['SalePrice'][num_cols].sort_values(ascending = False)

In [None]:
# Scatter Plot
def scatter_plotter(indep, dep):
    """Draw ``dep`` (y-axis) against ``indep`` (x-axis) on a fresh 8x6 figure."""
    _, axis = plt.subplots(figsize=(8, 6))
    sns.scatterplot(x=indep, y=dep, ax=axis)

In [None]:
scatter_plotter(train.GrLivArea, train.SalePrice)
# Pearson correlation of just this pair; avoids building the whole correlation
# matrix (which also fails on object columns in newer pandas versions).
train['SalePrice'].corr(train['GrLivArea'])

Two distant values for GrLivArea showing outlier nature, should be deleted!

In [None]:
scatter_plotter(train.GarageArea, train.SalePrice)
# Pearson correlation of just this pair; avoids building the whole correlation
# matrix (which also fails on object columns in newer pandas versions).
train['SalePrice'].corr(train['GarageArea'])

In [None]:
scatter_plotter(train.TotalBsmtSF, train.SalePrice)
# Pearson correlation of just this pair; avoids building the whole correlation
# matrix (which also fails on object columns in newer pandas versions).
train['SalePrice'].corr(train['TotalBsmtSF'])

### Observations from above exercise:
1. Our target variable shows an unequal level of variance across most predictor variables (for GrLivArea, TotalBsmtSF, 1stFlrSF, MasVnrArea). This phenomenon is called heteroscedasticity and is a red flag for multiple linear regression. The variable will be made homoscedastic later, in the dataframe manipulation part.
2. Outliers are present in GrLivArea, TotalBsmtSF, 1stFlrSF and MasVnrArea and should be handled.
3. SalePrice vs. GrLivArea seems to follow a trend, which can be summarised as: "As the prices increased, so did the area."
4. Outliers in GrLivArea will be removed later, in the dataframe manipulation part.

Assumptions of Linear Regression
1. Linearity 
2. Homoscedasticity
3. Independence of errors
4. Multivariate Normality
5. No or little multicollinearity

## Check for Regression Assumptions
By fitting a Regression Line

In [None]:
# Scatter plus fitted regression line for two predictors, side by side.
# x/y are passed by keyword: newer seaborn releases no longer accept them
# positionally.
fig, (ax1,ax2) = plt.subplots(figsize = (8,6), ncols=2, sharey=False)
sns.scatterplot(x=train.GrLivArea, y=train.SalePrice, ax=ax1)
sns.regplot(x=train.GrLivArea, y=train.SalePrice, ax=ax1)
sns.scatterplot(x=train.MasVnrArea, y=train.SalePrice, ax=ax2)
sns.regplot(x=train.MasVnrArea, y=train.SalePrice, ax=ax2)

In [None]:
# Residuals of a linear fit of SalePrice on MasVnrArea (keyword arguments keep
# this working on newer seaborn releases).
sns.residplot(x=train.MasVnrArea, y=train.SalePrice)

### 6B. Discrete + Categorical Features Bivariate Analysis

In [None]:
# Correlation between SalePrice and the discrete (low-cardinality integer) features.
# Numeric columns are selected explicitly because newer pandas versions raise
# on object columns.
train.select_dtypes(include='number').corr()['SalePrice'][dis_cols].sort_values(ascending=False)

In [None]:

# for feature in dis_cols:
#     data=train.copy()
#     data.groupby(feature)['SalePrice'].median().plot.bar()
#     plt.xlabel(feature)
#     plt.ylabel('SalePrice')
#     plt.title(feature)
#     plt.show()

In [None]:
# CountPlots
# One count plot per discrete/categorical feature, three plots per row.

count_features = dis_cols + cat_cols
n_plot_cols = 3
# Only as many rows as needed; at least one so the figure height stays positive.
n_plot_rows = max(1, math.ceil(len(count_features) / n_plot_cols))
fig = plt.figure(figsize=(18, 5 * n_plot_rows))

for i, feature in enumerate(count_features, 1):
    ax = fig.add_subplot(n_plot_rows, n_plot_cols, i)
    # Keyword arguments: newer seaborn treats the first positional argument as `data`.
    sns.countplot(x=feature, data=train, ax=ax)
    ax.set_xlabel(f'{feature}', size=15)
    ax.set_ylabel('Count', size=15)
fig.tight_layout()
plt.show()

In [None]:
# Categorised Mean/Median
# Median SalePrice per level of every discrete/categorical feature, three plots per row.

median_features = dis_cols + cat_cols
n_plot_cols = 3
# Only as many rows as needed; at least one so the figure height stays positive.
n_plot_rows = max(1, math.ceil(len(median_features) / n_plot_cols))
fig = plt.figure(figsize=(18, 5 * n_plot_rows))

for i, feature in enumerate(median_features, 1):
    ax = fig.add_subplot(n_plot_rows, n_plot_cols, i)
    train.groupby(feature)['SalePrice'].median().plot.bar(ax=ax)
    ax.set_xlabel(f'{feature}', size=15)
    # The bars show a median price, not a count.
    ax.set_ylabel('Median SalePrice', size=15)
fig.tight_layout()
plt.show()

In [None]:
# Boxplots
# SalePrice distribution per level of every discrete/categorical feature, three plots per row.

box_features = dis_cols + cat_cols
n_plot_cols = 3
# Only as many rows as needed; at least one so the figure height stays positive.
n_plot_rows = max(1, math.ceil(len(box_features) / n_plot_cols))
fig = plt.figure(figsize=(18, 5 * n_plot_rows))

for i, feature in enumerate(box_features, 1):
    ax = fig.add_subplot(n_plot_rows, n_plot_cols, i)
    sns.boxplot(y = 'SalePrice', x = feature, data= train, ax=ax)
    ax.set_xlabel(f'{feature}', size=15)
    # The y-axis carries the sale price, not a count.
    ax.set_ylabel('SalePrice', size=15)
fig.tight_layout()
plt.show()

In [None]:
object_df = pd.concat([train[cat_cols], train['SalePrice']], axis=1)

In [None]:
# Page through the columns of object_df three at a time: each figure is a 2x3
# grid with the count plot of a feature in the top row and the matching
# SalePrice box plot directly below it.
# ix is the 1-based slot (1..3) of the current column inside the current figure.
ix=1
fig = plt.figure(figsize = (8,6))
for c in list(object_df.columns):
    # ix is reset to 1 as soon as it reaches 4 (bottom of the loop), so this guard always holds.
    if ix <= 3:
        # SalePrice itself is not plotted, but it still consumes a slot below.
        if c != 'SalePrice':
            ax1 = fig.add_subplot(2,3,ix)
            sns.countplot(data = object_df, x=c, ax = ax1)
            ax2 = fig.add_subplot(2,3,ix+3)
            sns.boxplot(data=object_df, x=c, y='SalePrice', ax=ax2)
#             sns.violinplot(data=object_df, x=c, y='SalePrice', ax=ax2)
#             sns.swarmplot(data = object_df, x=c, y ='SalePrice', color = 'k', alpha = 0.4, ax=ax2)
           
    ix = ix +1
    # Three slots used: start a new figure for the next batch of columns.
    # NOTE(review): a new figure is opened even when no columns remain, and figures are never closed.
    if ix == 4:
        fig= plt.figure(figsize = (8,6))
        ix =1

In [None]:
# discrete_feature=[feature for feature in numeric_features if len(train[feature].unique())<25 and feature not in year_feature+['Id']]
# print("Discrete Variables Count: {}".format(len(discrete_feature)))

## 7. Multivariate Analysis

In [None]:
# HeatMap
style.use('ggplot')
sns.set_style('whitegrid')
plt.subplots(figsize = (30,20))
## Plotting heatmap.

# Compute the correlation matrix once, on numeric columns only (newer pandas
# versions raise when corr() meets object columns).
corr_matrix = train.select_dtypes(include='number').corr()

# Generate a mask for the upper triangle (taken from seaborn example gallery).
# The builtin `bool` replaces the `np.bool` alias, which newer NumPy no longer provides.
mask = np.zeros(corr_matrix.shape, dtype=bool)
mask[np.triu_indices_from(mask)] = True


sns.heatmap(corr_matrix,
            cmap=sns.diverging_palette(20, 220, n=200),
            mask = mask,
            annot=True,
            center = 0,
           );
## Give title.
plt.title("Heatmap of all the Features", fontsize = 30);

What heatmap gave me:
 1. 1stFlrSF and TotalBsmtSF are highly correlated; both features basically depict the same attribute. Take only one forward.
 2. GarageArea and GarageCars are highly correlated; both describe the same aspect of the house. Take only one forward.
 3. TotRmsAbvGrd and GrLivArea are highly correlated.
 

# B. Feature Engineering

In [None]:
train_v1 = train.copy()

In [None]:
# train.drop(train[(train['OverallQual']<5) & (train['SalePrice']>200000)].index, inplace=True)
# train.drop(train[(train['GrLivArea']>4500) & (train['SalePrice']<300000)].index, inplace=True)
# train.reset_index(drop=True, inplace=True)

In [None]:
# Log-transform the target to reduce its right skew.
# The values are read from train_v1 (the untouched copy taken above) so that
# re-running this cell does not apply the log a second time.
train['SalePrice'] = np.log1p(train_v1['SalePrice'])

In [None]:
plotting_charts_hist_qq_boxp(train, 'SalePrice')

There you go, SalePrice looking off the charts. (Pun Intended)

In [None]:
# Keep the target aside as y, then strip the identifier (and the target) from
# the feature frames.
y = train['SalePrice'].reset_index(drop=True)
train = train.drop(columns=['Id', 'SalePrice'])
test = test.drop(columns=['Id'])

In [None]:
## Combining train and test datasets together so that we can do all the work at once. 
all_data = pd.concat((train, test)).reset_index(drop = True)

In [None]:
all_data.shape

# Handling Missing Values
Will be using all_data from here on

In [None]:
missing_data(all_data)

In [None]:
missing_data(all_data)['Percent'].plot.bar(color="b")

In [None]:
for feature in ['MoSold', 'YrSold', 'MSSubClass']:
    all_data[feature] = all_data[feature].apply(str)

In [None]:
all_data[['MoSold', 'YrSold', 'MSSubClass']].info()

In [None]:
# The data description says to assume typical functionality unless deductions are warranted.
all_data['Functional'] = all_data['Functional'].fillna('Typ')
# Categorical columns with a single missing value are filled with their most frequent level.
cat_missing = missing_data(all_data[cat_cols])
mode_feats = cat_missing.index[cat_missing['Total'] < 2].tolist()
for feature in mode_feats:
    most_common = all_data[feature].mode()[0]
    all_data[feature] = all_data[feature].fillna(most_common)

In [None]:
missing_data(all_data[['Exterior2nd', 'Exterior1st', 'Electrical', 'KitchenQual', 'SaleType']])

In [None]:
all_data['MSZoning'] = all_data.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))

In [None]:
## For these features a blank is intentional: e.g. a missing Alley value means
## the house has no alley. Such blanks become the explicit level 'None'.
none_available = [
    "Alley", "PoolQC", "MiscFeature", "Fence", "FireplaceQu",
    "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrType',
]

all_data[none_available] = all_data[none_available].fillna('None')

In [None]:
# Numeric features whose missing values are replaced with 0.
none_available2 = [
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath',
    'GarageYrBlt', 'GarageArea', 'GarageCars',
    'MasVnrArea',
]

all_data[none_available2] = all_data[none_available2].fillna(0)

In [None]:
## Replace all missing values in LotFrontage with the mean LotFrontage of the house's neighborhood.
## NOTE(review): this comment previously said "median" while the code uses mean() — confirm which was intended.
all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform( lambda x: x.fillna(x.mean()))

In [None]:
all_data[all_data['Utilities'].isnull()]

In [None]:
all_data.Utilities.value_counts()

In [None]:
 missing_data(all_data)

In [None]:
all_data['Utilities'] = all_data['Utilities'].fillna(all_data['Utilities'].mode()[0])

In [None]:
missing_data(all_data)

## Fixing Skewness

In [None]:
plot_list

In [None]:
for x in all_data.columns:
    if all_data[x].dtype in ('int64', 'float64') and x not in ('Id', 'SalePrice'):
        print(all_data[x].dtype)
        print(x)

In [None]:
plot_list = [x for x in all_data.columns if all_data[x].dtype in ('int64', 'float64')]

In [None]:
# Create box plots for all numeric features
sns.set_style("white")
f, ax = plt.subplots(figsize=(8, 7))
ax.set_xscale("log")
ax = sns.boxplot(data=all_data[plot_list] , orient="h", palette="Set1")
ax.xaxis.grid(False)
ax.set(ylabel="Feature names")
ax.set(xlabel="Numeric values")
ax.set(title="Numeric Distribution of Features")
sns.despine(trim=True, left=True)

In [None]:
len(all_data.dtypes[all_data.dtypes != 'object'].index)

In [None]:
# Measure the skewness of every non-object column and keep those whose
# absolute skew exceeds 0.5, ordered from most positive to most negative.
is_non_object = (all_data.dtypes != 'object').to_numpy()
skew_cols = all_data.columns[is_non_object]
skewness = all_data[skew_cols].apply(skew).sort_values(ascending=False)
skewness = skewness[skewness.abs() > 0.5]
high_skew = skewness.to_frame(name='Skew')
high_skew_cols = high_skew.index

In [None]:
skew(all_data.YearBuilt)

In [None]:
high_skew_cols

In [None]:
high_skew_cols

In [None]:
# Normalize skewed features
# For each highly skewed column, boxcox_normmax picks a Box-Cox lambda from the
# values shifted by +1, and boxcox1p applies the transform with that lambda.
# NOTE(review): all_data is overwritten in place, so re-running this cell
# transforms values that were already transformed.
for i in high_skew_cols:
    all_data[i] = boxcox1p(all_data[i], boxcox_normmax(all_data[i] + 1))

In [None]:
all_data['OverallCond'].head()

In [None]:
# Create box plots for all numeric features
sns.set_style("white")
f, ax = plt.subplots(figsize=(8, 7))
ax.set_xscale("log")
ax = sns.boxplot(data=all_data[plot_list] , orient="h", palette="Set1")
ax.xaxis.grid(False)
ax.set(ylabel="Feature names")
ax.set(xlabel="Numeric values")
ax.set(title="Numeric Distribution of Features")
sns.despine(trim=True, left=True)

# C. Feature Engineering - Creating New Features

In [None]:
# #Creating More Features

# all_data['BsmtFinType1_Unf'] = 1*(all_data['BsmtFinType1'] == 'Unf')
# all_data['HasWoodDeck'] = (all_data['WoodDeckSF'] == 0) * 1
# all_data['HasOpenPorch'] = (all_data['OpenPorchSF'] == 0) * 1
# all_data['HasEnclosedPorch'] = (all_data['EnclosedPorch'] == 0) * 1
# all_data['Has3SsnPorch'] = (all_data['3SsnPorch'] == 0) * 1
# all_data['HasScreenPorch'] = (all_data['ScreenPorch'] == 0) * 1
# all_data['YearsSinceRemodel'] = all_data['YrSold'].astype(int) - all_data['YearRemodAdd'].astype(int)
# all_data['Total_Home_Quality'] = all_data['OverallQual'] + all_data['OverallCond']
# all_data = all_data.drop(['Utilities', 'Street', 'PoolQC',], axis=1)
# all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
# all_data['YrBltAndRemod'] = all_data['YearBuilt'] + all_data['YearRemodAdd']

# all_data['Total_sqr_footage'] = (all_data['BsmtFinSF1'] + all_data['BsmtFinSF2'] +
#                                  all_data['1stFlrSF'] + all_data['2ndFlrSF'])
# all_data['Total_Bathrooms'] = (all_data['FullBath'] + (0.5 * all_data['HalfBath']) +
#                                all_data['BsmtFullBath'] + (0.5 * all_data['BsmtHalfBath']))
# all_data['Total_porch_sf'] = (all_data['OpenPorchSF'] + all_data['3SsnPorch'] +
#                               all_data['EnclosedPorch'] + all_data['ScreenPorch'] +
#                               all_data['WoodDeckSF'])
# all_data['TotalBsmtSF'] = all_data['TotalBsmtSF'].apply(lambda x: np.exp(6) if x <= 0.0 else x)
# all_data['2ndFlrSF'] = all_data['2ndFlrSF'].apply(lambda x: np.exp(6.5) if x <= 0.0 else x)
# all_data['GarageArea'] = all_data['GarageArea'].apply(lambda x: np.exp(6) if x <= 0.0 else x)
# all_data['GarageCars'] = all_data['GarageCars'].apply(lambda x: 0 if x <= 0.0 else x)
# all_data['LotFrontage'] = all_data['LotFrontage'].apply(lambda x: np.exp(4.2) if x <= 0.0 else x)
# all_data['MasVnrArea'] = all_data['MasVnrArea'].apply(lambda x: np.exp(4) if x <= 0.0 else x)
# all_data['BsmtFinSF1'] = all_data['BsmtFinSF1'].apply(lambda x: np.exp(6.5) if x <= 0.0 else x)

# all_data['haspool'] = all_data['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
# all_data['has2ndfloor'] = all_data['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
# all_data['hasgarage'] = all_data['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
# all_data['hasbsmt'] = all_data['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
# all_data['hasfireplace'] = all_data['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

In [None]:
def logs(res, ls):
    """Return a copy of ``res`` with a ``<name>_log`` column added per feature.

    Parameters
    ----------
    res : pandas.DataFrame
        Input frame; it is not modified.
    ls : iterable of str
        Names of the columns to transform.

    Returns
    -------
    pandas.DataFrame
        ``res`` plus, for every ``name`` in ``ls``, a column ``name + '_log'``
        holding ``log(1.01 + res[name])``. New columns are appended in the
        order given by ``ls``.
    """
    # Name each new column directly rather than renaming through
    # ``res.columns.values``: writing into an Index's backing array is
    # unsupported and can leave stale or duplicated labels behind.
    log_columns = {f'{col}_log': np.log(1.01 + res[col]) for col in ls}
    return res.assign(**log_columns)

log_features = ['LotFrontage','LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF',
                 'TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea',
                 'BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr',
                 'TotRmsAbvGrd','Fireplaces','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF',
                 'EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal','YearRemodAdd']

all_data = logs(all_data, log_features)

In [None]:
all_data.shape

## Creating Dummy Variables

In [None]:
all_data = pd.get_dummies(all_data).reset_index(drop=True)
all_data.shape

In [None]:
# Remove any duplicated column names
all_data = all_data.loc[:,~all_data.columns.duplicated()]

In [None]:
all_data.shape

In [None]:
train_clean = all_data.iloc[:len(y), :]
test_clean = all_data.iloc[len(y):, :]
train_clean.shape, y.shape, test_clean.shape

# D. Modeling

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_clean, y, train_size=0.75, shuffle=True, random_state=1)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

Test Codes


In [None]:
train_check = train_clean.copy()

In [None]:
train_check['mean_sale_price'] = y_train.mean()

## Linear Regression Model

In [None]:
# Fit and Predict on X_test
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [None]:
print (f' Train Score is {lr.score(X_train, y_train)}')
print (f' Test Score is {lr.score(X_test, y_test)}')
mse = mean_squared_error(y_test, y_pred)
print (f' Mean squared error is {mse}')

Likely a case of overfitting: the model performs better on the training set than on the held-out test (validation) set.

## Lasso Regression
L1 Regularization

In [None]:
# Candidate regularization strengths for the Lasso and Ridge sweeps below.
# Only non-negative values are listed: a negative alpha is not a valid penalty
# strength and is rejected by recent scikit-learn releases.
alpha_ridge = [1e-15, 1e-10, 1e-8, 1e-5, 1e-4, 1e-3, 1e-2, 0.5, 1, 1.5, 2, 3, 4, 5, 10, 20, 30, 40]

In [None]:
from sklearn.linear_model import Lasso 
# Validation error for every candidate alpha, keyed by the alpha value.
temp_rss = {}
temp_mse = {}
# NOTE(review): the candidate list is named alpha_ridge but is reused here for Lasso.
for i in alpha_ridge:
    ## Assign each model.
    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2 — confirm the target version.
    lasso_reg = Lasso(alpha= i, normalize=True)
    ## fit the model. 
    lasso_reg.fit(X_train, y_train)
    ## Predict the target value for the held-out X_test.
    y_pred = lasso_reg.predict(X_test)

    # Mean squared error and residual sum of squares on the held-out split.
    mse = mean_squared_error(y_test, y_pred)
    rss = sum((y_pred-y_test)**2)
    temp_mse[i] = mse
    temp_rss[i] = rss

In [None]:
for key, value in sorted(temp_mse.items(), key=lambda item: item[1]):
    print("%s: %s" % (key, value))

In [None]:
lasso_reg = Lasso(alpha=0.0001 , normalize=True)
## fit the model. 
lasso_reg.fit(X_train, y_train)
## Predicting the target value based on "Test_x"
y_pred = lasso_reg.predict(X_test)

In [None]:
print (f' Train Score is {lasso_reg.score(X_train, y_train)}')
print (f' Test Score is {lasso_reg.score(X_test, y_test)}')
mse = mean_squared_error(y_test, y_pred)
print (f' Mean squared error is {mse}')

## Ridge Regression
L2 Regularization

In [None]:
from sklearn.linear_model import Ridge 
# Validation error for every candidate alpha, keyed by the alpha value.
# These dicts replace the ones filled by the Lasso sweep.
temp_rss = {}
temp_mse = {}
for i in alpha_ridge:
    ## Assign each model.
    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2 — confirm the target version.
    ridge_reg = Ridge(alpha= i, normalize=True)
    ## fit the model. 
    ridge_reg.fit(X_train, y_train)
    ## Predict the target value for the held-out X_test.
    y_pred = ridge_reg.predict(X_test)

    # Mean squared error and residual sum of squares on the held-out split.
    mse = mean_squared_error(y_test, y_pred)
    rss = sum((y_pred-y_test)**2)
    temp_mse[i] = mse
    temp_rss[i] = rss

In [None]:
for key, value in sorted(temp_mse.items(), key=lambda item: item[1]):
    print("%s: %s" % (key, value))

In [None]:
ridge_reg = Ridge(alpha=0.4 , normalize=True)
## fit the model. 
ridge_reg.fit(X_train, y_train)
## Predicting the target value based on "Test_x"
y_pred = ridge_reg.predict(X_test)

In [None]:
print (f' Train Score is {ridge_reg.score(X_train, y_train)}')
print (f' Test Score is {ridge_reg.score(X_test, y_test)}')
mse = mean_squared_error(y_test, y_pred)
print (f' Mean squared error is {mse}')

Other Models

In [None]:
kf = KFold(n_splits=12, random_state=42, shuffle=True)
# Define error metrics
def rmsle(y, y_pred):
    """Return the square root of the mean squared error between ``y`` and ``y_pred``."""
    squared_error = mean_squared_error(y, y_pred)
    return np.sqrt(squared_error)

def cv_rmse(model, X=train_clean):
    """Return the per-fold RMSE of ``model`` under the notebook's ``kf`` splitter.

    ``X`` defaults to ``train_clean``; the target is the notebook-level ``y``.
    """
    # cross_val_score reports negated MSE, so flip the sign before the root.
    neg_mse_per_fold = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse_per_fold)

In [None]:
lightgbm = LGBMRegressor(objective='regression', 
                       num_leaves=6,
                       learning_rate=0.01, 
                       n_estimators=7000,
                       max_bin=200, 
                       bagging_fraction=0.8,
                       bagging_freq=4, 
                       bagging_seed=8,
                       feature_fraction=0.2,
                       feature_fraction_seed=8,
                       min_sum_hessian_in_leaf = 11,
                       verbose=-1,
                       random_state=42)

In [None]:
scores = {}

score = cv_rmse(lightgbm)
print("lightgbm: {:.4f} ({:.4f})".format(score.mean(), score.std()))
scores['lgb'] = (score.mean(), score.std())

In [None]:
lgb_model_full_data = lightgbm.fit(train_clean, y)
lgb_model_full_data.predict(test_clean)

Test Codes

In [None]:
from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
# Step1: Create data set
# X, y = make_moons(n_samples=10000, noise=.5, random_state=0)
# Step2: Split the training test set
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Step 3: Fit a Decision Tree model as comparison

In [None]:
# Load Library
clf = DecisionTreeRegressor()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
clf.score(X_test, y_test)

In [None]:
# Step 4: Fit a Random Forest regressor and report its R^2 on the held-out split.
# max_features=1.0 considers every feature at each split, which is what "auto"
# meant for regressors; the "auto" spelling is not accepted by recent scikit-learn.
clf = RandomForestRegressor(n_estimators=100, max_features=1.0, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
clf.score(X_test, y_test)

In [None]:
# Step 5: Fit an AdaBoost regressor and report its R^2 on the held-out split.
# random_state is fixed so the score is reproducible across runs.
clf = AdaBoostRegressor(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
clf.score(X_test, y_test)

In [None]:
# Step 6: Fit a Gradient Boosting regressor and report its R^2 on the held-out split.
# random_state is fixed so the score (and any prediction made from `clf`
# afterwards) is reproducible across runs.
clf = GradientBoostingRegressor(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
clf.score(X_test, y_test)

# Submission

In [None]:
submission = pd.read_csv("../input/house-prices-advanced-regression-techniques/sample_submission.csv")
submission.shape

In [None]:
submission.iloc[:,1] = np.floor(np.expm1(clf.predict(test_clean)))

In [None]:
submission.to_csv("submission_GradientBoost.csv", index=False)