In [None]:
# Import helpful libraries
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import ElasticNetCV

import warnings
# Silence library warnings so the rendered notebook stays readable.
warnings.filterwarnings("ignore")

# Render every float in DataFrame/Series output with three decimals.
pd.set_option('display.float_format', lambda value: '%.3f' % value)

# Seaborn look-and-feel: dark grid background, slightly enlarged fonts.
sns.set_style('darkgrid')
sns.set(font_scale=1.3)

In [None]:
# Read the training split of the house-prices competition data.
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')

# Report (rows, columns), then preview the first records.
print('Train shape: {}'.format(train.shape))

train.head(5)

In [None]:
# List every column name available in the raw training frame.
train.columns

## Analysing target

In [None]:
# Summary statistics (count, mean, std, quartiles, extremes) of the target.
train['SalePrice'].describe()

In [None]:
# Histogram of the target with a kernel-density overlay.
plt.figure(figsize=(10, 6))
sns.histplot(data=train, x='SalePrice', kde=True, alpha=0.9)

In [None]:
# Shape statistics of the target distribution.
sale_price = train['SalePrice']
print(f'Skewness: {sale_price.skew():.3f}')
print(f'Kurtosis: {sale_price.kurtosis():.3f}')

A positive skewness value indicates an asymmetric distribution whose tail is longer towards the right-hand side.

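For reference, a normal distribution has a kurtosis of 0 under the excess-kurtosis (Fisher) definition that pandas uses, so a positive value means heavier tails than a normal distribution.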

## Correlation analysis

In [None]:
# Pairwise Pearson correlations between the numeric columns.
# The frame still holds string-typed columns at this point, and
# DataFrame.corr() raises on those in pandas >= 2.0, so restrict the
# computation to numeric dtypes explicitly.
plt.figure(figsize=(20, 20))
corr = train.select_dtypes(include='number').corr()
sns.heatmap(corr, square=True, cmap='crest')

In [None]:
# Rank the correlation matrix rows by their correlation with the target
# and show the 20 strongest.
corr = corr.sort_values(by='SalePrice', ascending=False)
print(corr['SalePrice'].head(20))

## Missing data

In [None]:
# Per-column missing-value count and share, most affected columns first.
null_mask = train.isnull()
null_counts = null_mask.sum()

total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)

missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(19)

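As a rule of thumb, when more than 15% of the data is missing, we should delete the corresponding variable and pretend it never existed. Here we are stricter: every column with more than one missing value is dropped, and the row missing `Electrical` is removed.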

In [None]:
# Drop every column with more than one missing value. The column labels are
# passed via `columns=` because the positional `axis` argument of
# DataFrame.drop was removed in pandas 2.0.
sparse_columns = missing_data[missing_data['Total'] > 1].index
train = train.drop(columns=sparse_columns)

# 'Electrical' survives the column drop; remove the rows where it is missing.
train = train.dropna(subset=['Electrical'])

# Sanity check: total number of missing cells left in the frame.
train.isnull().sum().sum()

In [None]:
# Find most important features relative to target, now that the columns
# with missing values are gone. Only numeric columns are correlated:
# DataFrame.corr() raises on string-typed columns in pandas >= 2.0.
corr_ = train.select_dtypes(include='number').corr()

corr_ = corr_.sort_values(by='SalePrice', ascending=False)
print(corr_['SalePrice'].head(20))

### Distributions of the features positively correlated with 'SalePrice'

In [None]:
# One panel per feature, laid out row by row on a 5x4 grid.
# 'hist'  -> histogram with KDE overlay
# 'count' -> bar chart of value counts
panels = [
    ('hist', 'SalePrice'),     ('count', 'OverallQual'), ('hist', 'GrLivArea'),     ('count', 'GarageCars'),
    ('hist', 'GarageArea'),    ('hist', 'TotalBsmtSF'),  ('hist', '1stFlrSF'),      ('count', 'FullBath'),
    ('count', 'TotRmsAbvGrd'), ('hist', 'YearBuilt'),    ('hist', 'YearRemodAdd'),  ('count', 'Fireplaces'),
    ('hist', 'BsmtFinSF1'),    ('hist', 'WoodDeckSF'),   ('hist', '2ndFlrSF'),      ('hist', 'OpenPorchSF'),
    ('count', 'HalfBath'),     ('hist', 'LotArea'),      ('count', 'BsmtFullBath'), ('hist', 'BsmtUnfSF'),
]

fig, ax = plt.subplots(5, 4, figsize=(20, 24))

for panel_ax, (kind, column) in zip(ax.flat, panels):
    if kind == 'hist':
        sns.histplot(data=train, x=column, kde=True, alpha=0.9, ax=panel_ax)
    else:
        # Pass the variable by keyword: the first positional argument of
        # countplot is `data` in seaborn >= 0.12, not the x vector.
        sns.countplot(data=train, x=column, palette='crest', ax=panel_ax)

### 'SalePrice' relationship with positively correlated numerical variables

In [None]:
# Scatter each feature against the target, one panel per feature (row-major).
scatter_features = [
    'GrLivArea', 'GarageArea', 'TotalBsmtSF',
    '1stFlrSF', 'YearBuilt', 'YearRemodAdd',
]

fig, ax = plt.subplots(2, 3, figsize=(26, 16))

for panel_ax, feature in zip(ax.ravel(), scatter_features):
    sns.scatterplot(data=train, x=feature, y='SalePrice', ax=panel_ax)

In [None]:
# Second batch of feature-vs-target scatter plots (row-major panel order).
scatter_features = [
    'BsmtFinSF1', 'WoodDeckSF', '2ndFlrSF',
    'OpenPorchSF', 'LotArea', 'BsmtUnfSF',
]

fig, ax = plt.subplots(2, 3, figsize=(26, 16))

for panel_ax, feature in zip(ax.ravel(), scatter_features):
    sns.scatterplot(data=train, x=feature, y='SalePrice', ax=panel_ax)

### 'SalePrice' relationship with positively correlated categorical features

In [None]:
# Target distribution per level of each discrete feature (row-major panels).
discrete_features = [
    'TotRmsAbvGrd', 'Fireplaces', 'HalfBath',
    'BsmtFullBath', 'OverallQual', 'GarageCars',
]

fig, ax = plt.subplots(2, 3, figsize=(22, 12))

for panel_ax, feature in zip(ax.ravel(), discrete_features):
    sns.boxplot(data=train, x=feature, y='SalePrice', palette='crest', ax=panel_ax)

In [None]:
# Target distribution per number of full bathrooms.
plt.figure(figsize=(8, 6))
sns.boxplot(data=train, y='SalePrice', x='FullBath', palette='crest')

## Detect and remove the outliers

In [None]:
# (rows, columns) before the outlier filter, for comparison with the shape after it.
train.shape

In [None]:
# A row is kept only if it satisfies every bound below; note that some
# bounds are strict (<) and others inclusive (<=).
within_bounds = (
    (train['GrLivArea'] < 4000)
    & (train['TotalBsmtSF'] < 3500)
    & (train['1stFlrSF'] <= 3000)
    & (train['BsmtFinSF1'] < 3000)
    & (train['WoodDeckSF'] < 800)
    & (train['OpenPorchSF'] <= 400)
    & (train['LotArea'] < 150000)
    & (train['SalePrice'] <= 500000)
)
train = train[within_bounds]

train.shape

## Log transformation of our skewed data

In [None]:
# Target plus the features kept for modelling.
col = [
    'SalePrice', 'GrLivArea', 'GarageArea', 'TotalBsmtSF',
    '1stFlrSF', 'BsmtFinSF1', 'WoodDeckSF', '2ndFlrSF',
    'OpenPorchSF', 'LotArea', 'BsmtUnfSF', 'YearBuilt',
    'YearRemodAdd', 'TotRmsAbvGrd', 'Fireplaces', 'HalfBath',
    'BsmtFullBath', 'OverallQual', 'GarageCars', 'FullBath',
]

# Take an explicit copy: later cells assign transformed columns back into
# `train`, and writing into a slice of another frame triggers pandas'
# SettingWithCopy behaviour (currently hidden by the global warnings filter).
train = train[col].copy()

In [None]:
# Columns replaced by their natural log, with no shift applied.
for name in ('SalePrice', 'GrLivArea', '1stFlrSF', 'LotArea'):
    train[name] = np.log(train[name])

# Columns shifted by +1 before taking the log.
for name in ('OpenPorchSF', 'Fireplaces'):
    train[name] = np.log(train[name] + 1)

# GarageArea, TotalBsmtSF, BsmtUnfSF, BsmtFinSF1, WoodDeckSF and 2ndFlrSF
# are left on their original scale (their log transforms are disabled).

# Skewness of every column after the transformation.
train.skew()

### Convert categorical variables into dummy variables

In [None]:
# One-hot encode any categorical columns.
# NOTE(review): the columns selected for modelling all look numeric, in which
# case this call leaves the frame unchanged - confirm whether it is needed.
train = pd.get_dummies(train)

## Modeling

In [None]:
# Separate the target from the predictors.
target_name = 'SalePrice'
y = train[target_name]
X = train.drop(columns=[target_name])

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(train_X)

train_X = scaler.transform(train_X)
test_X = scaler.transform(test_X)

In [None]:
# Negated MSE: scikit-learn scorers follow a "greater is better" convention.
scorer = make_scorer(mean_squared_error, greater_is_better = False)


def _rmse_cv(model, features, target):
    """Return the per-fold RMSE of `model` under 10-fold cross-validation."""
    neg_mse = cross_val_score(model, features, target, scoring=scorer, cv=10)
    # Flip the sign back before taking the root.
    return np.sqrt(-neg_mse)


def rmse_cv_train(model):
    """Return the 10 per-fold RMSE values of `model` cross-validated on the training split."""
    return _rmse_cv(model, train_X, train_y)


def rmse_cv_test(model):
    """Return the 10 per-fold RMSE values of `model` cross-validated on the test split.

    Note: cross_val_score fits copies of `model` on folds of the data it is
    given, so this does not score an already-fitted model on the test split.
    """
    return _rmse_cv(model, test_X, test_y)

In [None]:
# Elastic net with the regularisation strength chosen by internal CV.
model = ElasticNetCV()
model.fit(train_X, train_y)

# Training figure: mean RMSE over 10-fold cross-validation on the training split.
print("RMSE on Training set :", rmse_cv_train(model).mean())

# Test figure: RMSE of the model fitted above on the held-out split.
# Cross-validating on the test split would re-fit fresh copies of the
# estimator on test data and never evaluate this fitted model.
test_rmse = np.sqrt(mean_squared_error(test_y, model.predict(test_X)))
print("RMSE on Test set :", test_rmse)

In [None]:
# Pair every predictor with its fitted elastic-net coefficient.
df = pd.DataFrame({'feature': X.columns, 'coef': model.coef_})

# Horizontal bar chart of the coefficients, one bar per feature.
plt.figure(figsize=(8, 6))
coef_ax = sns.barplot(data=df, x='coef', y='feature', palette='crest')
coef_ax.set_xlabel('Importance')