# ✨House Price: Advanced Regression Techniques✨

   * Linear Assumption:
   Linear regression assumes that the relationship between your input and output is linear. It does not support anything else. This may be obvious, but it is good to remember when you have a lot of attributes. You may need to transform data to make the relationship linear (e.g. log transform for an exponential relationship).
   
   
   * Remove Noise: 
   Linear regression assumes that your input and output variables are not noisy. Consider using data cleaning operations that let you better expose and clarify the signal in your data. This is most important for the output variable and you want to remove outliers in the output variable (y) if possible.
   
   
   * Remove Collinearity:
   Linear regression will over-fit your data when you have highly correlated input variables. Consider calculating pairwise correlations for your input data and removing the most correlated.
   
   
   * Gaussian Distributions:
   Linear regression will make more reliable predictions if your input and output variables have a Gaussian distribution. You may get some benefit using transforms (e.g. log or BoxCox) on your variables to make their distribution more Gaussian looking.
   
   
   * Rescale Inputs:
   Linear regression will often make more reliable predictions if you rescale input variables using standardization or normalization.

### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import plotly.offline as py
import plotly.express as px
import plotly.graph_objs as go
plt.rc("font", size=15)
import warnings
warnings.simplefilter(action='ignore')

### Exploratory Data Analysis

In [None]:
# Load the labelled training split (it carries the SalePrice target used below).
train = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")

In [None]:
# Load the test split; predictions for these rows are written to submission.csv at the end.
test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Stack train on top of test so cleaning and encoding are applied to both at once.
# ignore_index=True gives every row a unique label. Without it the test rows reuse
# the labels 0..n-1 of the train rows, and any later label-aligned assignment
# (e.g. writing a train-only Series back into df) silently copies train values
# into the test rows.
df = pd.concat([train, test], axis=0, ignore_index=True)

In [None]:
# Summary statistics of the combined frame.
df.describe()

In [None]:
# Column dtypes and non-null counts of the combined frame.
df.info()

In [None]:
# Split the columns into two groups: categorical (object dtype) and numerical (everything else).
categorical = []
numerical = []
for column in df.columns:
    bucket = categorical if df.dtypes[column] == 'object' else numerical
    bucket.append(column)
print("CategoricaL:\n{}".format(categorical))
print("\nNumericaL:\n{}".format(numerical))

In [None]:
# Report how many columns fell into each group.
print(f'Numerical Features:{len(numerical)}')
print(f'Categorical Features:{len(categorical)}')

In [None]:
# Numerical columns whose name marks them as a month/year field.
year_markers = ('Mo', 'Yr', 'Year')
year = [feature for feature in numerical if any(marker in feature for marker in year_markers)]
year

In [None]:
# List the distinct values of every object-dtype column.
separator = '-' * 100
for col in df.select_dtypes('O').columns:
    distinct = df[col].unique()
    print('We have {} unique values in {} column : {}'.format(len(distinct), col, distinct))
    print(separator)

In [None]:
# Print the number of distinct values per categorical column, dash-padded so the counts line up.
print('Categorical columns Unique values count\n')
for col in categorical:
    padding = '-' * (30 - len(col))
    distinct_count = len(df[col].unique())
    print(col, padding, '>', distinct_count)

### Missing Values

In [None]:
# Heatmap of the boolean missing-value mask: one cell per (row, column) of df.
plt.figure(figsize=(20,6));
sns.heatmap(df.isnull(),yticklabels=False, cbar=False, cmap='mako')

In [None]:
# Share of missing values for every categorical column that has any.
variable = [feature for feature in categorical if df[feature].isnull().sum()]
for feature in variable:
    # isnull().mean() is a fraction in [0, 1]; scale it so the number matches the '%' sign.
    missing_pct = np.round(df[feature].isnull().mean() * 100, 1)
    print("{}: {}%".format(feature, missing_pct))

In [None]:
# Share of missing values for every numerical column that has any.
variable = [feature for feature in numerical if df[feature].isnull().sum()]
for feature in variable:
    # isnull().mean() is a fraction in [0, 1]; scale it so the number matches the '%' sign.
    missing_pct = np.round(df[feature].isnull().mean() * 100, 1)
    print("{}: {}%".format(feature, missing_pct))

In [None]:
# Every fill below assigns a whole column back to df. Chained in-place calls such as
# df[col].fillna(..., inplace=True) act on a temporary selection: they warn on recent
# pandas and have no effect under Copy-on-Write.

# Categorical columns where a missing value means the house lacks the feature
# altogether (no alley, fence, pool, fireplace, veneer, basement or garage).
none_fill_cols = [
    'Alley', 'Fence', 'MiscFeature', 'PoolQC', 'FireplaceQu', 'MasVnrType',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'GarageType', 'GarageCond', 'GarageQual', 'GarageFinish',
]
df = df.fillna({col: 'None' for col in none_fill_cols})

# Numerical veneer/basement/garage measurements: missing means there is nothing to measure.
Bsmt_con = ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
            'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']
zero_fill_cols = Bsmt_con + ['GarageCars', 'GarageArea']
df = df.fillna({col: 0 for col in zero_fill_cols})

# Remaining categorical gaps: use the most frequent level of the column.
for col in ['MSZoning', 'Electrical', 'Functional', 'KitchenQual', 'SaleType', 'Utilities']:
    df[col] = df[col].fillna(df[col].mode()[0])

# Remaining numerical gaps: mean for LotFrontage, median for GarageYrBlt.
df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].mean())
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['GarageYrBlt'].median())

# Unknown exterior covering is mapped to the catch-all 'Other' level.
for col in ['Exterior1st', 'Exterior2nd']:
    df[col] = df[col].fillna('Other')

### Correlation

In [None]:
# Correlate numeric columns only: df still holds object columns here, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr = df.select_dtypes(include='number').corr()
corr.sort_values(['SalePrice'], ascending=False, inplace=True)
# Features ordered from most positively to most negatively correlated with the target.
print(corr.SalePrice)

In [None]:
# Heatmap of the features whose absolute correlation with SalePrice exceeds 0.5.
# Restrict to numeric columns first: DataFrame.corr() raises on object columns in pandas >= 2.0.
corrmat = train.select_dtypes(include='number').corr()
top_corr_features = corrmat.index[abs(corrmat["SalePrice"])>0.5]
plt.figure(figsize=(14,8))
g = sns.heatmap(train[top_corr_features].corr(),annot=True,cmap="cubehelix")

###### Categories in Number

In [None]:
# Numerical columns with fewer than 20 distinct values are treated as coded categories
# (year-like columns and Id are left out).
skipped = year + ['Id']
categorical_num = [col for col in numerical if col not in skipped and len(df[col].unique()) < 20]

###### Continuous Variable

In [None]:
# Continuous features: numerical columns that are neither year-like, the Id, nor coded categories.
# (The former `len(df[feature])` test was always true for a non-empty frame, so it is dropped.)
# NOTE(review): SalePrice is numerical too and presumably ends up in this list — confirm that is intended.
non_continuous = set(year) | {'Id'} | set(categorical_num)
contineous = [feature for feature in numerical if feature not in non_continuous]

In [None]:
# Pairwise correlations between the continuous columns, with the coefficient written in each cell.
plt.figure(figsize=(25, 15))
heatmap =sns.heatmap(df[contineous].corr(), annot = True,  cmap="crest")
heatmap.set_title('Correlation Heatmap');

### Visualization

In [None]:
# Scatter SalePrice against each year-related column on a 2x2 grid.
year_features = ['GarageYrBlt', 'YearBuilt', 'YearRemodAdd', 'YrSold']

plt.figure(figsize=(15, 8))
sns.set(font_scale=1.2)
sns.set_style('whitegrid')

for position, year_column in enumerate(year_features, start=1):
    plt.subplot(2, 2, position)
    plt.scatter(data=train, x=year_column, y='SalePrice', color="maroon")
    plt.xlabel(year_column)
    plt.ylabel('SalePrice')

sns.despine()

In [None]:
# Density of SalePrice per month sold; common_norm=False normalises each month's curve on its own.
plt.figure(figsize=(14, 8))
sns.kdeplot(data=train,x="SalePrice", hue ="MoSold", fill=True,common_norm=False, palette="husl", alpha=.5, linewidth=1)

###### Attribute

In [None]:
# Bar plot of SalePrice for each level of the selected quality/condition columns (3x4 grid).
Quality_features = [ 'RoofMatl', 'ExterQual', 'BsmtQual', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'GarageQual']

plt.figure(figsize=(30, 20))
sns.set(font_scale=1.2)
sns.set_style('darkgrid')

for position, quality_column in enumerate(Quality_features, start=1):
    plt.subplot(3, 4, position)
    sns.barplot(data=train, x=quality_column, y='SalePrice', palette="ch:.10")

sns.despine()

### Outliers

In [None]:
# Scatter every numerical column against SalePrice to spot outliers by eye.
plt.figure(figsize=(30, 70))
sns.set(font_scale=1.2)
sns.set_style('whitegrid')

# The training rows are the first len(train) rows of df; slice once, outside the loop.
train_rows = df.iloc[:len(train)]
n_cols = 4
# Keep the 10-row layout, but grow it when there are more than 40 columns so
# plt.subplot never receives an index beyond the grid.
n_rows = max(10, -(-len(numerical) // n_cols))

for position, column in enumerate(numerical, start=1):
    plt.subplot(n_rows, n_cols, position)
    plt.scatter(data=train_rows, x=column, y='SalePrice', color="blue")
    plt.xlabel(column)
    plt.ylabel('SalePrice')

sns.despine()

From the graphs above we can see that the variables LotFrontage, LotArea, MasVnrArea, BsmtFinSF1, TotalBsmtSF, 
1stFlrSF, 2ndFlrSF, GrLivArea and GarageArea have extreme outliers, i.e. extremely large areas sold for very low prices. We therefore cap these outliers at a lower threshold value.

In [None]:
# Cap extreme values at a fixed upper bound per column. Series.clip assigns a whole
# new column, which avoids chained indexing (df.col[mask] = value): that form raises
# SettingWithCopyWarning and has no effect under pandas Copy-on-Write.
outlier_caps = {
    'LotFrontage': 160,
    'LotArea': 75000,
    'MasVnrArea': 1000,
    'BsmtFinSF1': 2500,
    'TotalBsmtSF': 3000,
    '1stFlrSF': 3000,
    'GrLivArea': 3500,
    'GarageArea': 1500,
}
for column, cap in outlier_caps.items():
    # NaN entries are left untouched by clip.
    df[column] = df[column].clip(upper=cap)

#### Data Transformation

In [None]:
from scipy import stats
from scipy.stats import norm, skew

# The training rows are the first len(train) rows of df.
train_prices = df.iloc[:len(train)]['SalePrice']

# Histogram of the target with a fitted normal curve overlaid.
plt.figure(figsize=(10, 5))
sns.distplot(train_prices, fit=norm, color='maroon');
(mu, sigma) = norm.fit(train_prices)
# Raw string: \mu and \sigma are mathtext commands, not Python escape sequences.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')

In [None]:
plt.figure(figsize=(10, 5))
# Log-transform the target. Transforming the whole column keeps the operation
# row-by-row: rows without a price stay NaN. Assigning only the train slice back
# would align on index labels, so whenever test rows share labels with train rows
# they would silently receive train prices.
# NOTE: not idempotent — re-running this cell applies log1p a second time.
df['SalePrice'] = np.log1p(df['SalePrice'])
train_log_prices = df.iloc[:len(train)]['SalePrice']
sns.distplot(train_log_prices, fit=norm, color='maroon')
(mu, sigma) = norm.fit(train_log_prices)
# Raw string: \mu and \sigma are mathtext commands, not Python escape sequences.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')

### Skewness

In [None]:
# Rule of thumb: |skew| > 1 is highly skewed, 0.5-1 moderately skewed, < 0.5 roughly symmetric.
# Columns whose skewness is above 0.75 are log1p-transformed.
column_skewness = df[contineous].apply(lambda column: skew(column.dropna()))
skewed_clm = column_skewness[column_skewness > 0.75].index

df[skewed_clm] = np.log1p(df[skewed_clm])

### Creating Dummy Variables

In [None]:
# One-hot encode every categorical column; drop_first=True drops the first level of each column.
df =pd.get_dummies(df, columns=categorical, drop_first=True)

In [None]:
# Combined size feature: basement + first floor + second floor.
# NOTE(review): any of these columns that exceeded the skew threshold earlier is already
# log1p-transformed, so this may be adding logs rather than raw square feet — confirm intended.
df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']

In [None]:
# Drop the component column now that it is folded into TotalSF.
del df['TotalBsmtSF']

In [None]:
# Drop the component column now that it is folded into TotalSF.
del df['1stFlrSF']

In [None]:
# Drop the component column now that it is folded into TotalSF.
del df['2ndFlrSF']

### Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Undo the train/test concatenation: the first len(train) rows are the training rows.
# Derive the boundary from the data instead of hard-coding the row count.
n_train = len(train)
new_train = df.iloc[:n_train, :]
new_test = df.iloc[n_train:, :]

In [None]:
# Target (log1p of SalePrice) and feature matrix for the labelled rows.
# NOTE(review): only SalePrice is dropped, so the 'Id' column stays among the features — confirm intended.
y = new_train['SalePrice']
x = new_train.drop(columns=['SalePrice'])

In [None]:
# Hold out a quarter of the labelled rows for evaluation; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42) # 75% training and 25% test

### Model Fitting

In [None]:
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [None]:
def print_evaluate(true, predicted):
    """Print MAE, MSE, RMSE and R2 for a set of predictions.

    Args:
        true: Ground-truth target values.
        predicted: Model predictions, aligned with ``true``.

    Returns:
        None. The four metrics are written to stdout, one per line.
    """
    squared_error = metrics.mean_squared_error(true, predicted)
    report = (
        ('MAE:', metrics.mean_absolute_error(true, predicted)),
        ('MSE:', squared_error),
        # RMSE is the square root of the MSE computed above.
        ('RMSE:', np.sqrt(squared_error)),
        ('R2 Square', metrics.r2_score(true, predicted)),
    )
    for label, value in report:
        print(label, value)

**✔Simple linear regression:**
is a statistical method that allows us to summarize and study the relationship between two continuous 
variables, i.e. the 'predictor' and the 'response' variable.


**✔Multiple Linear Regression:**
multiple linear regression is an extension of simple linear regression. It is used when we want to 
predict the response from more than one 'predictor' variable.

**✔overfitting:**

When we have a lot of measurements, we can be confident that the least squares line accurately reflects the relationship. But if we have only a few training points, the fitted line can pass through all of them, so the sum of squared residuals is 0 or very small. If we then apply the same fitted model to the testing dataset, the sum of squared residuals will be large. 
 Hence we say that the line or model overfits the training data and has high variance. The main idea behind ridge regression is to find a new line that doesn't fit the training data quite as well, in exchange for better predictions on new data.
 

**✔Regularization:**

It is a technique used to reduce the prediction error on unseen data, at the cost of introducing some bias on the training set, and so avoid overfitting. Commonly used regularisation techniques are:
1. L1 regularisation
2. L2 regularisation


1. L1 Regularisation: is also called LASSO (Least Absolute Shrinkage and Selection Operator) Regression.
2. L2 Regularisation: is also called Ridge Regression.


**✔Ridge Regression:**

Ridge Regression is a model tuning method that is used to analyse any data that suffers from multicollinearity (multicollinearity is simply correlation between predictor variables, i.e. independent variables).
When multicollinearity occurs, the least-squares estimates are unbiased but their variances are large, which results in predicted values being far away from the actual values. Ridge Regression accepts a small amount of bias, introduced by the penalty, in exchange for lower variance.
The bias added to the model is known as the Ridge Regression Penalty. We compute it by multiplying lambda by the square of each coefficient (slope). The quantity that Ridge Regression minimizes is:
               
$$\text{sum of the squared residuals} + \lambda \cdot (\text{slope})^2$$
1. lambda ($\lambda$) - determines how severe the penalty is.
2. slope - the coefficient of the fitted line; the traditional least squares method puts no penalty on it.
       
In Ridge Regression a smaller slope means that the predictions of the ridge regression line are less sensitive to changes in the independent variable than those of the least squares line.
 When lambda is 0:

$$\text{sum of the squared residuals} + 0 \cdot (\text{slope})^2$$

which is the least squares line.
                    
                    

**✔Lasso Regression:**

Lasso Regression is almost identical to Ridge Regression, the only difference being that we take the absolute value of the slope, as opposed to squaring it, when computing the penalty.
The quantity that Lasso Regression minimizes is:

$$\text{sum of the squared residuals} + \lambda \cdot |\text{slope}|$$

As a result of taking the absolute value, Lasso Regression can shrink the slope all the way down to 0, whereas Ridge Regression can only shrink the slope asymptotically close to 0.



**✔Elastic-Net Regression:**
Elastic-Net Regression combines the penalties of lasso and ridge regression. It also groups and shrinks the parameters associated with correlated variables and either leaves them in the equation or removes them all at once.
The Elastic-Net penalty is:

$$\lambda_1 \left(|\text{var}_1| + \dots + |\text{var}_x|\right) + \lambda_2 \left(\text{var}_1^2 + \dots + \text{var}_x^2\right)$$

1. If $\lambda_1 > 0$ and $\lambda_2 = 0$ we get Lasso Regression; vice versa ($\lambda_1 = 0$ and $\lambda_2 > 0$) we get Ridge Regression.
2. If $\lambda_1 > 0$ and $\lambda_2 > 0$ we get Elastic-Net Regression.
     


**✔Ridge Regression vs. Linear Regression:**

In simple linear regression, we determine the best fitting line by minimizing the sum of the squared residuals, but in Ridge Regression we minimize the sum of the squared residuals plus the Ridge Regression Penalty.

Simple Regression doesn’t differentiate “important” from “less-important” predictors in a model, so it includes all of them. This leads to overfitting a model and failure to find unique solutions. 

Linear regression produces unbiased estimates, but their variances can be so large that the estimates may be wholly inaccurate. Ridge regression adds just enough bias to make the estimates reasonably reliable approximations to true population values.

#### Ridge Regression

In [None]:
# Ridge regression; RidgeCV selects the penalty strength from the listed candidates.
ridge = RidgeCV(alphas = [1, 0.1, 0.001, 0.0005])
ridge.fit(X_train, y_train)

# Predict once per split; targets and predictions are on the log1p(SalePrice) scale.
test_pred = ridge.predict(X_test)
train_pred = ridge.predict(X_train)

print('Test set evaluation:\n')
print_evaluate(y_test, test_pred)
print('*'*30)
print('Train set evaluation:\n')
print_evaluate(y_train, train_pred)

In [None]:
from yellowbrick.regressor import PredictionError
# Fit on the training split, then score on the held-out split, so the plot shows
# how well the model generalises rather than how well it fits data it has seen.
vis = PredictionError(ridge)
vis.fit(X_train, y_train)
vis.score(X_test, y_test)
vis.show()

In [None]:
from yellowbrick.regressor import ResidualsPlot
# ResidualsPlot draws the data given to fit() as the train series and the data given
# to score() as the test series, so score() must receive the held-out split.
vis = ResidualsPlot(ridge)
vis.fit(X_train, y_train)
vis.score(X_test, y_test)
vis.show()

#### Lasso Regression

In [None]:
# Lasso regression; LassoCV selects the penalty strength from the listed candidates.
lasso = LassoCV(alphas=[1, 0.1, 0.001, 0.0005])
lasso.fit(X_train, y_train)

# Predictions for both splits, on the log1p(SalePrice) scale.
train_pred = lasso.predict(X_train)
test_pred = lasso.predict(X_test)

print('Test set evaluation:\n')
print_evaluate(y_test, test_pred)
print('*' * 30)
print('Train set evaluation:\n')
print_evaluate(y_train, train_pred)

In [None]:
from yellowbrick.regressor import PredictionError
# Fit on the training split, then score on the held-out split, so the plot shows
# how well the model generalises rather than how well it fits data it has seen.
vis = PredictionError(lasso)
vis.fit(X_train, y_train)
vis.score(X_test, y_test)
vis.show()

In [None]:
from yellowbrick.regressor import ResidualsPlot
# ResidualsPlot draws the data given to fit() as the train series and the data given
# to score() as the test series, so score() must receive the held-out split.
vis = ResidualsPlot(lasso)
vis.fit(X_train, y_train)
vis.score(X_test, y_test)
vis.show()

#### Elastic-Net Regression

In [None]:
# Elastic-net regression; ElasticNetCV selects the penalty strength from the listed candidates.
E_model = ElasticNetCV(alphas=[1, 0.1, 0.001, 0.0005])
E_model.fit(X_train, y_train)

# Predictions for both splits, on the log1p(SalePrice) scale.
train_pred = E_model.predict(X_train)
test_pred = E_model.predict(X_test)

print('Test set evaluation:\n')
print_evaluate(y_test, test_pred)
print('*' * 30)
print('Train set evaluation:\n')
print_evaluate(y_train, train_pred)

In [None]:
from yellowbrick.regressor import PredictionError
# Fit on the training split, then score on the held-out split, so the plot shows
# how well the model generalises rather than how well it fits data it has seen.
vis = PredictionError(E_model)
vis.fit(X_train, y_train)
vis.score(X_test, y_test)
vis.show()

In [None]:
from yellowbrick.regressor import ResidualsPlot
# ResidualsPlot draws the data given to fit() as the train series and the data given
# to score() as the test series, so score() must receive the held-out split.
vis = ResidualsPlot(E_model)
vis.fit(X_train, y_train)
vis.score(X_test, y_test)
vis.show()

In [None]:
# Re-fit the lasso model and report the metrics on the original price scale:
# expm1 undoes the log1p that was applied to SalePrice.
lasso.fit(X_train, y_train)
pred = np.expm1(lasso.predict(X_test))
train_pred = np.expm1(lasso.predict(X_train))

print('Test set evaluation:\n')
print_evaluate(np.expm1(y_test), pred)
print('*' * 30)
print('Train set evaluation:\n')
print_evaluate(np.expm1(y_train), train_pred)

In [None]:
# Work on a copy so that writing the predicted prices does not modify new_test.
final_test=new_test.copy()

In [None]:
# Feature matrix for the competition test rows; Y keeps their SalePrice column as a one-column frame.
Y = new_test[['SalePrice']]
X = new_test.drop(columns=['SalePrice'])

In [None]:
# The model predicts log1p(SalePrice); expm1 maps the predictions back to prices.
predicted_prices = np.expm1(lasso.predict(X))
final_test['SalePrice'] = predicted_prices
final_test['Id'] = new_test['Id']

# Keep only the two columns written to the submission file.
logistic_submission = final_test[['Id','SalePrice']]
logistic_submission.to_csv("submission.csv", index=False)

logistic_submission.tail()

In [None]:
# Show the first 50 rows of the submission.
logistic_submission.head(50)