In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### **Loading and Understanding the Data**

In [None]:
data = pd.read_csv('/kaggle/input/ames-housing-dataset/AmesHousing.csv')

In [None]:
data = data.sort_values("Yr Sold")

In [None]:
pd.options.display.max_columns = None

In [None]:
data.head()

#### **Getting Information from Data**

In [None]:
data.info()

In [None]:
data.shape

In [None]:
data.columns

In [None]:
data.select_dtypes(object).columns

In [None]:
data.select_dtypes([np.int64, np.float64]).columns

##### **Checking Duplicates if Any**

In [None]:
data.duplicated().sum()

In [None]:
data.isnull().sum()

In [None]:
# `sns.displot` is a figure-level function: it creates its own figure, sized
# through `height` and `aspect`. A preceding `plt.figure(...)` call would only
# leave an extra, empty figure in the cell output, so none is created here.
sns.displot(data['SalePrice'], height=7, aspect=1.7, color='brown')
plt.title('SalePrice Distribution')

plt.show()

In [None]:
data.columns

## **Handling Missing Values**

#### **1 - Dropping Features having Null values greater than 5% of dataset size**

In [None]:
# Count the missing entries per column, then flag every column whose count
# exceeds 5% of the number of rows.
null_count = data.isnull().sum()
row_limit = data.shape[0] * 0.05
exceeds_limit = null_count > row_limit
dlt = null_count[exceeds_limit].index
dlt

In [None]:
data.drop(dlt, axis=1, inplace=True)

#### **2 - Handling Numerical Features**

**2.1-Checking Numerical Features with Null Values**

In [None]:
num_null = data.select_dtypes([np.int64, np.float64]).isnull().sum()
missing_num = num_null[num_null>0].index
missing_num

**2.2-Finding out the Mode for each Feature**

In [None]:
# Most frequent value of every numeric column that still has gaps, as a
# {column: value} dict. `mode()` can return several rows when values tie, so
# only the first row is used.
# The orient has to be spelled "records": the abbreviation "record" is no
# longer accepted by recent pandas releases.
fill = data[missing_num].mode().to_dict(orient="records")[0]
fill

**2.3-Filling them with Mode**

In [None]:
data.fillna(fill, inplace = True)

#### **3 - Handling Non-Numerical (Object) Features**

In [None]:
obj = data.select_dtypes(object)
obj.head()

#### **Dropping Columns Having Unique Values Greater than 10**

**Finding the number of unique values in each column**

In [None]:
# Number of distinct values per object column (a missing value counts as one
# distinct value), largest first.
distinct_counts = obj.nunique(dropna=False)
uniq = distinct_counts.sort_values(ascending=False)
uniq

**Removing Columns with Unique Values greater than 10**

In [None]:
rmv_uniq = uniq[uniq>10].index
data.drop(rmv_uniq, axis = 1, inplace = True)

In [None]:
obj_col = data.select_dtypes(object).columns
obj_col

### **Data Cleaning**

- We are more interested in **'Years Before Sale'** and **'Years Since Remod'** of a house than in the raw **'Year Built'**, **'Yr Sold'** or **'Year Remod/Add'** values.
- Before purchasing, a buyer is more concerned about **how many years have passed since the house was remodelled** and **how old the house was when it was sold**. So we transform the year columns into these two features.

In [None]:
years_sold = data['Yr Sold'] - data['Year Built']
years_sold [years_sold<0]

'Year Built' can never be later than 'Yr Sold', so the row with index 2180 contains invalid values and has to be dropped.

In [None]:
years_rmd = data['Yr Sold'] - data['Year Remod/Add']
years_rmd[years_rmd<0]

These three rows must be dropped because 'Years Since Remod' cannot be negative: the remodel year cannot be later than the sale year.

In [None]:
# Drop every row whose build year or remodel year lies after the sale year.
# The labels are derived from the two difference series computed above rather
# than hard-coded, and intersected with the current index so that re-running
# this cell after the rows are already gone does not raise.
invalid_age = (years_sold < 0) | (years_rmd < 0)
rows_to_drop = data.index.intersection(invalid_age[invalid_age].index)
data.drop(rows_to_drop, axis = 0, inplace = True)

**Saving into New Column**

In [None]:
data["Years Before Sale"] = years_sold
data["Years Since Remod"] = years_rmd

**Removing Old Features**

In [None]:
data.drop(['Year Built','Year Remod/Add'], axis = 1, inplace = True)

### **Removing other Unnecessary Features**

**'Order' is just representing the row order, we don't need it in our dataset**

In [None]:
data.drop(["Order"], axis = 1, inplace = True)

**Some columns leak information about the sale itself: they describe when and how the house was sold, so they would not be known in advance and may also carry seasonality. Since we want a general price prediction model, we drop these features.**

In [None]:
data_leak = ['Mo Sold', 'Yr Sold', 'Sale Type','Sale Condition']
data.drop(data_leak,axis = 1, inplace = True)

In [None]:
# Restrict the frame to numeric/bool columns explicitly: `data` still holds
# object columns at this point, and recent pandas versions raise instead of
# silently skipping them when `corr()` meets non-numeric data.
c = data.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(15,10))
sns.heatmap(c)

### **Converting Object Feature into Numerical Form**

In [None]:
obj_col = data.select_dtypes(object).columns
obj_col

In [None]:
data[obj_col] = data[obj_col].astype('category')

In [None]:
# Replace each categorical column by its integer category codes.
# Note: pandas encodes missing values as -1 in `.cat.codes`.
for i in obj_col:
    data[i] = data[i].cat.codes

#### **Plotting Correlation wrt to SalePrice (Target Label)**

In [None]:
plt.figure(figsize=(4,30))

# Compute the correlation matrix once and keep only its 'SalePrice' column as a
# one-column frame, so the heatmap shows one annotated row per feature.
cor_df = data.corr()[['SalePrice']]

sns.heatmap(cor_df, annot=True, cmap='viridis', annot_kws={"fontsize":17})
# Sets the seaborn font scale (called after the heatmap has been built).
sns.set(font_scale = 1.5)

plt.show()

**We keep only the features whose absolute correlation with SalePrice is greater than 0.25**

In [None]:
cor = data.corr()["SalePrice"].abs().sort_values(ascending = False)
retained = cor[cor>0.25].index
retained

In [None]:
# Take an explicit copy: later cells drop columns from `data` in place, which
# should act on an independent frame rather than on a selection of the old one.
data = data[retained].copy()

### **Checking Multi-Collinearity**

In [None]:
# Flatten the correlation matrix `c` into a Series indexed by (feature, feature)
# pairs and sort it by correlation value.
# NOTE(review): `c` was computed in the earlier heatmap cell, before the object
# columns were encoded and before `data` was reduced to `retained`, so it can
# contain pairs of columns that are no longer in `data` — confirm this is intended.
s = c.unstack()
so = s.sort_values(kind="quicksort")

**Below are the feature pairs with a correlation greater than 0.78 (and below 1, which excludes each feature paired with itself)**

In [None]:
fin = so[(so > 0.78) & (so < 1)].sort_values(ascending=False)
fin

**We have to remove any one feature from each pair.**

In [None]:
f_l = list(fin.index)
f_l

We will be deleting the following features from our dataset.

In [None]:
# Take the first feature name of every second entry of `f_l`
# (positions 1, 3, 5, ...).
lst_del = [pair[0] for pair in f_l[1::2]]
lst_del

In [None]:
data.drop(['Garage Cars', 'TotRms AbvGrd', '1st Flr SF'], axis=1, inplace=True)

### **Checking Variance**

**We'll be dropping features where there is no or very little variation, b/c these features are of no use for our model. Therefore we'll be keeping features only with variance greater than 0.01**

In [None]:
# Min-max scale a copy of every column, then rank the columns by the variance
# of the scaled values (largest first).
hetro = data.copy()
col_min = hetro.min()
col_span = hetro.max() - col_min
hetro = (hetro - col_min) / col_span
var = hetro.var().sort_values(ascending = False)
var

In [None]:
final_col = var[var>0.01].index
final_col

In [None]:
data = data[final_col]

In [None]:
data.head()

In [None]:
data.columns

### **Standardizing the Data (Normalization)**

**The target label ('SalePrice') must not be rescaled. We therefore save it in a variable first, min-max scale the whole frame, and then restore the original 'SalePrice' values.**

In [None]:
sale_price = data["SalePrice"]

In [None]:
data = (data-data.min())/(data.max()-data.min())

In [None]:
data["SalePrice"] = sale_price

In [None]:
data.head()

**Shuffling the DataSet before splitting**

In [None]:
data = data.sample(frac=1, random_state=123)
data.head()

In [None]:
data.shape

**Splitting the Data into 75% and 25%**

In [None]:
# Derive the split point from the actual number of rows instead of a hard-coded
# row count, so the 75/25 split stays correct if the earlier cleaning steps
# keep a different number of rows.
indx = int(len(data) * 0.75)

# Positional split: the first 75% of the shuffled rows train, the rest test.
train = data.iloc[:indx]
test = data.iloc[indx:]

In [None]:
features = train.columns.drop('SalePrice')
target = ['SalePrice']

#### **Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error


# Baseline: fit a linear regression on the `train` split and score it on the
# held-out `test` split using mean absolute error.
model = LinearRegression()
model.fit(train[features], train[target])

prediction = model.predict(test[features])

mae = mean_absolute_error(test[target], prediction)

print('Linear Regression')
print(f'Mean Absolute Error: {mae}')



### **K-Fold Validation**

**Since the size of our dataset is not too big, therefore we should apply k-fold validation method**

In [None]:
from sklearn.model_selection import KFold

model = LinearRegression()
maes = []

# `shuffle` is passed by keyword: scikit-learn accepts only `n_splits`
# positionally, so `KFold(5, True, ...)` fails on current releases.
kf = KFold(n_splits=5, shuffle=True, random_state=123)

# Fit on four folds, score on the remaining one, and collect the per-fold MAE.
for train_index, test_index in kf.split(data):
    train = data.iloc[train_index]
    test = data.iloc[test_index]
    model.fit(train[features], train[target])
    prediction = model.predict(test[features])
    # (y_true, y_pred) order, matching the hold-out cell; MAE is symmetric,
    # so the value is unchanged.
    mae = mean_absolute_error(test[target], prediction)
    maes.append(mae)

print(f'Mean Absolute Error: {np.mean(maes)}')

### **Model Comparison with Default Parameters**

**Now we will be comparing different regression models with their default parameters along with k-fold validation.**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold

# One estimator per model family, all with default hyperparameters
# (a fixed seed is set where the estimator is randomised).
LG = LinearRegression()
SV = SVR()
KN = KNeighborsRegressor()
DT = DecisionTreeRegressor(random_state=123)
GB = GradientBoostingRegressor(random_state=123)
RF = RandomForestRegressor(random_state=123)

models = [LG, SV, KN, DT, GB, RF,]
model_name = [ 'Linear Regression', 'Support Vector Regression', 'K Nearest Neighbor', 
              'Decision Tree', 'Gradient Boost', 'Random Forest' ]


means = []
r2_score_ = []
# `shuffle` is passed by keyword: scikit-learn accepts only `n_splits`
# positionally, so `KFold(5, True, ...)` fails on current releases.
kf = KFold(n_splits=5, shuffle=True, random_state=123)

# For every model: 5-fold cross-validation, then average MAE and R2 over the folds.
for model in models:
    maes = []
    r2s = []

    for train_index, test_index in kf.split(data):
        train = data.iloc[train_index]
        test = data.iloc[test_index]
        model.fit(train[features], train[target])
        prediction = model.predict(test[features])
        maes.append(mean_absolute_error(test[target], prediction))
        r2s.append(r2_score(test[target], prediction))

    means.append(np.mean(maes))
    r2_score_.append(np.mean(r2s))


# Summary table indexed by model name; used later for the before/after comparison.
mod_comp_def = pd.DataFrame({'Models' : model_name, 'Mean Absolute Error' : means,
                            'R2_Score' : r2_score_}).set_index('Models')
mod_comp_def

## **Hyperparameter Tuning**

### **- Using GridSearchCV**

Here we will be using GridSearchCV of SciKit Learn Library to find out the better parameter values for respective models which give us the optimum result.

Below we are defining various combinations

In [None]:
# Hyperparameter grids, one dict per estimator. Suffix legend:
#   LG = LinearRegression
#   SV = Support Vector Regression (SVR)
#   RI, EN = grids that the grid-search cell below does not use
#            (NOTE(review): presumably Ridge / ElasticNet, judging by their keys — confirm)
#   KN = K Nearest Neighbor
#   DT = DecisionTree
#   GB = GradientBoost
#   RF = RandomForest
#
# NOTE(review): several option names below ('normalize', criterion 'mse'/'mae',
# loss 'ls'/'lad') are tied to older scikit-learn releases and have since been
# renamed or removed — verify against the installed version.

parameter_space_LG = {
    'fit_intercept' : [True, False] ,
    'normalize' : [True, False] ,
    'copy_X' : [True, False] ,
    'positive' : [True, False]
}

parameter_space_SV = {
    "kernel": ["poly", "linear", "rbf", "sigmoid"],
        "degree": [3, 5],
        "coef0": [0, 3, 7],
        # The last gamma candidate is 1 / (number of feature columns in `train`).
        "gamma":[1e-3, 1e-1, 1/train[features].shape[1]],
        "C": [1, 10, 100],
}

# Not referenced by the grid-search cell below.
parameter_space_RI = {
    "alpha": [1, 10, 100, 290, 500],
    "fit_intercept": [True, False],
    "solver": ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    'normalize': [True, False],
    'copy_X' : [True, False],
    'max_iter' : [10, 100, 500, 1000]
}

# Not referenced by the grid-search cell below.
parameter_space_EN = {
    'alpha' : [300, 500,1000,1500] ,
    'l1_ratio' : [0.1, 0.5, 1] ,
    'fit_intercept' : [True, False] ,
    'normalize' : [True, False] ,
    'max_iter' : [10, 100, 500, 1000],
    'selection' : ['cyclic', 'random'],
}

parameter_space_KN = {
    'n_neighbors' : [1,5,10,20,30,40,50],
    'weights' : ['uniform', 'distance'],
    'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size' : [1,2,20,50,200],
    'p' : [1,2],
}

parameter_space_DT = {
    'criterion' : ['mse', 'friedman_mse', 'mae', 'poisson'] ,
    'splitter' : ['best', 'random'],
    'max_depth' : [5,10,20,50],
}

parameter_space_GB = {
    'loss' : ['ls', 'lad', 'huber', 'quantile'],
    'learning_rate' : [0.1,0.2, 0.5],
    'n_estimators' : [180, 200,300],
    'criterion' : ['friedman_mse', 'mse', 'mae'],
}

parameter_space_RF = {
    'n_estimators' : [100,120],
    'criterion' : ['mse', 'mae'],
    'max_depth' : [10,15,30],
}

##### **Applying these combination in our model using GridsearchCV**

In [None]:
from sklearn.model_selection import GridSearchCV


# Fresh, untuned estimators for the search.
LG = LinearRegression()
SV = SVR()
KN = KNeighborsRegressor()
DT = DecisionTreeRegressor(random_state=123)
GB = GradientBoostingRegressor(random_state=123)
RF = RandomForestRegressor(random_state=123)

models = [LG, SV, KN, DT, GB, RF,]
model_name = [ 'Linear Regression', 'Support Vector Regression', 'K Nearest Neighbor', 
              'Decision Tree', 'Gradient Boost', 'Random Forest' ]
parameter_space = [parameter_space_LG, parameter_space_SV, parameter_space_KN, 
                  parameter_space_DT, parameter_space_GB, parameter_space_RF]

# Search each estimator's grid (scored by negative MAE) and report the best
# combination found.
# NOTE(review): `train` is whatever the previously executed cell left bound
# (the cross-validation loops rebind it on every fold) — confirm that is the
# intended tuning set.
for name, estimator, grid in zip(model_name, models, parameter_space):
    clf = GridSearchCV(estimator, grid, n_jobs=4,
                       cv=None, scoring="neg_mean_absolute_error")
    clf.fit(train[features], train[target])

    print(f'{name}:', "Best parameters:", clf.best_params_, '', sep='\n')

After performing Grid Search to our parameters combinations, we can conclude that:
- Linear Regression will have highest accuracy with parameters: {'copy_X': True, 'fit_intercept': True, 'normalize': True, 'positive': False}
- Support Vector Regression will have its highest accuracy among the given combination with parameters taken as: {'C': 100, 'coef0': 7, 'degree': 5, 'gamma': 0.1, 'kernel': 'poly'}
- K Nearest Neighbor's optimum parameters are: {'algorithm': 'ball_tree', 'leaf_size': 200, 'n_neighbors': 10, 'p': 1, 'weights': 'distance'}
- Decision Tree has following optimum parameter values: {'criterion': 'mae', 'max_depth': 5, 'splitter': 'best'}
- For Gradient Boost to perform best, the parameters will be: {'criterion': 'mse', 'learning_rate': 0.1, 'loss': 'huber', 'n_estimators': 200}
- Random Forest will be having its best performance with parameters: {'criterion': 'mae', 'max_depth': 15, 'n_estimators': 100}

In [None]:
# Estimators configured with the best parameters reported by the grid search.
LG_ = LinearRegression(copy_X=True, fit_intercept=True, normalize=True, positive=False, )
SV_ = SVR(C=100, coef0=7, degree=5, gamma=0.1, kernel='poly', )
KN_ = KNeighborsRegressor(algorithm='ball_tree', leaf_size=200,
                         n_neighbors=10, p=1, weights='distance')
DT_ = DecisionTreeRegressor(criterion='mae', max_depth=5, splitter='best', random_state=123)
GB_ = GradientBoostingRegressor(criterion='mse', learning_rate=0.1, 
                                loss='huber', n_estimators=200, random_state=123)
RF_ = RandomForestRegressor(criterion='mae', max_depth=15, n_estimators=100, random_state=123)

models = [LG_, SV_, KN_, DT_, GB_, RF_,]
model_name = [ 'Linear Regression', 'Support Vector Regression', 'K Nearest Neighbor', 
              'Decision Tree', 'Gradient Boost', 'Random Forest' ]


means = []
r2_score_ = []
# `shuffle` is passed by keyword: scikit-learn accepts only `n_splits`
# positionally, so `KFold(5, True, ...)` fails on current releases.
kf = KFold(n_splits=5, shuffle=True, random_state=123)

# Same 5-fold protocol as the default-parameter comparison, so the two result
# tables are directly comparable.
for model in models:
    maes = []
    r2s = []

    for train_index, test_index in kf.split(data):
        train = data.iloc[train_index]
        test = data.iloc[test_index]
        model.fit(train[features], train[target])
        prediction = model.predict(test[features])
        maes.append(mean_absolute_error(test[target], prediction))
        r2s.append(r2_score(test[target], prediction))

    means.append(np.mean(maes))
    r2_score_.append(np.mean(r2s))


# Summary table of the tuned models, indexed by model name.
mod_comp = pd.DataFrame({'Models' : model_name, 'Mean Absolute Error' : means,
                            'R2_Score' : r2_score_}).set_index('Models')
mod_comp

##### **After hyperparameter tuning we can see that:**
- Performance of each model except Linear Regression has been improved.
- SVR performance has improved a lot.
- Among all models that we have used in this ML, Gradient Boost has performed the best with minimum MAE and highest r2_Score.

In [None]:
# MAES with default parameters
mod_comp_def

In [None]:
mod_comp = mod_comp.sort_values('Mean Absolute Error')
mod_comp_def = mod_comp_def.sort_values('Mean Absolute Error')

In [None]:
mod_comp_def

##### **Merging Both Results**

In [None]:
# Put the tuned results and the default-parameter results side by side.
# The "before" columns are aligned to the tuned table by model name (the index),
# and the model name is then turned back into a regular 'Models' column.
final = (
    mod_comp
    .assign(**{
        'Mean Absolute Error_Before': mod_comp_def['Mean Absolute Error'],
        'R2_Score_Before': mod_comp_def['R2_Score'],
    })
    .reset_index()
)
final

#### **Mean Absolute Error Comparison Before and After Hyperparameter Tuning**

In [None]:
sns.set(font_scale=1.5)

# Build the tick labels from the 'Models' column of `final` so every bar is
# labelled with the model it actually belongs to, whatever order the sort by
# MAE produced. Long names are shortened for readability.
short_names = {'Support Vector Regression': 'SVR',
               'K Nearest Neighbor': 'KNN',
               'Linear Regression': 'Linear Reg.'}
my_ticks = [short_names.get(name, name) for name in final['Models']]

mylegends = ['Mean Absolute Error_After', 'Mean Absolute Error_Before']
ax = final[['Mean Absolute Error', 'Mean Absolute Error_Before']].plot.bar(figsize=(15,9), 
                                                                           color = ['SteelBlue', 'SeaGreen'])
# Overlay the same values as lines, drawn explicitly on the bar chart's axes.
final['Mean Absolute Error'].plot(ax=ax, ls='--', lw=3, marker='o', color='SteelBlue')
final['Mean Absolute Error_Before'].plot(ax=ax, ls='-.', lw=3, marker='o', color='SeaGreen')
# Pin one tick per model before labelling so the label count always matches.
ax.set_xticks(range(len(my_ticks)))
ax.set_xticklabels(my_ticks)
ax.legend(title='MAE', labels=mylegends)

plt.xticks(rotation=-30)

plt.show()

#### **r2_Score Comparison Before and After Hyperparameter Tuning**

In [None]:
sns.set(font_scale=1.5)

# Build the tick labels from the 'Models' column of `final` so every bar is
# labelled with the model it actually belongs to, whatever the row order of
# `final` is. Long names are shortened for readability.
short_names = {'Support Vector Regression': 'SVR',
               'K Nearest Neighbor': 'KNN',
               'Linear Regression': 'Linear Reg.'}
my_ticks = [short_names.get(name, name) for name in final['Models']]

mylegends = ['R2_Score_After', 'R2_Score_Before']
ax = final[['R2_Score', 'R2_Score_Before']].plot.bar(figsize=(15,12), color = ['LightSalmon', 'Teal'])
# Overlay the same values as lines, drawn explicitly on the bar chart's axes.
final['R2_Score'].plot(ax=ax, ls='--', lw=3, marker='o', color = 'DarkSalmon')
final['R2_Score_Before'].plot(ax=ax, ls='-.', lw=3, marker='o', color='Teal')
# Pin one tick per model before labelling so the label count always matches.
ax.set_xticks(range(len(my_ticks)))
ax.set_xticklabels(my_ticks)
ax.legend(title='R2_Score', labels=mylegends)

plt.xticks(rotation=-30)

plt.show()

#### **As you can see, after hyperparameter tuning Gradient Boost performs best among the regression models, with the lowest Mean Absolute Error and the highest r2_Score**