# Importing Libraries

In [None]:
#importing the necessary libraries

import warnings
import statsmodels.api as sm
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")
from sklearn.model_selection import train_test_split
%matplotlib inline
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.linear_model import LinearRegression, RANSACRegressor, TheilSenRegressor, Lasso, Ridge, ElasticNet, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from statsmodels.distributions.empirical_distribution import ECDF
from statsmodels.graphics.tsaplots import plot_acf

In [None]:
# Load the USA housing CSV from the (relative) Kaggle input directory.
df = pd.read_csv("../input/usa-housing/USA_Housing.csv")

In [None]:
# (rows, columns) of the freshly loaded table.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Missing values per column.
df.isnull().sum()

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
def rounder(a):
    """Return a list holding every element of ``a`` rounded to two decimals."""
    rounded = []
    for value in a:
        rounded.append(round(value, 2))
    return rounded

In [None]:
# Round Price to two decimals; rounder returns a plain list that is assigned positionally.
df["Price"] = rounder(df.Price)

In [None]:
# Drop the Address column before modelling. Written as a reassignment with
# errors="ignore" so that re-running this cell is a no-op instead of a KeyError.
df = df.drop(columns = "Address", errors = "ignore")

In [None]:
# Preview the table after the Address column has been dropped.
df.head()

In [None]:
# Shuffle the rows and renumber the index from 0. The seed is fixed (same value
# the models below use) so the row order - and therefore the unshuffled K-fold
# cross-validation scores computed later - is identical on every run.
df = df.sample(frac = 1, random_state = 42).reset_index(drop = True)

In [None]:
# Pairwise scatter plots / marginal distributions for every column.
sns.pairplot(df)

In [None]:
# Column labels, indexed by position in the plotting cells below.
col = df.columns

# Cumulative Distribution Plot

In [None]:
#plotting the cumulative distribution of the data

# One empirical-CDF panel per column on a 3x2 grid. The six hand-copied stanzas
# are replaced by a single loop over the first six columns; panel order and
# titles are unchanged.
plt.figure(figsize=(14,10))
for position, column in enumerate(col[:6], start = 1):
    plt.subplot(3, 2, position)
    ecdf = ECDF(df[column])
    plt.title(column + " CDF")
    plt.plot(ecdf.x, ecdf.y)
plt.tight_layout()

# Outlier Analysis

In [None]:
# One horizontal box plot per column of the raw data on a 3x2 grid.
fig, ax = plt.subplots(3,2, figsize=(12,8))
axes_ = list(ax.ravel())  # row-major flattening of the 3x2 axes array

# Pair each column with its axis directly instead of indexing by position.
for axis, column in zip(axes_, df.columns):
    g = sns.boxplot(x = df[column], ax = axis)
    g.set_title(column)

# Layout only needs to be computed once, after every panel is drawn.
plt.tight_layout()

Here we can see some potential outliers in every column except Avg. Area Number of Bedrooms. Before deciding whether to remove these outliers or keep them in our model, we will first investigate the extreme values.

In [None]:
# Summary statistics with extra tail percentiles (0.1%, 1%, 95%, 99%) to size up the extremes.
df.describe(percentiles = [0.001,0.01, 0.1, 0.25, 0.50, 0.75, 0.95, 0.99])

In [None]:
def check_outliers(data, col):
    """Report and plot the IQR-rule outliers of one column.

    A value is flagged when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile of ``data[col]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the column to inspect.
    col : str
        Name of the column; also used in the plot titles.

    Returns
    -------
    None
        Prints the number of flagged values and draws a two-panel figure:
        the series with the flagged points marked, and a box plot of the
        series with the flagged points removed.
    """
    series = data[col]

    #calculating quantiles
    q25 = series.quantile(q = 0.25)
    q75 = series.quantile(q = 0.75)

    #calculating inter quantile range
    iqr = q75-q25

    #calculating upper_bound and lower_bound
    lower_bound = q25 - (1.5*iqr)
    upper_bound = q75 + (1.5*iqr)

    #boolean mask of the outliers, reused for both the report and the filter
    is_outlier = (series < lower_bound) | (series > upper_bound)
    outliers = series[is_outlier]

    print("**** Printing Outliers Result ****")

    print("\nTotal Outliers Present in the Data: %s"%(len(outliers)))

    #plotting the line plot result
    plt.figure(figsize=(10,10))
    plt.subplot(211)
    plt.plot(series)
    plt.title(col + " with Outliers")
    plt.scatter(x = outliers.index, y = outliers.values, marker = "X", color = 'r', s = 100)

    #plotting the box plot result
    box_ax = plt.subplot(212)
    plt.title(col + " After Removing Extreme Values")
    filter_data = series[~is_outlier]
    #pass the data as x= (like the other box plots in this notebook); a bare
    #positional argument is interpreted differently across seaborn versions
    sns.boxplot(x = filter_data, ax = box_ax)

In [None]:
# Inspect the sixth column (position 5) of the table.
check_outliers(df, col[5])

After removing the outliers, we can see that the data becomes more stable. Instead of removing or imputing these values, we will apply a technique called winsorization, in which the lower and upper extreme values are capped at chosen quantiles.

In [None]:
from scipy.stats.mstats import winsorize

In [None]:
# Work on a copy so the winsorized values do not overwrite the original table.
df_copy = df.copy()

### Winsorization

In [None]:
def apply_winsorize(data, col, limits = (0.005, 0.005)):
    """Cap the extreme values of ``data[col]`` in place via winsorization.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to modify; the column ``col`` is overwritten.
    col : str
        Name of the column to winsorize.
    limits : pair of float, optional
        Fractions to cap at the lower and upper end, passed straight to
        ``scipy.stats.mstats.winsorize``. Defaults to 0.5% on each side.

    Returns
    -------
    None
    """
    # Winsorize a private copy of the values and assign the result back
    # explicitly. Relying on inplace=True to write through ``data[col]`` only
    # works when that Series happens to be a view onto the frame's memory.
    capped = winsorize(data[col].to_numpy(copy = True), limits = limits)
    data[col] = np.ma.getdata(capped)

In [None]:
#performing winsorization using scipy to remove the effect of extreme values

# Winsorize every column of the working copy, one column name at a time.
for column_name in df.columns:
    apply_winsorize(df_copy, column_name)

In [None]:
# Same 3x2 box-plot grid as before, now drawn from the winsorized copy.
fig, ax = plt.subplots(3,2, figsize=(12,8))
axes_ = list(ax.ravel())  # row-major flattening of the 3x2 axes array

# Pair each column with its axis directly instead of indexing by position.
for axis, column in zip(axes_, df.columns):
    g = sns.boxplot(x = df_copy[column], ax = axis)
    g.set_title(column)

# Layout only needs to be computed once, after every panel is drawn.
plt.tight_layout()

In [None]:
# Features: every column except the last one.
X = df_copy.iloc[:, :-1]
# Target: the last column (presumably Price - confirm against the CSV column order).
y = df_copy.iloc[:, -1]

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# Preview the target vector.
y.head()

In [None]:
# Distribution of the target.
# NOTE(review): distplot is deprecated in recent seaborn releases - confirm the target seaborn version.
sns.distplot(y)

In [None]:
# Annotated correlation matrix of the winsorized data (features and target together).
plt.figure(figsize=(10,6))
sns.heatmap(df_copy.corr(), annot = True, fmt = '.2f', cmap = 'viridis')

In [None]:
# Hold out 30% of the rows for testing; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [None]:
# Shapes of the four pieces of the split, in the order train/test features, train/test target.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

# Sklearn Pipeline

In [None]:
# Testing Different Regression Algorithms to choose the best one

def _scaled(step_name, estimator):
    """Return a two-step pipeline: standardise the features, then fit ``estimator``."""
    return Pipeline([("scaler", StandardScaler()), (step_name, estimator)])

# Linear family
lr_pipeline = _scaled("linear_regression", LinearRegression())
rb_pipeline = _scaled("robust_regression", RANSACRegressor(random_state = 42))
thiel_pipeline = _scaled("thiel_regressor", TheilSenRegressor(random_state = 42))
ridge_pipeline = _scaled("ridge_regressor", Ridge(random_state = 42))
lasso_pipeline = _scaled("lasso_regressor", Lasso(random_state = 42))
elastic_pipeline = _scaled("elastic_net", ElasticNet(random_state = 42))

# Tree ensembles and boosting
random_forest_pipeline = _scaled("randomforest_regression", RandomForestRegressor(random_state = 42))
xgboost_pipeline = _scaled("xgboost_regression", XGBRegressor())
adaboost_pipeline = _scaled("adaboost_regression", AdaBoostRegressor(random_state = 42))
gradient_pipeline = _scaled("gradientboost_regression", GradientBoostingRegressor(random_state = 42))
lightgbm_pipeline = _scaled("lightgbm_regression", LGBMRegressor(random_state = 42))
catboost_pipeline = _scaled("catboost_regression", CatBoostRegressor(random_state = 42, silent = True))

# Single tree, neighbours, stochastic gradient descent
decisiontree_pipeline = _scaled("decisiontree_regression", DecisionTreeRegressor(random_state = 42))
knn_pipeline = _scaled("knn_regression", KNeighborsRegressor())
sgc_pipeline = _scaled("sgd_regression", SGDRegressor(random_state = 42))

In [None]:
# Candidate pipelines; the position in this list is the key used to look up
# the display name in pipe_dict.
pipelines = [
    lr_pipeline,
    rb_pipeline,
    thiel_pipeline,
    ridge_pipeline,
    lasso_pipeline,
    elastic_pipeline,
    random_forest_pipeline,
    xgboost_pipeline,
    adaboost_pipeline,
    gradient_pipeline,
    lightgbm_pipeline,
    catboost_pipeline,
    decisiontree_pipeline,
    knn_pipeline,
    sgc_pipeline,
]

In [None]:
# Display name for each pipeline, keyed by its position in the pipelines list.
pipe_dict = dict(enumerate([
    "Linear Regression",
    "Robust",
    "Theil Sen",
    "Ridge",
    "Lasso",
    "ElasticNet",
    "RandomForest",
    "XGBoost",
    "Adaboost",
    "GradientBoost",
    "LightGBM",
    "CatBoost",
    "Decision Tree",
    "KNN",
    "SGD",
]))

In [None]:
# Empty two-column table; presumably meant to hold one row per model with its CV score.
scores_df = pd.DataFrame(columns = ["Model", "CVScores"])

In [None]:
# 10-fold cross-validation of every candidate pipeline on the full data set.
# Each mean score is printed as before and also recorded, so the comparison
# ends up in scores_df instead of living only in the printed output.
cv_records = []
for i, pipe in enumerate(pipelines):
    score = cross_val_score(pipe, X, y, cv = 10)
    print(pipe_dict[i], ": ", score.mean())
    cv_records.append({"Model": pipe_dict[i], "CVScores": score.mean()})

# Best model first.
scores_df = (
    pd.DataFrame(cv_records, columns = ["Model", "CVScores"])
    .sort_values("CVScores", ascending = False)
    .reset_index(drop = True)
)
scores_df

Based on the above cross validation, we can see that **Linear Regression**, **Theil Sen Regressor**, **Ridge**, **Lasso** and **Stochastic Gradient Descent** work best among all the models. For our model, we will choose the **Linear Regression**, **Ridge** & **Lasso** regression models and perform hyperparameter tuning to find the best of the three.

# Cross Validation - Randomized Search CV

In [None]:
# Template pipeline for the search; the "classifier" step is a placeholder that
# the parameter grid swaps for each candidate regressor.
pipeline_new = Pipeline([("scaler", StandardScaler()), ("classifier", LinearRegression())])

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# Search space for the randomized search: one dict per candidate estimator,
# each swapping the pipeline's "classifier" step and listing that estimator's
# hyperparameter values.
#
# Two fixes relative to the earlier version of this grid:
#   * the Ridge solver "sparse_cg," carried a stray comma inside the string and
#     was therefore not a valid solver name;
#   * the `normalize` entries are gone - that option no longer exists on
#     scikit-learn's linear models (removed in 1.2) and the pipeline already
#     standardises the features with StandardScaler.
grid_params = [
    {"classifier": [LinearRegression()]},
    {"classifier": [Ridge()],
     "classifier__alpha": np.arange(0.5, 5, 0.25),
     "classifier__fit_intercept": [True, False],
     "classifier__max_iter": [50, 100, 500, 1000, 2500],
     "classifier__tol": [0.001, 0.01, 0.05, 0.1, 1, 5, 10],
     "classifier__solver": ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]
    },
    {"classifier": [Lasso()],
    "classifier__alpha": np.arange(0.5, 5, 0.25),
    "classifier__fit_intercept": [True, False],
    "classifier__max_iter": [50, 100, 500, 1000, 2500],
    "classifier__tol": [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 1, 5, 10],
    "classifier__warm_start": [True, False],
    "classifier__positive": [True, False],
    "classifier__selection": ["cyclic", "random"]}
]

In [None]:
# Randomized search over grid_params: 8-fold CV, scored by negative mean absolute
# error, using all cores (n_jobs = -1), with a fixed seed for the sampling.
random_search = RandomizedSearchCV(estimator = pipeline_new, param_distributions = grid_params, scoring = 'neg_mean_absolute_error', n_jobs= -1, cv = 8, verbose = 10, random_state = 42)

In [None]:
# Run the search on the training split only; the test split stays untouched.
best_model = random_search.fit(X_train, y_train)

In [None]:
# Hyperparameter combination selected by the search.
best_model.best_params_

In [None]:
# Final model: standardised features followed by a Lasso with hand-entered
# hyperparameters. `normalize` is not passed: the option was removed from
# scikit-learn's Lasso (1.2) and StandardScaler already handles feature scaling.
pipeline_lasso = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso_reg', Lasso(alpha = 1.25, fit_intercept = True, max_iter = 50, tol = 0.0001,
                        warm_start = True, selection = 'cyclic', positive = True)),
])

In [None]:
# Fit the final pipeline on the training split.
model = pipeline_lasso.fit(X_train, y_train)

In [None]:
# Predictions for the held-out test rows.
predict = model.predict(X_test)

In [None]:
# Test-set residuals (actual minus predicted). plot_diag recomputes its own
# residuals, so this variable is not read by the cells shown below.
resid = (y_test - predict)

# Prediction Diagnosis

In [None]:
def print_evaluate(true, prediction):
    """Print MAE, MSE, RMSE and R-squared for a set of predictions.

    Parameters
    ----------
    true : array-like
        Observed target values.
    prediction : array-like
        Predicted target values, in the same order as ``true``.

    Returns
    -------
    None
        The four metrics are printed, one per line.
    """
    mae = mean_absolute_error(true, prediction)
    mse = mean_squared_error(true, prediction)
    # RMSE is the square root of the MSE computed above; no need to recompute it.
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, prediction)

    print("MAE: {}".format(mae))
    print("MSE: {}".format(mse))
    print("RMSE: {}".format(rmse))
    print("R2 Square: {}".format(r2_square))

In [None]:
def plot_diag(true, prediction):
    """Draw a 2x2 panel of residual diagnostics for a regression fit.

    Panels: residual distribution, residuals vs. predictions (with a dashed
    zero line), Q-Q plot of the residuals, and residual autocorrelation.

    Parameters
    ----------
    true : array-like
        Observed target values.
    prediction : array-like
        Predicted target values, in the same order as ``true``.

    Returns
    -------
    None
    """
    #residuals
    resid = (true-prediction)

    #plotting the distplot
    plt.figure(figsize=(14,8))
    plt.subplot(221)
    plt.title("Distplot")
    sns.distplot(resid)

    #plotting the residual plot
    plt.subplot(222)
    plt.title("Residual Plot")
    #x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
    sns.scatterplot(x = prediction, y = resid)
    sns.lineplot(x = [min(prediction), max(prediction)], y = [0,0], linestyle = '--', color = 'r')

    #plotting the quantile plot
    ax = plt.subplot(223)
    plt.title("Quantile Plot")
    sm.qqplot(resid, line = 'r', ax = ax)

    #plotting the autocorrelation plot
    ax2 = plt.subplot(224)
    #plot_acf sets its own title on the axes, so the title is handed to it
    #directly instead of being set beforehand and overwritten
    plot_acf(resid, ax = ax2, title = "Autocorrelation Plot")
    plt.tight_layout()

In [None]:
# Residual diagnostics for the test-set predictions.
plot_diag(y_test, predict)

In [None]:
# Actual (x) vs. predicted (y) prices on the test set. x/y are passed by
# keyword because seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x = y_test, y = predict)

In [None]:
# Error metrics for the test-set predictions.
print_evaluate(y_test, predict)

In [None]:
# Attach the predictions to the test features for a side-by-side view.
# NOTE: this adds a column to X_test itself, so re-running the earlier
# model.predict(X_test) cell after this point passes an extra column to the model.
X_test["Predicted Price"] = predict

In [None]:
# Attach the true target next to the prediction (y_test shares X_test's row index).
X_test["Original Price"] = y_test

In [None]:
# Round the predicted prices to two decimals with rounder, as was done for Price.
X_test["Predicted Price"] = rounder(X_test["Predicted Price"])

In [None]:
# Test features with the predicted and original prices side by side.
X_test

## I hope you like this notebook...!!!!!