# Import libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neural_network import MLPRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR

import statsmodels.api as sm

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max.columns', None)

# Data loading and overview

In [None]:
# Read the housing CSV into a DataFrame (path is relative to the notebook's working directory)
df = pd.read_csv('../input/usahousing/USA_Housing.csv')
# Preview the first rows
df.head()

In [None]:
# Column names, dtypes and non-null counts
df.info()

# EDA
## Distplot
We look at the distribution of each numeric column.

In [None]:
# sns.distplot is deprecated; a density-scaled histogram with a KDE overlay is its replacement
sns.histplot(data=df, x='Avg. Area Income', kde=True, stat='density')

In [None]:
# sns.distplot is deprecated; a density-scaled histogram with a KDE overlay is its replacement
sns.histplot(data=df, x='Avg. Area House Age', kde=True, stat='density')

In [None]:
# sns.distplot is deprecated; a density-scaled histogram with a KDE overlay is its replacement
sns.histplot(data=df, x='Avg. Area Number of Rooms', kde=True, stat='density')

In [None]:
# sns.distplot is deprecated; a density-scaled histogram with a KDE overlay is its replacement
sns.histplot(data=df, x='Avg. Area Number of Bedrooms', kde=True, stat='density')

In [None]:
# sns.distplot is deprecated; a density-scaled histogram with a KDE overlay is its replacement
sns.histplot(data=df, x='Area Population', kde=True, stat='density')

In [None]:
# sns.distplot is deprecated; a density-scaled histogram with a KDE overlay is its replacement
sns.histplot(data=df, x='Price', kde=True, stat='density')

The plots above suggest that the features are approximately normally distributed.

## Boxplot

In [None]:
# Horizontal boxplot of the column's spread
sns.boxplot(data=df, x='Avg. Area Income')

In [None]:
# Horizontal boxplot of the column's spread
sns.boxplot(data=df, x='Avg. Area House Age')

In [None]:
# Horizontal boxplot of the column's spread
sns.boxplot(data=df, x='Avg. Area Number of Rooms')

In [None]:
# Horizontal boxplot of the column's spread
sns.boxplot(data=df, x='Avg. Area Number of Bedrooms')

In [None]:
# Horizontal boxplot of the column's spread
sns.boxplot(data=df, x='Area Population')

In [None]:
# Horizontal boxplot of the column's spread
sns.boxplot(data=df, x='Price')

The boxplots show that the data contain outliers.

# Preprocessing

In [None]:
# Number of missing values per column
df.isnull().sum()

In [None]:
# Summary statistics of the numeric columns
df.describe()

In [None]:
# Drop the free-text Address column before the numeric analysis below.
# errors='ignore' keeps the cell safe to re-run: on a second execution
# 'Address' is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['Address'], errors='ignore')
df.head()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
f, ax = plt.subplots(figsize=(10,8))
corr = df.corr()
# The mask is all False, so no cell is hidden. The builtin `bool` replaces
# `np.bool`, which no longer exists in current NumPy releases.
sns.heatmap(corr, annot=True, mask=np.zeros_like(corr, dtype=bool),
           cmap = sns.diverging_palette(240, 10, as_cmap = True), 
           square = True, ax = ax)

In [None]:
# Correlation of every column with Price, sorted in ascending order
df.corr()['Price'].sort_values()

In [None]:
# Standardize every column of df and keep the original column labels.
std = StandardScaler()
df_std = pd.DataFrame(std.fit_transform(df), columns=df.columns)

In [None]:
# Preview the standardized frame
df_std.head()

# Modeling

In [None]:
# Features are every column except the target; the target is Price.
X = df.drop(columns=['Price'])
y = df['Price']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# One shape per line: X_train, X_test, y_train, y_test
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

In [None]:
# statsmodels' OLS does not add an intercept by itself, so a constant column
# is prepended explicitly; without it the fit is forced through the origin
# and the reported p-values describe that restricted model.
# X_train_dropped is the working design matrix that the elimination step
# below shrinks; X_train itself is left untouched.
X_train_dropped = sm.add_constant(X_train)

regressor = sm.OLS(y_train, X_train_dropped).fit()
print(regressor.summary())

## Backward elimination: repeatedly remove the feature with the highest p-value

In [None]:
# Backward elimination: while the largest p-value exceeds 0.05, drop that
# feature from X_train_dropped and refit the OLS model on what remains.
while True:
    worst_pvalue = regressor.pvalues.max()
    if worst_pvalue > 0.05:
        # Label-based lookup of the offending feature; positional `series[0]`
        # on a label-indexed Series is deprecated in recent pandas.
        worst_feature = regressor.pvalues.idxmax()
        print("Dropping " + worst_feature + " and running regression again because pvalue is: " + str(worst_pvalue))
        X_train_dropped = X_train_dropped.drop(columns = [worst_feature])
        if X_train_dropped.shape[1] == 0:
            # Nothing left to regress on; fitting an empty design matrix would fail.
            print("No features left after elimination")
            break
        regressor = sm.OLS(y_train, X_train_dropped).fit()
    else:
        print("All p values less than 0.05")
        break


In [None]:
# Summary of the model that remains after the elimination loop
print(regressor.summary())


## Feature selection with SelectKBest

In [None]:
# Feature names in the same order as the columns of X, used later to report
# which features a selector kept.
column_names = df.drop(columns = ['Price']).columns

no_of_features = []
r_squared_train = []
r_squared_test = []

# Sweep k from 1 up to and including the full feature count, so the
# "all features" model is part of the comparison as well.
for k in range(1, X_train.shape[1] + 1):
    selector = SelectKBest(f_regression, k = k)
    # The selector is fitted on the training split only and then applied to the test split.
    X_train_transformed = selector.fit_transform(X_train, y_train)
    X_test_transformed = selector.transform(X_test)
    regressor = LinearRegression()
    regressor.fit(X_train_transformed, y_train)
    no_of_features.append(k)
    r_squared_train.append(regressor.score(X_train_transformed, y_train))
    r_squared_test.append(regressor.score(X_test_transformed, y_test))
    
# Explicit labels give the legend an entry per curve so train and test R^2
# can be told apart.
sns.lineplot(x = no_of_features, y = r_squared_train, label = 'train')
sns.lineplot(x = no_of_features, y = r_squared_test, label = 'test')

In [None]:
# k = 4 was chosen from the orange line in the plot above
# (presumably the test-set R^2 curve, since it is the second line drawn - confirm).
selector = SelectKBest(f_regression, k = 4)
# Fit the selector on the training split only, then apply it to the test split.
X_train_transformed = selector.fit_transform(X_train, y_train)
X_test_transformed = selector.transform(X_test)
# Names of the columns the selector kept
column_names[selector.get_support()]


In [None]:
def regression_model(model, train_features=None, train_target=None,
                     test_features=None, test_target=None):
    """
    Fit the regression model passed in and score it on held-out data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``score(X, y)``
        The model to train. It is fitted in place.
    train_features, train_target : optional
        Data to fit on. When omitted, the notebook-level
        ``X_train_transformed`` / ``y_train`` are used.
    test_features, test_target : optional
        Data to score on. When omitted, the notebook-level
        ``X_test_transformed`` / ``y_test`` are used.

    Returns
    -------
    tuple
        ``(regressor, score)``: the fitted model (the same object that was
        passed in) and the value returned by its ``score`` method.
    """
    # Notebook globals are looked up only when the matching argument is
    # omitted, so existing one-argument calls behave exactly as before while
    # explicit data makes the function independent of kernel state.
    if train_features is None:
        train_features = X_train_transformed
    if train_target is None:
        train_target = y_train
    if test_features is None:
        test_features = X_test_transformed
    if test_target is None:
        test_target = y_test

    regressor = model
    regressor.fit(train_features, train_target)
    score = regressor.score(test_features, test_target)
    return regressor, score

In [None]:
# Evaluate every candidate model on the currently selected features and
# collect one result row per model.
models_to_evaluate = [LinearRegression(), Ridge(), Lasso(), SVR(), RandomForestRegressor(), MLPRegressor(), XGBRegressor(), GradientBoostingRegressor()]

# Rows are gathered in a list and turned into a DataFrame once:
# DataFrame.append no longer exists in pandas 2.x, and growing a frame
# row by row is quadratic anyway.
linear_results = []
for model in models_to_evaluate:
    regressor, score = regression_model(model)
    linear_results.append({"Features": "Linear", "Model": model, "Score": round(score, 2)})

model_performance = pd.DataFrame(linear_results, columns = ["Features", "Model", "Score"])

model_performance.sort_values(by = 'Score', ascending = False)

In [None]:
# Expand the features with PolynomialFeatures (default settings); the
# expansion is fitted on the training split and applied to the test split.
poly = PolynomialFeatures()
X_train_transformed_poly = poly.fit_transform(X_train)
X_test_transformed_poly = poly.transform(X_test)

print(X_train_transformed_poly.shape)

no_of_features = []
r_squared = []
r_squared_holdout = []

# Sweep k from 3 up to one less than the number of polynomial columns.
# Loop-local names are used so the sweep does not overwrite the
# X_train_transformed / X_test_transformed pair that regression_model reads.
for k in range(3, X_train_transformed_poly.shape[1]):
    k_selector = SelectKBest(f_regression, k = k)
    X_train_k = k_selector.fit_transform(X_train_transformed_poly, y_train)
    X_test_k = k_selector.transform(X_test_transformed_poly)
    linear_model = LinearRegression()
    linear_model.fit(X_train_k, y_train)
    no_of_features.append(k)
    r_squared.append(linear_model.score(X_train_k, y_train))
    # Held-out R^2 is tracked as well: the training R^2 of a least-squares
    # fit does not decrease as features are added, so it cannot guide the choice of k.
    r_squared_holdout.append(linear_model.score(X_test_k, y_test))
    
sns.lineplot(x = no_of_features, y = r_squared, label = 'train')
sns.lineplot(x = no_of_features, y = r_squared_holdout, label = 'test')

In [None]:
# Keep the 16 best-scoring polynomial features
# (NOTE(review): 16 is presumably read off the plot above - confirm).
selector = SelectKBest(f_regression, k = 16)
# Fit on the training split only, then apply the same selection to the test split;
# these two names are the data that regression_model() trains and scores on.
X_train_transformed = selector.fit_transform(X_train_transformed_poly, y_train)
X_test_transformed = selector.transform(X_test_transformed_poly)

In [None]:
# Re-evaluate fresh instances of the same models on the currently selected
# features and add the results to the comparison table.
models_to_evaluate = [LinearRegression(), Ridge(), Lasso(), SVR(), RandomForestRegressor(), MLPRegressor(), XGBRegressor(), GradientBoostingRegressor()]

# Rows are gathered in a list and concatenated once: DataFrame.append no
# longer exists in pandas 2.x.
poly_results = []
for model in models_to_evaluate:
    regressor, score = regression_model(model)
    poly_results.append({"Features": "Polynomial", "Model": model, "Score": round(score, 2)})

# Earlier "Polynomial" rows are dropped first so that re-running this cell
# replaces them instead of piling up duplicates.
model_performance = pd.concat(
    [model_performance[model_performance["Features"] != "Polynomial"], pd.DataFrame(poly_results)],
    ignore_index=True,
)

model_performance.sort_values(by = 'Score', ascending = False)


# Thanks for reading!
## If you liked this notebook, please upvote it or leave a comment with your opinion