<a href="https://colab.research.google.com/github/sasansharee/Sasan_MMA_Projects/blob/main/Auto_Scout_Project_Linear_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Display every row and every column when a DataFrame is shown (no truncation).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
# Load the prepared dataset.
# NOTE(review): the path is absolute ('/content/...'), so it presumably only resolves in a
# Colab-style environment — confirm before running elsewhere.
df_org = pd.read_csv('/content/df_33_cleaned_filled_outlier_dummies.csv')

In [None]:
# Work on a copy so the frame as loaded (df_org) stays untouched.
df = df_org.copy()

In [None]:
# Features are every column except 'price'; 'price' is the regression target.
X = df.drop(['price'], axis = 1)
y = df['price']

In [None]:
# Sanity check: X and y should have the same number of rows.
X.shape, y.shape

**Polynomial Conversion**

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Degree-3 polynomial feature expander without the constant (bias) column.
poly = PolynomialFeatures(degree = 3, include_bias = False)

In [None]:
# X_poly = poly.fit_transform(X)
# Intentionally NOT run: the data is too large for the polynomial expansion,
# so polynomial features are not used for this dataset.

**Train_Test_Split**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows as the test split; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 101)

**Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [None]:
scaler = StandardScaler()

In [None]:
# Fit on the training split only, so no test-set statistics enter the scaler.
scaler.fit(X_train)

In [None]:
X_train_scaled = scaler.transform(X_train)

In [None]:
# The test split is transformed with the scaler that was fitted on the training split.
X_test_scaled = scaler.transform(X_test)

In [None]:
# Mean over ALL scaled values pooled together (no axis given), rounded to the nearest integer.
np.mean(X_train_scaled).round(), np.mean(X_test_scaled).round()

In [None]:
# Standard deviation over ALL scaled values pooled together (no axis given), rounded.
np.std(X_train_scaled).round(), np.std(X_test_scaled).round()

In [None]:
pd.DataFrame(X_train_scaled).agg({'mean', 'std'}).round()

In [None]:
pd.DataFrame(X_test_scaled).agg({'mean', 'std'}).round()

**Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
linear = LinearRegression()

In [None]:
# Train on the scaled training features.
linear.fit(X_train_scaled, y_train)

In [None]:
# Predictions on the same data the model was fitted on (used for the train-side scores).
y_train_pred = linear.predict(X_train_scaled)

In [None]:
# Predictions on the held-out test split.
y_pred = linear.predict(X_test_scaled)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
def train_val(y_train, y_train_pred, y_test, y_pred, name):
  """Build a train-vs-test score table for one model.

  Parameters
  ----------
  y_train, y_train_pred : true and predicted targets for the training split.
  y_test, y_pred        : true and predicted targets for the test split.
  name                  : label used as the column prefix.

  Returns
  -------
  pd.DataFrame with columns ``name + '_train'`` and ``name + '_test'`` and
  rows 'R2', 'MAE', 'MSE', 'rMSE'.
  """

  def _metrics(actual, predicted):
    # rMSE is the square root of the MSE, so the MSE is computed once and reused.
    mse = mean_squared_error(actual, predicted)
    return {
      'R2' : r2_score(actual, predicted),
      'MAE' : mean_absolute_error(actual, predicted),
      'MSE' : mse,
      'rMSE' : np.sqrt(mse),
    }

  table = {}
  table[name + '_train'] = _metrics(y_train, y_train_pred)
  table[name + '_test'] = _metrics(y_test, y_pred)
  return pd.DataFrame(table)


In [None]:
# Train/test score table for the plain linear model; kept in a variable for later comparisons.
score_linear = train_val(y_train, y_train_pred, y_test, y_pred, 'linear')
score_linear

**Multicollinearity**

In [None]:
def color_red(val):
  """Return the CSS colour rule used to style one correlation value.

  A value whose magnitude lies strictly between 0.9 and 0.99 is coloured red;
  anything else is coloured black. The magnitude is used so that strong
  negative correlations are flagged as well as strong positive ones. The
  upper bound keeps exact (or near-exact) 1.0 values, such as a feature
  compared with itself, black.

  Parameters
  ----------
  val : a single numeric cell value.

  Returns
  -------
  str of the form 'color : red' or 'color : black'.
  """
  if 0.9 < abs(val) < 0.99:
    color = 'red'
  else:
    color = 'black'
  return f'color : {color}'

In [None]:
# Multicollinearity check: style the pairwise correlation matrix of the features
# (not the raw feature values) so that highly correlated feature pairs stand out.
X.corr().style.applymap(color_red)

**Cross Validation**

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
# 5-fold cross-validation of the linear model on the training split, scored with four metrics.
# The error metrics use the 'neg_' scorers, so they are reported as negated errors.
score_linear_crval = cross_validate(linear, X_train_scaled, y_train, scoring = ['r2', 'neg_mean_absolute_error',
                                                                             'neg_mean_squared_error',
                                                                             'neg_root_mean_squared_error'], cv = 5)

In [None]:
score_linear_crval

In [None]:
# One row per fold, labelled 1-5.
pd.DataFrame(score_linear_crval, index = range(1, 6))

In [None]:
score_linear_crval_df = pd.DataFrame(score_linear_crval, index = range(1, 6))
# Average across folds, skipping the first two columns
# (presumably cross_validate's fit_time / score_time columns — confirm).
score_linear_crval_df.iloc[:, 2:].mean()

In [None]:
linear.coef_

In [None]:
# Coefficients as a one-column frame so they can be placed next to other models' coefficients.
linear_coef_df = pd.DataFrame(linear.coef_, columns = ['linear_coef'])
linear_coef_df

**Ridge**

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Ridge regression with a fixed penalty strength alpha = 1.
ridge = Ridge(alpha = 1, random_state = 42)

In [None]:
ridge.fit(X_train_scaled, y_train)

In [None]:
# NOTE: y_train_pred / y_pred are reassigned here and now hold the ridge predictions.
y_train_pred = ridge.predict(X_train_scaled)
y_pred = ridge.predict(X_test_scaled)

In [None]:
score_ridge = train_val(y_train, y_train_pred, y_test, y_pred, 'ridge')
score_ridge

In [None]:
# Side-by-side score comparison: linear vs ridge.
pd.concat([score_linear, score_ridge], axis = 1)

**Ridge Cross Validation, alpha = 1**

In [None]:
# 5-fold cross-validation of the alpha = 1 ridge model on the training split, scored with four metrics.
score_ridge_crval = cross_validate(ridge, X_train_scaled, y_train, scoring = ['r2', 'neg_mean_absolute_error',
                                                                           'neg_mean_squared_error',
                                                                           'neg_root_mean_squared_error'], cv = 5)

In [None]:
score_ridge_crval

In [None]:
# One row per fold, labelled 1-5.
score_ridge_crval_df = pd.DataFrame(score_ridge_crval, index = range(1, 6))
score_ridge_crval_df

In [None]:
# Average across folds, skipping the first two columns
# (presumably cross_validate's fit_time / score_time columns — confirm).
score_ridge_crval_df.iloc[:, 2:].mean()

In [None]:
ridge.coef_

In [None]:
ridge_coef_df = pd.DataFrame(ridge.coef_, columns = ['ridge_coef'])
ridge_coef_df

In [None]:
# Coefficients of the linear and ridge models side by side.
pd.concat([linear_coef_df, ridge_coef_df], axis = 1)

**Choosing best alpha value with Cross-Validation**

In [None]:
from sklearn.linear_model import RidgeCV

In [None]:
# Candidate penalty strengths: 100 evenly spaced values from 0.01 to 1.
alpha_range = np.linspace(0.01, 1, 100)

In [None]:
# RidgeCV selects alpha from alpha_range with 5-fold CV, scored by negative RMSE.
ridge_cv = RidgeCV(alphas = alpha_range, cv = 5, scoring = 'neg_root_mean_squared_error')

In [None]:
ridge_cv.fit(X_train_scaled, y_train)

In [None]:
# The alpha that was selected.
ridge_cv.alpha_

In [None]:
# Cross-validation score of the selected alpha (negative RMSE, given the scoring chosen above).
ridge_cv.best_score_

In [None]:
# NOTE: y_train_pred / y_pred are reassigned here and now hold the RidgeCV predictions.
y_train_pred = ridge_cv.predict(X_train_scaled)
y_pred = ridge_cv.predict(X_test_scaled)

In [None]:
score_ridge_cv = train_val(y_train, y_train_pred, y_test, y_pred, 'ridge_cv')
score_ridge_cv

In [None]:
# Score comparison of the models fitted so far.
pd.concat([score_linear, score_ridge, score_ridge_cv], axis = 1)

In [None]:
ridge_cv.coef_

In [None]:
ridge_cv_coef_df = pd.DataFrame(ridge_cv.coef_, columns = ['ridge_cv_coef'])

In [None]:
# Coefficient comparison of the models fitted so far.
pd.concat([linear_coef_df, ridge_coef_df, ridge_cv_coef_df], axis = 1)

**Lasso Regression**

In [None]:
from sklearn.linear_model import Lasso, LassoCV

In [None]:
# Lasso regression with a fixed penalty strength alpha = 1.
lasso = Lasso(alpha = 1, random_state = 42)

In [None]:
lasso.fit(X_train_scaled, y_train)

In [None]:
# NOTE: y_train_pred / y_pred are reassigned here and now hold the lasso predictions.
y_train_pred = lasso.predict(X_train_scaled)
y_pred = lasso.predict(X_test_scaled)

In [None]:
score_lasso = train_val(y_train, y_train_pred, y_test, y_pred, 'lasso')
score_lasso

In [None]:
# Score comparison of the models fitted so far.
pd.concat([score_linear, score_ridge, score_ridge_cv, score_lasso], axis = 1)

**Lasso Cross Validation, alpha = 1**

In [None]:
# 5-fold cross-validation of the alpha = 1 lasso model on the training split, scored with four metrics.
scores_lasso_crval = cross_validate(lasso, X_train_scaled, y_train, scoring = ['r2', 'neg_mean_absolute_error',
                                                                               'neg_mean_squared_error',
                                                                               'neg_root_mean_squared_error'], cv = 5)

In [None]:
# One row per fold, labelled 1-5.
scores_lasso_crval_df = pd.DataFrame(scores_lasso_crval, index = range(1, 6))
scores_lasso_crval_df

In [None]:
# Average across folds, skipping the first two columns
# (presumably cross_validate's fit_time / score_time columns — confirm).
scores_lasso_crval_df.iloc[:, 2:].mean()

In [None]:
lasso.coef_

In [None]:
lasso_coef_df = pd.DataFrame(lasso.coef_, columns = ['lasso_coef'])
lasso_coef_df

In [None]:
# Coefficient comparison of the models fitted so far.
pd.concat([linear_coef_df, ridge_coef_df, ridge_cv_coef_df, lasso_coef_df], axis = 1)

**Choosing best alpha value with Cross-Validation for Lasso**

In [None]:
# LassoCV selects alpha from alpha_range using 5-fold cross-validation.
# NOTE(review): max_iter is left at its default here, while the ElasticNetCV cell raises it
# to 100000 — check for convergence warnings when this is fitted.
lasso_cv = LassoCV(alphas = alpha_range, cv = 5, random_state = 42)

In [None]:
lasso_cv.fit(X_train_scaled, y_train)

In [None]:
# The alpha that was selected.
lasso_cv.alpha_

In [None]:
# NOTE: y_train_pred / y_pred are reassigned here and now hold the LassoCV predictions.
y_train_pred = lasso_cv.predict(X_train_scaled)
y_pred = lasso_cv.predict(X_test_scaled)

In [None]:
score_lasso_cv = train_val(y_train, y_train_pred, y_test, y_pred, 'lasso_cv')
score_lasso_cv

In [None]:
# Score comparison of the models fitted so far.
pd.concat([score_linear, score_ridge, score_ridge_cv, score_lasso, score_lasso_cv], axis = 1)

In [None]:
lasso_cv.coef_

In [None]:
lasso_cv_coef_df = pd.DataFrame(lasso_cv.coef_, columns = ['lasso_cv_coef'])
lasso_cv_coef_df

In [None]:
# Coefficient comparison of the models fitted so far.
pd.concat([linear_coef_df, ridge_coef_df, ridge_cv_coef_df, lasso_coef_df, lasso_cv_coef_df], axis = 1)

**Elastic Net**

In [None]:
from sklearn.linear_model import ElasticNet, ElasticNetCV

In [None]:
# Elastic net with fixed alpha = 1 and an l1_ratio of 0.5.
elastic = ElasticNet(alpha = 1, l1_ratio = 0.5, random_state = 42)

In [None]:
elastic.fit(X_train_scaled, y_train)

In [None]:
# NOTE: y_train_pred / y_pred are reassigned here and now hold the elastic net predictions.
y_train_pred = elastic.predict(X_train_scaled)
y_pred = elastic.predict(X_test_scaled)

In [None]:
score_elastic = train_val(y_train, y_train_pred, y_test, y_pred, 'elastic')
score_elastic

In [None]:
# Score comparison of the models fitted so far.
pd.concat([score_linear, score_ridge, score_ridge_cv, score_lasso, score_lasso_cv, score_elastic], axis = 1)

**Elastic Net Cross Validation with alpha = 1, and l1_ratio = 0.5**

In [None]:
# 5-fold cross-validation of the fixed-parameter elastic net on the training split, scored with four metrics.
scores_elastic_crval = cross_validate(elastic, X_train_scaled, y_train, scoring = ['r2', 'neg_mean_absolute_error',
                                                                                 'neg_mean_squared_error',
                                                                                 'neg_root_mean_squared_error'], cv = 5)

In [None]:
# One row per fold, labelled 1-5.
scores_elastic_crval_df = pd.DataFrame(scores_elastic_crval, index = range(1, 6))

In [None]:
# Per-fold scores without the first two columns
# (presumably cross_validate's fit_time / score_time columns — confirm).
scores_elastic_crval_df.iloc[:, 2:]

In [None]:
# Average of those scores across the folds.
scores_elastic_crval_df.iloc[:, 2:].mean()

In [None]:
elastic.coef_

In [None]:
elastic_coef_df = pd.DataFrame(elastic.coef_, columns = ['elastic_coef'])
elastic_coef_df

In [None]:
# Coefficient comparison of the models fitted so far.
pd.concat([linear_coef_df, ridge_coef_df, ridge_cv_coef_df, lasso_coef_df, lasso_cv_coef_df, elastic_coef_df], axis = 1)

**Finding best alpha and l1_ratio using Cross validation**

In [None]:
# ElasticNetCV searches alpha_range combined with the listed l1_ratio values using 5-fold CV;
# max_iter is raised to 100000.
elastic_cv = ElasticNetCV(alphas = alpha_range, l1_ratio=[0.1, 0.5, 0.7,0.9, 0.95, 1], cv = 5,
                                max_iter = 100000,random_state=42)

In [None]:
elastic_cv.fit(X_train_scaled, y_train)

In [None]:
# The alpha that was selected.
elastic_cv.alpha_

In [None]:
# The l1_ratio that was selected.
elastic_cv.l1_ratio_

In [None]:
# NOTE: y_train_pred / y_pred are reassigned here and now hold the ElasticNetCV predictions.
y_train_pred = elastic_cv.predict(X_train_scaled)
y_pred = elastic_cv.predict(X_test_scaled)

In [None]:
score_elastic_cv = train_val(y_train, y_train_pred, y_test, y_pred, "elastic_cv")
score_elastic_cv

In [None]:
# Score comparison of the models fitted so far.
pd.concat([score_linear, score_ridge, score_ridge_cv, score_lasso, score_lasso_cv, score_elastic, score_elastic_cv], axis = 1)

In [None]:
elastic_cv.coef_

In [None]:
elastic_cv_coef_df = pd.DataFrame(elastic_cv.coef_, columns = ['elastic_cv_coef'])

In [None]:
# Coefficient comparison of the models fitted so far.
pd.concat([linear_coef_df, ridge_coef_df, ridge_cv_coef_df, lasso_coef_df, lasso_cv_coef_df, elastic_coef_df, elastic_cv_coef_df], axis = 1)

**GridSearch**

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Elastic net hyper-parameter grid: 7 alpha values x 7 l1_ratio values.
# NOTE(review): 0.012 sits right next to 0.01 — possibly a typo (e.g. for 0.1 or 0.12); confirm.
param_grid = {"alpha":[0.01, 0.012, 0.2, 0.5, 0.6, 0.7, 1],
            "l1_ratio":[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1]}

In [None]:
# Grid search over param_grid with `elastic` as the base estimator, 5-fold CV, scored by negative RMSE.
grid = GridSearchCV(elastic, param_grid, scoring = 'neg_root_mean_squared_error', cv = 5, verbose = 2)

In [None]:
grid.fit(X_train_scaled, y_train)

In [None]:
# Best parameter combination found.
grid.best_params_

In [None]:
# CV score of the best combination (negative RMSE, given the scoring chosen above).
grid.best_score_

In [None]:
# NOTE: y_train_pred / y_pred are reassigned here and now hold the grid search predictions.
y_train_pred = grid.predict(X_train_scaled)
y_pred = grid.predict(X_test_scaled)

In [None]:
score_grid = train_val(y_train, y_train_pred, y_test, y_pred, "GridSearch")
score_grid

In [None]:
# Final score comparison across all fitted models.
pd.concat([score_linear, score_ridge, score_ridge_cv, score_lasso, score_lasso_cv, score_elastic, score_elastic_cv, score_grid], axis = 1)

**Feature importances with Ridge**

In [None]:
from yellowbrick.model_selection import FeatureImportances
from yellowbrick.features import RadViz

# ridge_cv was trained on the scaled features, so the visualizer is given the scaled
# training data too; passing the unscaled frame would be inconsistent with the fitted model.
# Labels come from the original frame so the bars carry the column names.
# The figure size is passed directly to the visualizer rather than through a separate,
# otherwise unused RadViz object.
viz = FeatureImportances(ridge_cv, labels=X_train.columns, size=(720, 3000))
viz.fit(X_train_scaled, y_train)
viz.show()

**Feature importances with Lasso**

In [None]:
from yellowbrick.model_selection import FeatureImportances
from yellowbrick.features import RadViz

# lasso_cv was trained on the scaled features, so the visualizer is given the scaled
# training data too; passing the unscaled frame would be inconsistent with the fitted model.
# Labels come from the original frame so the bars carry the column names.
# The figure size is passed directly to the visualizer rather than through a separate,
# otherwise unused RadViz object.
viz = FeatureImportances(lasso_cv, labels=X_train.columns, size=(720, 3000))
viz.fit(X_train_scaled, y_train)
viz.show()