# Linear Regression

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold, cross_val_predict, train_test_split
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score

In [2]:
def linear_reg(X_train, X_test, y_train, y_test, cv_folds=5, decimal_places=2):
    """Fit a linear regression and summarise cross-validation and test errors.

    Out-of-fold predictions for the training data are produced once with a
    shuffled ``KFold`` (``random_state=42``). The per-fold scores are computed
    from those same predictions, so each fold is fitted once instead of once
    per metric. The estimator is then refitted on the whole training set and
    evaluated on the test set.

    All predictions that feed the pooled CV metrics and the test metrics are
    clipped at 0 (presumably the target is non-negative -- confirm with the
    data owner).

    Args:
        X_train: Training features (array-like accepted by scikit-learn).
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.
        cv_folds: Number of ``KFold`` splits.
        decimal_places: Number of decimals the reported metrics are rounded to.

    Returns:
        A ``(y_pred_test, df_errors)`` tuple. ``y_pred_test`` holds the clipped
        test predictions. ``df_errors`` is a one-row DataFrame with columns:

        * ``'MAE CV'``, ``'medAE CV'``: errors pooled over all clipped
          out-of-fold predictions.
        * ``'R2 CV'``, ``'MAE Train (mean)'``, ``'medAE Train (mean)'``: mean
          of the per-fold scores on each held-out fold, computed from the
          unclipped predictions.
        * ``'MAE Test'``, ``'medAE Test'``, ``'R2 Test'``: scores of the
          clipped test predictions.
    """
    regr = LinearRegression()
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)

    # Prediction via cross validation (each sample is predicted by the model
    # that did not see it during fitting).
    y_pred_cv = cross_val_predict(regr, X_train, y_train, cv=kf)

    # Per-fold scores. The integer random_state makes kf.split() yield the
    # same folds that cross_val_predict used, so slicing the out-of-fold
    # predictions reproduces the per-fold scores without refitting the model.
    y_train_arr = np.asarray(y_train)
    fold_mae, fold_medae, fold_r2 = [], [], []
    for _, val_idx in kf.split(X_train):
        y_val_true = y_train_arr[val_idx]
        y_val_pred = y_pred_cv[val_idx]
        fold_mae.append(mean_absolute_error(y_val_true, y_val_pred))
        fold_medae.append(median_absolute_error(y_val_true, y_val_pred))
        fold_r2.append(r2_score(y_val_true, y_val_pred))

    # Clip predictions to be non-negative
    y_pred_cv = np.clip(y_pred_cv, 0, None)

    # Cross-validation errors pooled over the whole training set
    mae_cv = mean_absolute_error(y_train, y_pred_cv)
    medAE_cv = median_absolute_error(y_train, y_pred_cv)

    # Fitting the model on all training data
    regr.fit(X_train, y_train)
    y_pred_test = regr.predict(X_test)

    # Clip predictions to be non-negative
    y_pred_test = np.clip(y_pred_test, 0, None)

    # Errors on the test set
    mae_test = mean_absolute_error(y_test, y_pred_test)
    medae_test = median_absolute_error(y_test, y_pred_test)
    r2_test = r2_score(y_test, y_pred_test)

    error_metrics = {
        'MAE CV': mae_cv,
        'medAE CV': medAE_cv,
        'R2 CV': np.mean(fold_r2),
        'MAE Train (mean)': np.mean(fold_mae),
        'medAE Train (mean)': np.mean(fold_medae),
        'MAE Test': mae_test,
        'medAE Test': medae_test,
        'R2 Test': r2_test,
    }

    # Convert the dictionary to a one-row DataFrame and round the values
    df_errors = pd.DataFrame([error_metrics]).round(decimal_places)

    return y_pred_test, df_errors

In [3]:
def linear_reg_scaled(X_train, X_test, y_train, y_test, cv_folds=5, decimal_places=2):
    """Fit a linear regression on standardised data and summarise its errors.

    Features and target are standardised with scalers fitted on the training
    data only. Out-of-fold predictions are produced once with a shuffled
    ``KFold`` (``random_state=42``) and the per-fold scores are derived from
    them, so each fold is fitted once instead of once per metric. The
    estimator is then refitted on the whole (scaled) training set and
    evaluated on the test set. All reported errors are in the original target
    units.

    Predictions that feed the pooled CV metrics and the test metrics are
    mapped back to the original scale and clipped at 0 (presumably the target
    is non-negative -- confirm with the data owner).

    Args:
        X_train: Training features (array-like accepted by scikit-learn).
        X_test: Test features.
        y_train: Training targets; any 1-D array-like (NumPy array, pandas
            Series, list) is accepted.
        y_test: Test targets.
        cv_folds: Number of ``KFold`` splits.
        decimal_places: Number of decimals the reported metrics are rounded to.

    Returns:
        A ``(y_pred_test, df_errors, scale_X, scale_y)`` tuple.
        ``y_pred_test`` holds the clipped test predictions in original units,
        ``scale_X`` / ``scale_y`` are the fitted ``StandardScaler`` objects,
        and ``df_errors`` is a one-row DataFrame with columns:

        * ``'MAE CV'``, ``'medAE CV'``: errors pooled over all clipped
          out-of-fold predictions.
        * ``'R2 CV'``, ``'MAE Train (mean)'``, ``'medAE Train (mean)'``: mean
          of the per-fold scores on each held-out fold, computed from the
          unclipped predictions.
        * ``'MAE Test'``, ``'medAE Test'``, ``'R2 Test'``: scores of the
          clipped test predictions.
    """
    # Scaling data: fit the scalers on the training data only
    scale_X = StandardScaler()
    scale_y = StandardScaler()

    X_train_scaled = scale_X.fit_transform(X_train)
    X_test_scaled = scale_X.transform(X_test)
    # np.asarray lets pandas Series / lists through; they have no .reshape.
    y_train_arr = np.asarray(y_train)
    y_train_scaled = scale_y.fit_transform(y_train_arr.reshape(-1, 1)).ravel()

    regr = LinearRegression()
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)

    # Prediction via cross validation, in scaled target units
    y_pred_scaled_cv = cross_val_predict(regr, X_train_scaled, y_train_scaled, cv=kf)

    # Per-fold scores. The integer random_state makes kf.split() yield the
    # same folds that cross_val_predict used, so slicing the out-of-fold
    # predictions reproduces the per-fold scores without refitting the model.
    # Absolute errors are converted back to original units with the factor
    # the scaler actually divided by; it differs from np.std(y_train) when the
    # target is (near-)constant and the scaler falls back to 1.
    y_scale = scale_y.scale_[0]
    fold_mae, fold_medae, fold_r2 = [], [], []
    for _, val_idx in kf.split(X_train_scaled):
        y_val_true = y_train_scaled[val_idx]
        y_val_pred = y_pred_scaled_cv[val_idx]
        fold_mae.append(metrics.mean_absolute_error(y_val_true, y_val_pred) * y_scale)
        fold_medae.append(metrics.median_absolute_error(y_val_true, y_val_pred) * y_scale)
        fold_r2.append(r2_score(y_val_true, y_val_pred))

    # Back to the original scale, then clip predictions to be non-negative
    y_pred_cv = scale_y.inverse_transform(y_pred_scaled_cv.reshape(-1, 1)).ravel()
    y_pred_cv = np.clip(y_pred_cv, 0, None)

    # Cross-validation errors pooled over the whole training set
    mae_cv = metrics.mean_absolute_error(y_train, y_pred_cv)
    medAE_cv = metrics.median_absolute_error(y_train, y_pred_cv)

    # Fitting the model on all training data
    regr.fit(X_train_scaled, y_train_scaled)
    y_pred_scaled_test = regr.predict(X_test_scaled)
    y_pred_test = scale_y.inverse_transform(y_pred_scaled_test.reshape(-1, 1)).ravel()

    # Clip predictions to be non-negative
    y_pred_test = np.clip(y_pred_test, 0, None)

    # Errors on the test set
    mae_test = metrics.mean_absolute_error(y_test, y_pred_test)
    medae_test = metrics.median_absolute_error(y_test, y_pred_test)
    r2_test = r2_score(y_test, y_pred_test)

    error_metrics = {
        'MAE CV': mae_cv,
        'medAE CV': medAE_cv,
        'R2 CV': np.mean(fold_r2),
        'MAE Train (mean)': np.mean(fold_mae),
        'medAE Train (mean)': np.mean(fold_medae),
        'MAE Test': mae_test,
        'medAE Test': medae_test,
        'R2 Test': r2_test,
    }

    # Convert the dictionary to a one-row DataFrame and round the values
    df_errors = pd.DataFrame([error_metrics]).round(decimal_places)

    return y_pred_test, df_errors, scale_X, scale_y