# Linear Regression

In [21]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import (
    cross_val_score,
    cross_validate,
    KFold,
    cross_val_predict,
    train_test_split,
)
from sklearn.preprocessing import StandardScaler

In [22]:
def linear_reg(X_train, y_train, cv_folds=5, decimal_places=2):
    """Cross-validate an ordinary least-squares model on the training data.

    A fresh ``LinearRegression`` is evaluated with a shuffled ``KFold``
    (fixed ``random_state=42`` so the folds are reproducible between calls).

    Args:
        X_train: Feature matrix passed straight to scikit-learn.
        y_train: Target values passed straight to scikit-learn.
        cv_folds: Number of K-fold splits.
        decimal_places: Number of decimals the per-fold scores are rounded to.

    Returns:
        A ``(y_pred, df_errors)`` tuple where ``y_pred`` holds the
        out-of-fold predictions from ``cross_val_predict`` and ``df_errors``
        is a DataFrame with one row per fold and the columns ``'MAE'``,
        ``'medAE'`` and ``'R2'``.
    """
    regr = LinearRegression()
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)

    # Score every metric from a single fit per fold instead of re-running the
    # whole cross-validation once per metric.
    cv_results = cross_validate(
        regr,
        X_train,
        y_train,
        cv=kf,
        scoring=('neg_mean_absolute_error', 'neg_median_absolute_error', 'r2'),
    )

    # scikit-learn reports errors as negated "scores"; flip the sign back so
    # MAE and medAE read as positive errors.
    df_errors = pd.DataFrame({
        'MAE': -cv_results['test_neg_mean_absolute_error'],
        'medAE': -cv_results['test_neg_median_absolute_error'],
        'R2': cv_results['test_r2'],
    }).round(decimal_places)

    # Out-of-fold predictions, produced on the same folds as the scores above.
    y_pred = cross_val_predict(regr, X_train, y_train, cv=kf)

    return y_pred, df_errors

In [23]:
# Disabled earlier draft of linear_reg_scaled, kept as a bare string literal so
# it is never executed as code. Its final `return` yields nothing because the
# result tuple is commented out.
"""def linear_reg_scaled(X_train, y_train, cv_folds=5, decimal_places=2):
    # Scaling Data
    scale_X = StandardScaler()
    scale_y = StandardScaler()

    # Fit the scaler on the training data and transform the training data
    X_train_scaled = scale_X.fit_transform(X_train)
    y_train_scaled = scale_y.fit_transform(y_train.reshape(-1, 1)).flatten()

    regr = LinearRegression()
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    
    
    # prediction via cross validation
    y_pred_scaled = cross_val_predict(regr, X_train_scaled, y_train_scaled, cv=kf)
    
    # Inverse transform the predictions to the original scale  
    y_pred = scale_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()
    
    # Cross-validation
    mae_scores = -cross_val_score(regr, X_train_scaled, y_train_scaled, cv=kf, scoring='neg_mean_absolute_error')
    medae_scores = -cross_val_score(regr, X_train_scaled, y_train_scaled, cv=kf, scoring='neg_median_absolute_error')
    r2_scores = cross_val_score(regr, X_train_scaled, y_train_scaled, cv=kf, scoring='r2')
    
    # Dictionary of errors then convert to dataframe
    error_metrics = {
        'MAE': mae_scores,
        #'MAE': [np.mean(mae_scores)],
        'medAE': medae_scores,
        #'medAE': medae_scores,
        'R2': r2_scores
        #'R2': [np.mean(r2_scores)]
    }
    
    # Convert the dictionary to a pandas DataFrame and round the values
    df_errors = pd.DataFrame(error_metrics).round(decimal_places)
     
    return #y_pred, df_errors, scale_X, scale_y
"""


"def linear_reg_scaled(X_train, y_train, cv_folds=5, decimal_places=2):\n    # Scaling Data\n    scale_X = StandardScaler()\n    scale_y = StandardScaler()\n\n    # Fit the scaler on the training data and transform the training data\n    X_train_scaled = scale_X.fit_transform(X_train)\n    y_train_scaled = scale_y.fit_transform(y_train.reshape(-1, 1)).flatten()\n\n    regr = LinearRegression()\n    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)\n    \n    \n    # prediction via cross validation\n    y_pred_scaled = cross_val_predict(regr, X_train_scaled, y_train_scaled, cv=kf)\n    \n    # Inverse transform the predictions to the original scale  \n    y_pred = scale_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()\n    \n    # Cross-validation\n    mae_scores = -cross_val_score(regr, X_train_scaled, y_train_scaled, cv=kf, scoring='neg_mean_absolute_error')\n    medae_scores = -cross_val_score(regr, X_train_scaled, y_train_scaled, cv=kf, scoring='neg_median_abso

In [31]:
def linear_reg_scaled(X_train, y_train, cv_folds=5, decimal_places=2):
    """Cross-validate a linear regression on standardized features and target.

    Both ``X_train`` and ``y_train`` are standardized with their own
    ``StandardScaler`` before a ``LinearRegression`` is evaluated with a
    shuffled ``KFold`` (fixed ``random_state=42``).

    Args:
        X_train: Feature matrix accepted by ``StandardScaler.fit_transform``.
        y_train: Target values; any array-like (ndarray, pandas Series, list)
            is accepted and treated as a single column.
        cv_folds: Number of K-fold splits.
        decimal_places: Number of decimals the per-fold scores are rounded to.

    Returns:
        A ``(y_pred, df_errors, scale_X, scale_y)`` tuple:

        * ``y_pred`` - out-of-fold predictions, inverse-transformed back to
          the original target scale.
        * ``df_errors`` - DataFrame with one row per fold and the columns
          ``'MAE'``, ``'medAE'`` and ``'R2'``.
        * ``scale_X`` / ``scale_y`` - the scalers fitted on the full
          ``X_train`` / ``y_train``.

    Note:
        The scores are computed against the *standardized* target, so MAE and
        medAE are expressed in standardized units rather than in the units of
        ``y_pred``. The scalers are also fitted on the whole training set
        before the folds are split, so each fold's held-out rows contribute
        to the scaling statistics.
    """
    scale_X = StandardScaler()
    scale_y = StandardScaler()

    X_train_scaled = scale_X.fit_transform(X_train)
    # np.asarray lets y_train be a pandas Series or a list; those have no
    # .reshape method of their own.
    y_column = np.asarray(y_train).reshape(-1, 1)
    y_train_scaled = scale_y.fit_transform(y_column).ravel()

    regr = LinearRegression()
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)

    # Out-of-fold predictions in standardized space, mapped back to the
    # original target scale for the caller.
    y_pred_scaled = cross_val_predict(regr, X_train_scaled, y_train_scaled, cv=kf)
    y_pred = scale_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

    # Score every metric from a single fit per fold instead of re-running the
    # whole cross-validation once per metric.
    cv_results = cross_validate(
        regr,
        X_train_scaled,
        y_train_scaled,
        cv=kf,
        scoring=('neg_mean_absolute_error', 'neg_median_absolute_error', 'r2'),
    )

    # scikit-learn reports errors as negated "scores"; flip the sign back so
    # MAE and medAE read as positive errors.
    df_errors = pd.DataFrame({
        'MAE': -cv_results['test_neg_mean_absolute_error'],
        'medAE': -cv_results['test_neg_median_absolute_error'],
        'R2': cv_results['test_r2'],
    }).round(decimal_places)

    return y_pred, df_errors, scale_X, scale_y

'"    \n    # Calculate error metrics\n    mse = mean_squared_error(y_train, y_pred)\n    rmse = np.sqrt(mse)\n    mae = mean_absolute_error(y_train, y_pred)\n\n    # Create a DataFrame with error metrics\n    df_errors = pd.DataFrame({\n        \'Metric\': [\'MSE\', \'RMSE\', \'MAE\'],\n        \'Value\': [round(mse, decimal_places), round(rmse, decimal_places), round(mae, decimal_places)]\n    })\n\n    return y_pred, df_errors, scale_X, scale_y'