# Linear Regression

# In[2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_predict, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler

# In[3]:
def linear_reg(X_train, y_train, cv_folds=5, decimal_places=2):
    """Cross-validate an ordinary least-squares model on the training data.

    Args:
        X_train: Feature matrix accepted by scikit-learn estimators.
        y_train: Target values aligned with the rows of ``X_train``.
        cv_folds: Number of K-fold splits.
        decimal_places: Number of decimals the reported metrics are rounded to.

    Returns:
        A tuple ``(y_pred, df_errors)`` where ``y_pred`` holds the
        cross-validated prediction for every training row and ``df_errors``
        is a DataFrame with one row per fold and the columns ``MAE``,
        ``medAE`` and ``R2``.
    """
    regr = LinearRegression()
    # Fixed seed: the scoring pass and the prediction pass below split the
    # data into exactly the same folds.
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)

    # Score all three metrics in one cross-validation pass instead of
    # refitting the model once per metric.
    scoring = {
        'MAE': 'neg_mean_absolute_error',
        'medAE': 'neg_median_absolute_error',
        'R2': 'r2',
    }
    cv_results = cross_validate(regr, X_train, y_train, cv=kf, scoring=scoring)

    # scikit-learn reports error metrics negated ("greater is better");
    # flip the sign back so MAE / medAE are positive errors.
    error_metrics = {
        'MAE': -cv_results['test_MAE'],
        'medAE': -cv_results['test_medAE'],
        'R2': cv_results['test_R2'],
    }
    df_errors = pd.DataFrame(error_metrics).round(decimal_places)

    # Out-of-fold predictions: each row is predicted by the model that did
    # not see it during fitting.
    y_pred = cross_val_predict(regr, X_train, y_train, cv=kf)

    return y_pred, df_errors

# In[1]:
def linear_reg_scaled(X_train, y_train, cv_folds=5, decimal_places=2):
    """Cross-validate a linear regression on standardized features and target.

    Both ``X_train`` and ``y_train`` are standardized with their own
    ``StandardScaler`` before the cross-validation in ``linear_reg`` runs.

    Args:
        X_train: Feature matrix accepted by scikit-learn estimators.
        y_train: Target values (ndarray, list or pandas Series) aligned with
            the rows of ``X_train``.
        cv_folds: Number of K-fold splits.
        decimal_places: Number of decimals the reported metrics are rounded to.

    Returns:
        A tuple ``(y_pred, df_errors, scale_X, scale_y)``:

        * ``y_pred`` - cross-validated predictions, transformed back to the
          original target units.
        * ``df_errors`` - per-fold ``MAE``, ``medAE`` and ``R2``. These are
          computed on the standardized target, so ``MAE`` and ``medAE`` are
          in standardized units, not in the units of ``y_pred``.
        * ``scale_X`` / ``scale_y`` - the fitted scalers, for transforming
          new data the same way.

    Note:
        The scalers are fit on the full training set before the folds are
        drawn, so each fold's held-out rows contribute to the scaling
        statistics.
    """
    scale_X = StandardScaler()
    scale_y = StandardScaler()

    X_train_scaled = scale_X.fit_transform(X_train)
    # np.asarray allows y_train to be a list or pandas Series, which have no
    # usable .reshape; StandardScaler needs a 2-D column.
    y_column = np.asarray(y_train).reshape(-1, 1)
    y_train_scaled = scale_y.fit_transform(y_column).flatten()

    # Same model, folds and metrics as the unscaled variant.
    y_pred_scaled, df_errors = linear_reg(
        X_train_scaled,
        y_train_scaled,
        cv_folds=cv_folds,
        decimal_places=decimal_places,
    )

    # Map the predictions back to the original target scale.
    y_pred = scale_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

    return y_pred, df_errors, scale_X, scale_y



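# Cell output: NameError: name 'train_test_split' is not defined
# (train_test_split lives in sklearn.model_selection and must be imported before use)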