In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from ipywidgets import interact
import ipywidgets as widgets
from ipywidgets import interact, FloatSlider
from IPython.display import display, clear_output

import os

# Root of the course repository. Defaults to the original author's checkout so
# existing behaviour is unchanged; set the EDS232_ROOT environment variable to
# point the notebook at a different location without editing this cell.
root = os.environ.get(
    'EDS232_ROOT',
    os.path.join('/', 'Users', 'jpcohen', 'MEDS', 'eds_232', 'eds-232-discussion'),
)

# Excel workbook that the later cells read their sheets from.
data = os.path.join(root, 'data', 'Hurricane Irene and the Hudson River.xlsx')

In [39]:
# Read the three sheets used in this analysis, selected by position in the
# workbook: sheet 5 -> dissolved oxygen, sheet 2 -> turbidity, sheet 1 -> rainfall.
hurricane_do = pd.read_excel(data, sheet_name=5)
hurricane_turbidity = pd.read_excel(data, sheet_name=2)
hurricane_rainfall = pd.read_excel(data, sheet_name=1)

# Put the sheets side by side (joined on their row index), discard the
# duplicated timestamp columns and the Piermont columns, strip stray whitespace
# from the remaining headers, and index the result by timestamp.
# The drop runs before the strip, so the names below are the raw sheet headers.
df = (
    hurricane_do
    .join(hurricane_turbidity, lsuffix='', rsuffix='_d')
    .join(hurricane_rainfall, lsuffix='', rsuffix='_dd')
    .drop(
        columns=[
            'Date Time (ET)_d',
            'Date Time (ET)_dd',
            'Piermont D.O. (ppm)',
            'Piermont Turbidity in NTU',
            'Piermont  Rainfall Daily Accumulation (Inches)',
        ]
    )
    .rename(columns=str.strip)
    .set_index('Date Time (ET)')
)

# Short snake_case names; the order must match the six remaining columns.
df.columns = [
    'albany_do',
    'norrie_do',
    'albany_turbidity',
    'norrie_turbidity',
    'albany_rainfall',
    'norrie_rainfall',
]

In [41]:
# Inspect the combined, renamed DataFrame built in the previous cell.
df

Unnamed: 0_level_0,albany_do,norrie_do,albany_turbidity,norrie_turbidity,albany_rainfall,norrie_rainfall
Date Time (ET),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2011-08-25 00:00:00,7.68,7.81,4.0,9.3,0.000000,0.000000
2011-08-25 00:15:00,7.60,7.73,3.9,8.4,0.000000,0.000000
2011-08-25 00:30:00,7.57,7.63,4.3,7.9,0.000000,0.000000
2011-08-25 00:45:00,7.72,7.67,4.7,8.1,0.000000,0.000000
2011-08-25 01:00:00,7.74,7.63,4.4,8.4,0.000000,0.000000
...,...,...,...,...,...,...
2011-09-05 22:45:00,8.73,6.84,47.2,144.1,0.629999,1.219998
2011-09-05 23:00:00,8.76,6.78,56.7,139.7,0.639999,1.239998
2011-09-05 23:15:00,8.66,6.83,47.0,141.2,0.649999,1.259997
2011-09-05 23:30:00,8.75,6.79,48.7,127.9,0.679999,1.269997


Multiple Linear Regression

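Now that our data is cleaned, let’s carry out a multiple linear regression. We will practice defining the predictors and the target variable, splitting the data into training and testing sets, fitting the model, and evaluating its predictions.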

In [48]:
# Predictors (Albany rainfall and dissolved oxygen) and target (Albany turbidity).
# The double brackets keep both x and y as DataFrames.
x = df[['albany_rainfall', 'albany_do']]
y = df[['albany_turbidity']]

# Hold out 30% of the rows for testing; the fixed random_state makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

# Create and fit the model on the training rows only.
model = LinearRegression().fit(x_train, y_train)

# Predict on the held-out rows and report RMSE and R-squared.
y_pred = model.predict(x_test)
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}")
print(f"R-squared: {r2_score(y_test, y_pred)}")

RMSE: 221.9143474905527
R-squared: 0.490738951845751


# Create a widget to evaluate different models

In [67]:
# Multi-select list offering every column of df as a candidate predictor;
# the first column starts out selected.
predictor_selector = widgets.SelectMultiple(
    options=tuple(df.columns),
    value=[df.columns[0]],
    description='Predictors',
)

# Single-choice dropdown for the target column; the second column is the default.
target_selector = widgets.Dropdown(
    options=tuple(df.columns),
    value=df.columns[1],
    description='Target',
)

# Button that triggers a model fit for the current selection.
evaluate_button = widgets.Button(description='Evaluate Model')

# Output area that receives whatever the button callback prints.
output = widgets.Output()

# Callback run each time the "Evaluate Model" button is clicked.
def evaluate_model(b):
    """Fit a linear regression on the selected columns and print test metrics.

    Reads the current choices from ``predictor_selector`` and
    ``target_selector``, fits a ``LinearRegression`` on a 70/30 train/test
    split of ``df`` (fixed ``random_state``, so repeated clicks with the same
    selection give the same numbers), and prints R^2 and RMSE for the held-out
    rows. Everything is written into the ``output`` widget, which is cleared
    first.

    Parameters
    ----------
    b
        The object passed in by ``on_click``; it is not used.

    Returns
    -------
    None
        Returns early, after printing a message, when no predictor is selected
        or when the target is also selected as a predictor.
    """
    with output:
        clear_output(wait = True)  # replace the previous result instead of appending

        selected_predictors = list(predictor_selector.value)
        target = target_selector.value

        # An empty selection would otherwise fail inside model.fit with a traceback.
        if not selected_predictors:
            print("Select at least one predictor.")
            return

        # Predicting a column from itself would give a meaningless perfect fit.
        if target in selected_predictors:
            print("Target variable must not be in the predictors.")
            return

        # prepare the data
        x = df[selected_predictors]
        y = df[target]

        # split data into training and testing sets
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

        # create and fit the model
        model = LinearRegression()
        model.fit(x_train, y_train)

        # predict on the held-out rows and compute R^2 and RMSE
        y_pred = model.predict(x_test)
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        # The value is the square root of the MSE, so it is labelled RMSE.
        print(f"R^2: {r2:.4f}")
        print(f"RMSE: {rmse:.4f}")
        
# Register the click handler, then render the controls together with the
# output area that the handler writes into.
evaluate_button.on_click(evaluate_model)
display(predictor_selector, target_selector, evaluate_button, output)

SelectMultiple(description='Predictors', index=(0,), options=('albany_do', 'norrie_do', 'albany_turbidity', 'n…

Dropdown(description='Target', index=1, options=('albany_do', 'norrie_do', 'albany_turbidity', 'norrie_turbidi…

Button(description='Evaluate Model', style=ButtonStyle())

Output()