In [None]:
import pandas as pd
import numpy as np
import wrangle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import sklearn

# modeling methods
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import RFE, SelectKBest, f_regression
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

from pydataset import data
import warnings
warnings.filterwarnings("ignore")

import math
import feature_selection
from sklearn.metrics import mean_squared_error

from sklearn.metrics import explained_variance_score

from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures



In [None]:
# Load the `urine` example dataset through pydataset's `data` loader.
df = data('urine')

In [None]:
# Replace missing `osmo` values with the column mean.
df['osmo'] = df['osmo'].fillna(df['osmo'].mean())

In [None]:
# Replace missing `cond` values with the column mean.
df['cond'] = df['cond'].fillna(df['cond'].mean())

In [None]:
# Three-way split through the project helper; `urea` (the target) is passed
# as its `stratify_by` argument.
train, validate, test = feature_selection.split(df, stratify_by="urea")

# For each partition, separate the predictors (X_*) from the target (y_*).
X_train, y_train = train.drop(columns='urea'), train['urea']
X_validate, y_validate = validate.drop(columns='urea'), validate['urea']
X_test, y_test = test.drop(columns='urea'), test['urea']

In [None]:
# Min-max scale the predictors. The scaler is fit on the training split only,
# so validate and test are transformed with training-derived ranges.
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(X_train)

# Wrap the scaled arrays back into DataFrames. Each split's original row index
# is carried over so the scaled features stay aligned with y_train /
# y_validate / y_test; a bare pd.DataFrame(array) would renumber rows 0..n-1.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_validate_scaled = pd.DataFrame(
    scaler.transform(X_validate), columns=X_train.columns, index=X_validate.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_train.columns, index=X_test.index
)

In [None]:
# Sanity check on the scaled training features: dtypes and non-null counts.
X_train_scaled.info()

In [None]:
from sklearn.feature_selection import SelectKBest

In [None]:
# Find the top 3 features using kbest.
# Runs the project helper on the min-max-scaled training features; the
# trailing 3 is presumably k, the number of features to keep — confirm
# against feature_selection.select_kbest.
feature_selection.select_kbest(X_train_scaled, y_train, 3)

In [None]:
# Find the top 3 features using RFE.
# The helper returns two values, unpacked here as the selected features and
# the rankings of all features; the first is printed and the second is shown
# as the cell output.
# NOTE(review): this call receives the unscaled X_train, whereas the kbest
# call uses X_train_scaled — confirm whether select_rfe scales internally or
# whether the scaled frame was intended here too.
selected_features, all_rankings = feature_selection.select_rfe(X_train, y_train, 3)
print(selected_features)
all_rankings

In [None]:
# Promote the targets to DataFrames so that each model's predictions can be
# stored as extra columns next to the actual values.
y_train, y_validate = pd.DataFrame(y_train), pd.DataFrame(y_validate)

In [None]:
# 1. Mean baseline: the training-set mean of urea is the prediction for every
#    row of both train and validate.
urea_pred_mean = y_train['urea'].mean()
y_train['urea_pred_mean'] = y_validate['urea_pred_mean'] = urea_pred_mean

In [None]:
# 2. Median baseline: the training-set median of urea is the prediction for
#    every row of both train and validate.
urea_pred_median = y_train['urea'].median()
y_train['urea_pred_median'] = y_validate['urea_pred_median'] = urea_pred_median

In [None]:
# 3. RMSE (square root of the MSE) of the mean baseline on each split.
rmse_train = mean_squared_error(y_train['urea'], y_train['urea_pred_mean']) ** 0.5
rmse_validate = mean_squared_error(y_validate['urea'], y_validate['urea_pred_mean']) ** 0.5

In [None]:
# Report the mean-baseline RMSE for both splits, rounded to two decimals.
print(
    "RMSE using Mean\nTrain/In-Sample: ",
    round(rmse_train, 2),
    "\nValidate/Out-of-Sample: ",
    round(rmse_validate, 2),
)

In [None]:
# 4. RMSE (square root of the MSE) of the median baseline on each split.
rmse_train = mean_squared_error(y_train['urea'], y_train['urea_pred_median']) ** 0.5
rmse_validate = mean_squared_error(y_validate['urea'], y_validate['urea_pred_median']) ** 0.5
print(
    "RMSE using Median\nTrain/In-Sample: ",
    round(rmse_train, 2),
    "\nValidate/Out-of-Sample: ",
    round(rmse_validate, 2),
)

In [None]:
def make_metric_df(y, y_pred, model_name, metric_df):
    """Add one model's evaluation metrics to the running comparison table.

    Parameters
    ----------
    y : array-like
        Actual target values.
    y_pred : array-like
        Predicted values, one per entry of ``y``.
    model_name : str
        Label stored in the ``model`` column.
    metric_df : pandas.DataFrame
        Table built so far; pass an empty DataFrame to start a new one.

    Returns
    -------
    pandas.DataFrame
        A frame with columns ``model``, ``RMSE_validate`` and
        ``r^2_validate``. When ``metric_df`` is empty this is just the new
        row; otherwise it is ``metric_df`` with the row added at the end and
        the index renumbered. The input frame is not modified.
    """
    # Build the row once; both the "first entry" and "append" paths use it.
    # Note: the 'r^2_validate' column holds sklearn's explained_variance_score,
    # which equals R^2 only when the residuals have zero mean.
    new_row = pd.DataFrame([
        {
            'model': model_name,
            'RMSE_validate': mean_squared_error(y, y_pred) ** .5,
            'r^2_validate': explained_variance_score(y, y_pred),
        }
    ])

    if metric_df.size == 0:
        return new_row

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to add rows and works on older pandas versions as well.
    return pd.concat([metric_df, new_row], ignore_index=True)

In [None]:
# Create the metric_df as a blank dataframe.
metric_df = pd.DataFrame()
# Make the first entry: the mean baseline. It is scored on the validate split
# because the table's columns are *_validate metrics and every model row is
# scored on validate; scoring the baseline on train would make it
# incomparable with them.
metric_df = make_metric_df(y_validate.urea,
                           y_validate.urea_pred_mean,
                           'mean_baseline',
                           metric_df)

In [None]:
# Comparison table so far: only the baseline row.
metric_df

In [None]:
# Overlay the distribution of actual training urea values with the two
# constant baseline predictions, each drawn as a single-bin bar.
plt.hist(y_train['urea'], alpha=.5, color='blue', label="Actual urea")

baseline_bar_style = dict(bins=1, alpha=.5, rwidth=100)
plt.hist(y_train['urea_pred_mean'], color='red',
         label="Predicted urea - Mean", **baseline_bar_style)
plt.hist(y_train['urea_pred_median'], color='orange',
         label="Predicted urea - Median", **baseline_bar_style)

plt.xlabel("Urea Concentration")
plt.ylabel("Number of Patients")
plt.legend()
plt.show()

### LinearRegression (OLS)
1. Fit the model using X_train_scaled and the labels from y_train.
2. Predict urea for the observations in the training sample using our model (lm).
3. Evaluate using RMSE
4. Repeat predictions and evaluation for validation.
5. Compare RMSE train vs. validation. Overfitting?

In [None]:
# Peek at y_train: the actual urea values plus the baseline prediction columns.
y_train.head()

In [None]:
# Create the ordinary-least-squares model object.
# `normalize=True` is no longer passed: the parameter was deprecated in
# scikit-learn 1.0 and removed in 1.2, where supplying it raises a TypeError.
# For a plain least-squares fit this should leave the predictions unchanged
# apart from floating-point noise.
lm = LinearRegression()

In [None]:
# Fit the model to the training data. y_train is a DataFrame by this point
# (actual values plus prediction columns), so the `urea` target column must be
# selected explicitly.
# NOTE(review): the section intro says to fit on X_train_scaled, but the
# unscaled X_train is what is passed here — confirm which was intended.
lm.fit(X_train, y_train.urea)

In [None]:
# Predict on the training split and store the result next to the actual
# values as a new `urea_pred_lm` column.
y_train['urea_pred_lm'] = lm.predict(X_train)

In [None]:
# Score the validate split with the fitted OLS model and keep the predictions.
y_validate['urea_pred_lm'] = lm.predict(X_validate)

# RMSE = sqrt(MSE), in-sample and out-of-sample.
rmse_train = mean_squared_error(y_train['urea'], y_train['urea_pred_lm']) ** 0.5
rmse_validate = mean_squared_error(y_validate['urea'], y_validate['urea_pred_lm']) ** 0.5

print(
    "RMSE for OLS using LinearRegression\nTraining/In-Sample: ",
    rmse_train,
    "\nValidation/Out-of-Sample: ",
    rmse_validate,
)

In [None]:
# Add the OLS model's validate-split metrics as a new row of the comparison
# table. DataFrame.append was removed in pandas 2.0, so the row is wrapped in
# a one-row frame and added with pd.concat (index renumbered, as before).
metric_df = pd.concat(
    [
        metric_df,
        pd.DataFrame([{
            'model': 'OLS Regressor',
            'RMSE_validate': rmse_validate,
            'r^2_validate': explained_variance_score(y_validate.urea, y_validate.urea_pred_lm),
        }]),
    ],
    ignore_index=True,
)

In [None]:
# Comparison table after adding the OLS row.
metric_df

### LassoLars
1. Fit the model using X_train_scaled and the labels from y_train.
2. Predict urea for the observations in the training sample using our model (lars).
3. Evaluate using RMSE
4. Repeat predictions and evaluation for validation.
5. Compare RMSE train vs. validation. Overfitting?

In [None]:
# Build a LassoLars model with alpha=1 and fit it on the training split.
# y_train is a DataFrame by now, so the `urea` column is selected explicitly.
lars = LassoLars(alpha=1).fit(X_train, y_train['urea'])

# Keep the predictions for both splits next to the actual values.
y_train['urea_pred_lars'] = lars.predict(X_train)
y_validate['urea_pred_lars'] = lars.predict(X_validate)

# RMSE = sqrt(MSE), in-sample and out-of-sample.
rmse_train = mean_squared_error(y_train['urea'], y_train['urea_pred_lars']) ** 0.5
rmse_validate = mean_squared_error(y_validate['urea'], y_validate['urea_pred_lars']) ** 0.5

print(
    "RMSE for Lasso + Lars\nTraining/In-Sample: ",
    rmse_train,
    "\nValidation/Out-of-Sample: ",
    rmse_validate,
)

In [None]:
# Add the LassoLars validate-split metrics to the comparison table.
metric_df = make_metric_df(
    y=y_validate['urea'],
    y_pred=y_validate['urea_pred_lars'],
    model_name='lasso_alpha_1',
    metric_df=metric_df,
)

In [None]:
# Comparison table after adding the LassoLars row.
metric_df

### TweedieRegressor (GLM)
1. Fit the model using X_train_scaled and the labels from y_train.
2. Predict urea for the observations in the training sample using our model (glm).
3. Evaluate using RMSE
4. Repeat predictions and evaluation for validation.
5. Compare RMSE train vs. validation. Overfitting?

In [None]:
from sklearn.linear_model import TweedieRegressor

In [None]:
# Build a Tweedie GLM with power=1 and alpha=0 and fit it on the training
# split. y_train is a DataFrame by now, so the `urea` column is selected
# explicitly.
glm = TweedieRegressor(power=1, alpha=0).fit(X_train, y_train['urea'])

# Keep the predictions for both splits next to the actual values.
y_train['urea_pred_glm'] = glm.predict(X_train)
y_validate['urea_pred_glm'] = glm.predict(X_validate)

# RMSE = sqrt(MSE), in-sample and out-of-sample.
rmse_train = mean_squared_error(y_train['urea'], y_train['urea_pred_glm']) ** 0.5
rmse_validate = mean_squared_error(y_validate['urea'], y_validate['urea_pred_glm']) ** 0.5

print(
    "RMSE for GLM using Tweedie, power=1 & alpha=0\nTraining/In-Sample: ",
    rmse_train,
    "\nValidation/Out-of-Sample: ",
    rmse_validate,
)

In [None]:
# Add the Tweedie GLM validate-split metrics to the comparison table.
metric_df = make_metric_df(
    y=y_validate['urea'],
    y_pred=y_validate['urea_pred_glm'],
    model_name='glm_poisson',
    metric_df=metric_df,
)

In [None]:
# Comparison table after adding the GLM row.
metric_df

### Polynomial Regression
Using sklearn.preprocessing.PolynomialFeatures() + sklearn.linear_model.LinearRegression()
1. Create the new features, based on value indicated for degree for train, validate & test.
2. Fit the Linear Regression model
3. Predict using the transformed (squared or cubed, e.g.) features
4. Evaluate using RMSE
5. Repeat predictions and evaluation for validation.
6. Compare RMSE train vs. validation. Overfitting?

*****************************************************
## PolynomialFeatures

In [None]:
# Degree-2 polynomial expansion of the predictors, learned from the
# (unscaled) training features.
pf = PolynomialFeatures(degree=2)
X_train_degree2 = pf.fit_transform(X_train)

# Apply the same fitted expansion to the held-out splits.
X_test_degree2 = pf.transform(X_test)
X_validate_degree2 = pf.transform(X_validate)

## LinearRegression

In [None]:
# Create the OLS model for the degree-2 feature set.
# `normalize=True` is no longer passed: the parameter was deprecated in
# scikit-learn 1.0 and removed in 1.2, where supplying it raises a TypeError.
# For a full-rank least-squares fit this should not change the predictions
# beyond floating-point noise — TODO confirm against earlier results.
lm2 = LinearRegression()

# Fit on the expanded training features. y_train is a DataFrame by now, so
# the `urea` column is selected explicitly.
lm2.fit(X_train_degree2, y_train.urea)

# Predict on train and store the result next to the actual values.
y_train['urea_pred_lm2'] = lm2.predict(X_train_degree2)

# In-sample RMSE = sqrt(MSE).
rmse_train = mean_squared_error(y_train.urea, y_train.urea_pred_lm2) ** (1/2)

# Predict on validate using the expansion fitted on train.
y_validate['urea_pred_lm2'] = lm2.predict(X_validate_degree2)

# Out-of-sample RMSE.
rmse_validate = mean_squared_error(y_validate.urea, y_validate.urea_pred_lm2) ** 0.5

print("RMSE for Polynomial Model, degrees=2\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

In [None]:
# Add the degree-2 polynomial model's validate-split metrics to the table.
metric_df = make_metric_df(
    y=y_validate['urea'],
    y_pred=y_validate['urea_pred_lm2'],
    model_name='quadratic',
    metric_df=metric_df,
)

In [None]:
# Comparison table after adding the polynomial-model row.
metric_df

## Evaluate
Plotting Actual vs. Predicted Values

In [None]:
# Actual vs. predicted urea on the validate split, one scatter per model.
plt.figure(figsize=(16, 8))

# Reference lines, kept out of the legend: the constant mean baseline (gray)
# and the perfect-prediction diagonal (blue).
plt.plot(y_validate['urea'], y_validate['urea_pred_mean'],
         alpha=.5, color="gray", label='_nolegend_')
plt.plot(y_validate['urea'], y_validate['urea'],
         alpha=.5, color="blue", label='_nolegend_')

for pred_col, dot_color, legend_text in [
    ('urea_pred_lm', "red", "Model: LinearRegression"),
    ('urea_pred_glm', "yellow", "Model: TweedieRegressor"),
    ('urea_pred_lm2', "green", "Model 2nd degree Polynomial"),
]:
    plt.scatter(y_validate['urea'], y_validate[pred_col],
                alpha=.5, color=dot_color, s=100, label=legend_text)

plt.legend()
plt.xlabel("Actual urea Concentration")
plt.ylabel("Predicted Urea Concentration")
plt.title("Where are predictions more extreme? More modest?")

plt.show()

### Residual Plots: Plotting the Errors in Predictions

In [None]:
# Residuals (predicted minus actual) against the actual value on the validate
# split, one scatter per model; the horizontal line marks zero error.
plt.figure(figsize=(16, 8))
plt.axhline(label="No Error")

for pred_col, dot_color, legend_text in [
    ('urea_pred_lm', "red", "Model: LinearRegression"),
    ('urea_pred_glm', "yellow", "Model: TweedieRegressor"),
    ('urea_pred_lm2', "green", "Model 2nd degree Polynomial"),
]:
    plt.scatter(y_validate['urea'], y_validate[pred_col] - y_validate['urea'],
                alpha=.5, color=dot_color, s=100, label=legend_text)

plt.legend()
plt.xlabel("Actual Urea Concentration")
plt.ylabel("Residual/Error: Predicted Concentration - Actual Concentration")
plt.title("Do the size of errors change as the actual value changes?")
# plt.annotate("The polynomial model appears to overreact to noise", (2.0, -10))
# plt.annotate("The OLS model (LinearRegression)\n appears to be most consistent", (15.5, 3))
plt.show()

### Histograms

In [None]:
# Overlay the distribution of actual validate-split urea values with the
# distribution of each model's predictions.
plt.figure(figsize=(16, 8))

for hist_col, bar_color, legend_text in [
    ('urea', 'blue', "Actual urea"),
    ('urea_pred_lm', 'red', "Model: LinearRegression"),
    ('urea_pred_glm', 'yellow', "Model: TweedieRegressor"),
    ('urea_pred_lm2', 'green', "Model 2nd degree Polynomial"),
]:
    plt.hist(y_validate[hist_col], color=bar_color, alpha=.5, label=legend_text)

plt.xlabel("Urea Concentration")
plt.ylabel("Number of Patients")
plt.title("Comparing the Distribution of Actual Urea Conentrations to Distributions of Predicted Urea Concentrations for the Top Models")
plt.legend()
plt.show()

In [None]:
# Addendum: side-by-side model comparison, restricted to the model name and
# its RMSE column.
metric_df[['model', 'RMSE_validate']]

### Model Selection & Out-of-Sample Evaluation
Model selected: lm (using LinearRegression)

In [None]:
# Final out-of-sample check: score the held-out test split with the selected
# OLS model (lm) and keep the predictions next to the actual values.
y_test = pd.DataFrame(y_test).assign(urea_pred_lm=lm.predict(X_test))

# RMSE = sqrt(MSE) on the test split.
rmse_test = mean_squared_error(y_test['urea'], y_test['urea_pred_lm']) ** 0.5

print("RMSE for OLS Model using LinearRegression\nOut-of-Sample Performance: ", rmse_test)