In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import datetime

from Library.sb_utils import save_file

In [2]:
water_test_data = pd.read_csv('../Capstone Project/capstone_data_analysis_df.csv')
water_test_data.head().T

Unnamed: 0,0,1,2,3,4
station_id,12.0,12.0,12.0,12.0,12.0
latitude_Dissolved,37.8019,37.8019,37.8019,37.8019,37.8019
longitude_Dissolved,-121.6203,-121.6203,-121.6203,-121.6203,-121.6203
county_name,Alameda,Alameda,Alameda,Alameda,Alameda
sample_code,568.0,668.0,768.0,868.0,968.0
sample_depth,1.0,1.0,1.0,1.0,1.0
Dissolved_Oxygen,7.5,7.4,6.7,7.5,7.0
pH,7.6,7.0,7.5,7.7,7.7
Electrical_Conductance,278.0,325.0,438.0,470.0,415.0
Water_Temperature,21.1,24.4,23.3,20.6,21.1


In [3]:
len(water_test_data) * .7, len(water_test_data) * .3

(1108.8, 475.2)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(water_test_data
                                                    .drop(columns='Dissolved_Oxygen'), 
                                                    water_test_data.Dissolved_Oxygen, test_size=0.3, 
                                                    random_state=47)

In [5]:
X_train.shape, X_test.shape

((1108, 9), (476, 9))

In [6]:
y_train.shape, y_test.shape

((1108,), (476,))

In [7]:
X_train

Unnamed: 0,station_id,latitude_Dissolved,longitude_Dissolved,county_name,sample_code,sample_depth,pH,Electrical_Conductance,Water_Temperature
686,80.0,37.07420,-121.01510,Merced,1273.0,1.0,7.40,395.0,10.00
1119,390.0,39.93180,-120.52330,Plumas,973.0,1.0,8.30,95.0,13.30
826,412.0,37.06810,-121.08540,Merced,568.0,1.0,8.00,440.0,16.70
680,80.0,37.07420,-121.01510,Merced,1072.0,1.0,7.80,355.0,18.90
718,81.0,37.11270,-121.05960,Merced,973.0,9.0,8.70,400.0,20.00
...,...,...,...,...,...,...,...,...,...
691,80.0,37.07420,-121.01510,Merced,674.0,1.0,7.50,243.0,20.60
584,217.0,34.55880,-118.62890,Los Angeles,574.0,1.0,9.10,430.0,15.30
1288,47116.0,38.53580,-121.52060,Sacramento,815.0,1.0,7.62,121.0,22.85
327,426.0,39.53120,-121.57930,Butte,1074.0,2.0,7.50,90.0,12.80


In [8]:
names_list = ['county_name', 'sample_code', 'station_id']
names_train = X_train[names_list]
names_test = X_test[names_list]
X_train.drop(columns=names_list, inplace=True)
X_test.drop(columns=names_list, inplace=True)
X_train.shape, X_test.shape

((1108, 6), (476, 6))

In [9]:
X_train.dtypes

latitude_Dissolved        float64
longitude_Dissolved       float64
sample_depth              float64
pH                        float64
Electrical_Conductance    float64
Water_Temperature         float64
dtype: object

In [10]:
X_test.dtypes

latitude_Dissolved        float64
longitude_Dissolved       float64
sample_depth              float64
pH                        float64
Electrical_Conductance    float64
Water_Temperature         float64
dtype: object

In [11]:
train_mean = y_train.mean()
train_mean

9.069837545126354

In [12]:
dumb_reg = DummyRegressor(strategy='mean')
dumb_reg.fit(X_train, y_train)
dumb_reg.constant_

array([[9.06983755]])

In [13]:
def r_squared(y, ypred):
    
    ybar = np.sum(y) / len(y) 
    sum_sq_tot = np.sum((y - ybar)**2) 
    sum_sq_res = np.sum((y - ypred)**2) 
    R2 = 1.0 - sum_sq_res / sum_sq_tot
    return R2

In [14]:
y_tr_pred_ = train_mean * np.ones(len(y_train))
y_tr_pred_[:5]

array([9.06983755, 9.06983755, 9.06983755, 9.06983755, 9.06983755])

In [15]:
y_tr_pred = dumb_reg.predict(X_train)
y_tr_pred[:5]

array([9.06983755, 9.06983755, 9.06983755, 9.06983755, 9.06983755])

In [16]:
r_squared(y_train, y_tr_pred)

0.0

In [17]:
y_te_pred = train_mean * np.ones(len(y_test))
r_squared(y_test, y_te_pred)

-0.00371371239344942

In [18]:
def mae(y, ypred):
   
    abs_error = np.abs(y - ypred)
    mae = np.mean(abs_error)
    return mae

In [19]:
mae(y_train, y_tr_pred)

1.484753157215655

In [20]:
mae(y_test, y_te_pred)

1.4675807723811545

In [21]:
def mse(y, ypred):
    
    sq_error = (y - ypred)**2
    mse = np.mean(sq_error)
    return mse

In [22]:
mse(y_train, y_tr_pred)

4.2592254248719525

In [23]:
mse(y_test, y_te_pred)

4.121045136060002

In [24]:
np.sqrt([mse(y_train, y_tr_pred), mse(y_test, y_te_pred)])

array([2.06378909, 2.03003575])

In [25]:
r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred)

(0.0, -0.00371371239344942)

In [26]:
mean_absolute_error(y_train, y_tr_pred), mean_absolute_error(y_test, y_te_pred)

(1.484753157215655, 1.4675807723811545)

In [27]:
mean_squared_error(y_train, y_tr_pred), mean_squared_error(y_test, y_te_pred)

(4.2592254248719525, 4.121045136060002)

In [28]:
r2_score(y_train, y_tr_pred), r2_score(y_tr_pred, y_train)

(0.0, 0.0)

In [29]:
r2_score(y_test, y_te_pred), r2_score(y_te_pred, y_test)

(-0.00371371239344942, -3.265028337683804e+29)

In [30]:
r_squared(y_train, y_tr_pred), r_squared(y_tr_pred, y_train)

  R2 = 1.0 - sum_sq_res / sum_sq_tot


(0.0, -inf)

In [31]:
r_squared(y_test, y_te_pred), r_squared(y_te_pred, y_test)

(-0.00371371239344942, -3.265028337683804e+29)

In [32]:
X_defaults_median = X_train.median()
X_defaults_median

latitude_Dissolved         37.8019
longitude_Dissolved      -121.0646
sample_depth                1.0000
pH                          8.0000
Electrical_Conductance    280.0000
Water_Temperature          17.2000
dtype: float64

In [33]:
X_tr = X_train.fillna(X_defaults_median)
X_te = X_test.fillna(X_defaults_median)

In [34]:
scaler = StandardScaler()
scaler.fit(X_tr)
X_tr_scaled = scaler.transform(X_tr)
X_te_scaled = scaler.transform(X_te)

In [35]:
lm = LinearRegression().fit(X_tr_scaled, y_train)

In [36]:
median_r2 = r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred)
median_r2

(0.0, -0.00371371239344942)

In [37]:
median_mae = mean_absolute_error(y_train, y_tr_pred), mean_absolute_error(y_test, y_te_pred)
median_mae

(1.484753157215655, 1.4675807723811545)

In [38]:
median_mse = mean_squared_error(y_train, y_tr_pred), mean_squared_error(y_test, y_te_pred)
median_mse

(4.2592254248719525, 4.121045136060002)

In [39]:
X_defaults_mean = X_train.mean()
X_defaults_mean

latitude_Dissolved         37.706858
longitude_Dissolved      -120.650348
sample_depth                5.219765
pH                          8.017356
Electrical_Conductance    254.351492
Water_Temperature          16.709968
dtype: float64

In [40]:
X_tr = X_train.fillna(X_defaults_mean)
X_te = X_test.fillna(X_defaults_mean)

In [41]:
scaler = StandardScaler()
scaler.fit(X_tr)
X_tr_scaled = scaler.transform(X_tr)
X_te_scaled = scaler.transform(X_te)

In [42]:
lm = LinearRegression().fit(X_tr_scaled, y_train)

In [43]:
y_tr_pred = lm.predict(X_tr_scaled)
y_te_pred = lm.predict(X_te_scaled)

In [44]:
r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred)

(0.38274305120910357, 0.341869332809019)

In [45]:
mean_absolute_error(y_train, y_tr_pred), mean_absolute_error(y_test, y_te_pred)

(1.1365207787003406, 1.1143802746436862)

In [46]:
mean_squared_error(y_train, y_tr_pred), mean_squared_error(y_test, y_te_pred)

(2.629036489969071, 2.702151172620581)

In [47]:
pipe = make_pipeline(
    SimpleImputer(strategy='median'), 
    StandardScaler(), 
    LinearRegression()
)

In [48]:
type(pipe)

sklearn.pipeline.Pipeline

In [49]:
hasattr(pipe, 'fit'), hasattr(pipe, 'predict')

(True, True)

In [50]:
pipe.fit(X_train, y_train)

In [51]:
y_tr_pred = pipe.predict(X_train)
y_te_pred = pipe.predict(X_test)

In [52]:
r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred)

(0.38274305120910357, 0.341869332809019)

In [53]:
median_r2

(0.0, -0.00371371239344942)

In [54]:
mean_absolute_error(y_train, y_tr_pred), mean_absolute_error(y_test, y_te_pred)

(1.1365207787003406, 1.1143802746436862)

In [55]:
median_mae

(1.484753157215655, 1.4675807723811545)

In [56]:
mean_squared_error(y_train, y_tr_pred), mean_squared_error(y_test, y_te_pred)

(2.629036489969071, 2.702151172620581)

In [57]:
median_mse

(4.2592254248719525, 4.121045136060002)