# Initial Modeling

This notebook was used to determine whether or not there was a correlation between water quality conditions and the presence of zooplankton in the California wildlife zooplankton surveys. It was done using 25 samples as part of the EDA process and is not necessary to produce the final linear regression models.

In [29]:
import pickle
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import r2_score

# TODO: scale features with sklearn.preprocessing.StandardScaler in the final model

In [6]:
# Load the pickled object stored in merged_df.p. The context manager closes the
# file handle as soon as the load finishes instead of leaving it open until
# garbage collection.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("merged_df.p", "rb") as pickle_file:
    df = pickle.load(pickle_file)

# Show the available columns.
df.columns

Index(['11_day_temp_c', '11_day_specific_conductance', '11_day_ph',
       '11_day_dissolved_o2_mg_l', '11_day_turbidity',
       '11_day_nitrate_nitrite', '11_day_organic_matter', '11_day_salinity',
       '11_day_dissolved_o2_percent_sat', 'sqr_11_day_temp_c',
       'sqr_11_day_specific_conductance', 'sqr_11_day_ph',
       'sqr_11_day_dissolved_o2_mg_l', 'sqr_11_day_turbidity',
       'sqr_11_day_nitrate_nitrite', 'sqr_11_day_organic_matter',
       'sqr_11_day_salinity', 'sqr_11_day_dissolved_o2_percent_sat',
       'Sum all Pseudodiaptomus Adult', 'Sum all Calanoid Copepodids (juv)',
       'Sum all Cyclopoid Copepodids (juv)', 'Sum all Copepod Nauplii',
       'Sum all Limnoithona Adult', 'Sum all Tortanus Adult',
       'Sum all Rotifers'],
      dtype='object')

Going off the heatmap from the EDA, I can drop `11_day_dissolved_o2_percent_sat` and `sqr_11_day_dissolved_o2_percent_sat`.

In [7]:
# Remove the two dissolved-O2 percent-saturation columns.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises a KeyError.
df = df.drop(
    columns=['11_day_dissolved_o2_percent_sat', 'sqr_11_day_dissolved_o2_percent_sat'],
    errors='ignore',
)

In [8]:
# Re-display the column index to confirm which columns remain after the drop.
df.columns

Index(['11_day_temp_c', '11_day_specific_conductance', '11_day_ph',
       '11_day_dissolved_o2_mg_l', '11_day_turbidity',
       '11_day_nitrate_nitrite', '11_day_organic_matter', '11_day_salinity',
       'sqr_11_day_temp_c', 'sqr_11_day_specific_conductance', 'sqr_11_day_ph',
       'sqr_11_day_dissolved_o2_mg_l', 'sqr_11_day_turbidity',
       'sqr_11_day_nitrate_nitrite', 'sqr_11_day_organic_matter',
       'sqr_11_day_salinity', 'Sum all Pseudodiaptomus Adult',
       'Sum all Calanoid Copepodids (juv)',
       'Sum all Cyclopoid Copepodids (juv)', 'Sum all Copepod Nauplii',
       'Sum all Limnoithona Adult', 'Sum all Tortanus Adult',
       'Sum all Rotifers'],
      dtype='object')

In [9]:
# Predictor columns: the 11-day water-quality readings plus their "sqr_" variants.
feature_columns = [
    '11_day_temp_c',
    '11_day_specific_conductance',
    '11_day_ph',
    '11_day_dissolved_o2_mg_l',
    '11_day_turbidity',
    '11_day_nitrate_nitrite',
    '11_day_organic_matter',
    '11_day_salinity',
    'sqr_11_day_temp_c',
    'sqr_11_day_specific_conductance',
    'sqr_11_day_ph',
    'sqr_11_day_dissolved_o2_mg_l',
    'sqr_11_day_turbidity',
    'sqr_11_day_nitrate_nitrite',
    'sqr_11_day_organic_matter',
    'sqr_11_day_salinity',
]

# Response columns: every "Sum all ..." column in the frame.
target_columns = [
    'Sum all Pseudodiaptomus Adult',
    'Sum all Calanoid Copepodids (juv)',
    'Sum all Cyclopoid Copepodids (juv)',
    'Sum all Copepod Nauplii',
    'Sum all Limnoithona Adult',
    'Sum all Tortanus Adult',
    'Sum all Rotifers',
]

x = df[feature_columns]
y = df[target_columns]

For next iteration:
- Do a linear regression using lasso or ridge
- Try different alpha values and compare the resulting coefficients
- Put together a formula that says what the bell-curve weight should be centered on and the width for each of the features and the formula that shows the prediction for each of the fishes

In [12]:
# Baseline: ordinary least squares on the eight untransformed 11-day features
# (none of the "sqr_" columns), predicting a single target column.
linear_features = [
    '11_day_temp_c',
    '11_day_specific_conductance',
    '11_day_ph',
    '11_day_dissolved_o2_mg_l',
    '11_day_turbidity',
    '11_day_nitrate_nitrite',
    '11_day_organic_matter',
    '11_day_salinity',
]

model = LinearRegression(fit_intercept=True)
# fit() returns the estimator, so leaving it as the last expression displays it.
model.fit(x[linear_features], y['Sum all Pseudodiaptomus Adult'])

LinearRegression()

In [14]:
# Pair each feature name with its fitted coefficient, in the order used for fitting.
coef_labels = [
    '11_day_temp_c',
    '11_day_specific_conductance',
    '11_day_ph',
    '11_day_dissolved_o2_mg_l',
    '11_day_turbidity',
    '11_day_nitrate_nitrite',
    '11_day_organic_matter',
    '11_day_salinity',
]
[(label, weight) for label, weight in zip(coef_labels, model.coef_)]

[('11_day_temp_c', -32.373211668209244),
 ('11_day_specific_conductance', -16.29401254744564),
 ('11_day_ph', 2183.9872467072023),
 ('11_day_dissolved_o2_mg_l', -110.76274631001881),
 ('11_day_turbidity', 349.4723275546191),
 ('11_day_nitrate_nitrite', 23562.55290747747),
 ('11_day_organic_matter', -238.47594609101702),
 ('11_day_salinity', -16242.974644808735)]

In [28]:
def mae(y_true, y_pred):
    """Return the mean absolute error between predictions and true values.

    Both arguments must support element-wise subtraction with each other
    (e.g. NumPy arrays or scalars); the absolute differences are averaged
    with ``np.mean``.
    """
    abs_errors = np.abs(y_pred - y_true)
    return np.mean(abs_errors)

# Predict with the fitted model from the same eight feature columns.
# NOTE: despite its name, `test_set_pred` is computed from `x`, the same frame
# the model was fit on in this notebook, so the MAE shown here is an in-sample
# (training) error rather than a held-out test error.
eval_features = [
    '11_day_temp_c',
    '11_day_specific_conductance',
    '11_day_ph',
    '11_day_dissolved_o2_mg_l',
    '11_day_turbidity',
    '11_day_nitrate_nitrite',
    '11_day_organic_matter',
    '11_day_salinity',
]
target_name = 'Sum all Pseudodiaptomus Adult'

test_set_pred = model.predict(x[eval_features])

mae(y[target_name], test_set_pred)

260.94903907666895

In [31]:
# R^2 of the predictions against the observed values of the modeled target.
observed = y['Sum all Pseudodiaptomus Adult']
r2_score(observed, test_set_pred)

0.7995732575491653