## Data Analysis

#### This notebook is dedicated to performing correlation and multiple linear regression (MLR) calculations on the MS274 final project data

What I need to perform analysis:
1. Fix the issue with the 2020 chlorophyll data - download it as a csv
2. Get all chlorophyll data into one csv (currently have 1 csv for each year)

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [None]:
# Load the three comma-separated timeseries as float arrays.
# NOTE(review): jelly_folder, sst_folder and chlor_folder are not defined in any
# visible cell - they must be set before this cell can run on a fresh kernel.
# Paths are built with os.path.join so the notebook is not tied to the Windows
# '\\' separator.
jellywatch = np.genfromtxt(os.path.join(jelly_folder, 'Jellywatch_sightings_edit.csv'), delimiter=',')
sst = np.genfromtxt(os.path.join(sst_folder, 'mb_SST.csv'), delimiter=',')
# TODO: replace the placeholder file name once the yearly chlorophyll csvs are combined
chlor = np.genfromtxt(os.path.join(chlor_folder, 'CSV NAME ONCE ALL COMBINED.csv'), delimiter=',')

In [None]:
# keep only the rows where all three series have a value in column 1 (the
# column used by the regression below); checking chlorophyll alone would let
# a NaN in the SST or jellywatch column through to the model fit
good_indices = (~np.isnan(chlor[:,1])
                & ~np.isnan(sst[:,1])
                & ~np.isnan(jellywatch[:,1]))

# subset each timeseries to the non-NaN indices
# (the boolean mask requires all three arrays to have the same number of rows)
sst_subset = sst[good_indices, :]
chlor_subset = chlor[good_indices, :]
jellywatch_subset = jellywatch[good_indices, :]

In [None]:
# form the design matrix: a column of ones for the intercept,
# followed by the two predictors (SST and jelly sightings)
X = np.column_stack([np.ones_like(sst_subset[:,1]),
                     sst_subset[:,1],
                     jellywatch_subset[:,1]])

# form the y variable - reshape it just in case
# the target is the observed chlorophyll (the quantity the later cells model
# and plot); it must not be jellywatch_subset, which is already a predictor in X
y = chlor_subset[:,1]
y = np.reshape(y, (len(y), 1))
print(np.shape(y))

In [None]:
# ordinary least squares fit of y (endog) against the design matrix X (exog)
model = sm.OLS(endog=y, exog=X).fit()

In [None]:
# display the summary of the fitted OLS model as the cell output
model.summary()

In [None]:
# pull the fitted parameters out of the model and show them
coefficients = model.params
print(coefficients)

# the first three parameters follow the column order of X:
# intercept, SST slope, jellies slope
intercept, slope_sst, slope_jellies = coefficients[:3]

# retrieve the R^2 value from the model.rsquared attribute
r2 = model.rsquared

In [None]:
# evaluate the fitted linear model term by term, then add the intercept
sst_term = slope_sst * sst_subset[:,1]
jellies_term = slope_jellies * jellywatch_subset[:,1]
modeled_values = sst_term + jellies_term + intercept

In [None]:
# work on the current axes through a single handle
ax = plt.gca()

# scatter of modeled against observed values as green dots
ax.plot(modeled_values, chlor_subset[:,1], 'g.')

# label the x and y axis
ax.set_xlabel('Modeled Chl. (mg/cm$^3$)')
ax.set_ylabel('Observed Chl. (mg/cm$^3$)')

# add a label for the R^2
r2_label = 'R$^2$ =' + '{:.3f}'.format(r2)
ax.text(1.2, 0.6, r2_label)

# plot the 1-1 line
ax.plot([0, 1.8], [0, 1.8], 'k--')

# give both axes the same range
ax.set_xlim([0, 1.8])
ax.set_ylim([0, 1.8])