# Prediction & Correlation between SST and NPP Data

In [None]:
import xarray as xr
import matplotlib.pyplot as plt
from google.colab import auth
from google.cloud import bigquery
import pandas as pd
import numpy as np


In [None]:
!pip install copernicusmarine

# Step 1: Install the Copernicus Marine Toolbox
import copernicusmarine

# Step 2: Login with Your Copernicus Marine Credentials
copernicusmarine.login(username="samikshakhare22@gmail.com", password="Counter@1")


# Data Extraction



### To cover Barbados and its nearby coastal waters (especially beaches) for analyzing Sea Surface Temperature (SST) and Net Primary Productivity (NPP), we use a safe and effective geographic bounding box: longitude −59.7° to −59.3°, latitude 12.95° to 13.3°.

In [None]:
# Get SST data: https://data.marine.copernicus.eu/product/SST_GLO_SST_L4_NRT_OBSERVATIONS_010_001/download
#
# The request is assembled as a keyword dictionary first so the dataset,
# time window, bounding box and output location can be read at a glance,
# then handed to the toolbox in a single call.
sst_request = {
    "dataset_id": "METOFFICE-GLO-SST-L4-NRT-OBS-SST-V2",
    "variables": ["analysed_sst"],
    # Time window
    "start_datetime": "2011-01-01T00:00:00",
    "end_datetime": "2024-12-31T23:59:59",
    # Bounding box
    "minimum_longitude": -59.7,
    "maximum_longitude": -59.3,
    "minimum_latitude": 12.95,
    "maximum_latitude": 13.3,
    "maximum_depth": 2,
    # Output location (relative to the current working directory)
    "output_filename": "sst_data.nc",
    "output_directory": "copernicus-data",
}

copernicusmarine.subset(**sst_request)

In [None]:
# Get NPP Data:  https://data.marine.copernicus.eu/product/GLOBAL_MULTIYEAR_BGC_001_033/description
#
# Same time window and bounding box as the SST request; three variables are
# requested by their long names.
npp_request = {
    "dataset_id": "cmems_mod_glo_bgc_my_0.083deg-lmtl_PT1D-i",
    "variables": [
        "net_primary_productivity_of_biomass_expressed_as_carbon_in_sea_water",
        "euphotic_zone_depth",
        "mass_content_of_zooplankton_expressed_as_carbon_in_sea_water",
    ],
    # Time window
    "start_datetime": "2011-01-01T00:00:00",
    "end_datetime": "2024-12-31T23:59:59",
    # Bounding box
    "minimum_longitude": -59.7,
    "maximum_longitude": -59.3,
    "minimum_latitude": 12.95,
    "maximum_latitude": 13.3,
    "maximum_depth": 2,
    # Output location (relative to the current working directory)
    "output_filename": "npp_data.nc",
    "output_directory": "copernicus-data",
}

copernicusmarine.subset(**npp_request)


## Data Exploration

In [None]:
from pathlib import Path

import xarray as xr

# The download cells write into the *relative* directory "copernicus-data",
# so read from the same relative location. The previous absolute
# "/content/..." paths only resolved when the working directory happened to
# be Colab's /content.
DATA_DIR = Path("copernicus-data")

# Open the NetCDF files
sst_ds = xr.open_dataset(DATA_DIR / "sst_data.nc")
npp_ds = xr.open_dataset(DATA_DIR / "npp_data.nc")

In [None]:
# Give the SST variable a short name via Dataset.rename.
# Guarded so the cell can be re-run safely: once the variable has been renamed,
# 'analysed_sst' no longer exists and an unconditional rename would raise.
if 'analysed_sst' in sst_ds.data_vars:
    sst_ds = sst_ds.rename({'analysed_sst': 'sst'})
print(sst_ds.data_vars)

In [None]:
# Keep only the three variables used in the rest of the notebook
# (nothing is renamed here; this is a selection by existing variable name).
npp_columns = ['npp', 'zeu','zooc']
npp_var = npp_ds[npp_columns]

# Show which variables the selection contains
print(npp_var.data_vars)

In [None]:
# Flatten both datasets into tabular form; reset_index() turns the
# coordinate index levels into ordinary columns.
sst_table = sst_ds['sst'].to_dataframe()
sst_df = sst_table.reset_index()

npp_table = npp_var.to_dataframe()
npp_df = npp_table.reset_index()

In [None]:
# Preview the first rows of the flattened SST table.
sst_df.head()

In [None]:
# Preview the first rows of the flattened NPP table.
npp_df.head()

## Handle missing values

In [None]:
# (rows, columns) of the SST table before missing values are dropped.
sst_df.shape

In [None]:
# Number of missing values in each SST column.
sst_df.isnull().sum()

In [None]:
# Remove every row that has a missing value in any column,
# then report the resulting (rows, columns).
sst_df = sst_df.dropna(axis=0, how='any')
sst_df.shape

In [None]:
# Number of missing values in each NPP column.
npp_df.isnull().sum()

In [None]:
# (rows, columns) of the NPP table before missing values are dropped.
npp_df.shape

In [None]:
# Remove every row that has a missing value in any column,
# then report the resulting (rows, columns).
npp_df = npp_df.dropna(axis=0, how='any')
npp_df.shape

## Merge SST and NPP datasets

In [None]:
import math


def truncate_one_decimal(x):
    """Truncate a number toward zero to one decimal place.

    The ordinary case works on the printed form of ``x`` (e.g. ``12.975`` ->
    ``12.9``, ``-59.65`` -> ``-59.6``), so a value keeps the digit a reader
    sees rather than one altered by binary floating-point noise.

    Inputs whose printed form has no decimal point or uses exponent notation
    (integers such as ``12``, tiny values such as ``1e-05``) are handled
    numerically; slicing their text would otherwise return a wrong value.

    Parameters
    ----------
    x : number
        Value to truncate (anything accepted by ``str`` and ``float``).

    Returns
    -------
    float
        ``x`` truncated toward zero to one decimal. NaN and infinities are
        returned unchanged.
    """
    text = str(x)
    dot = text.find('.')

    # Plain decimal notation: keep everything up to the first decimal digit.
    if dot != -1 and 'e' not in text.lower():
        return float(text[:dot + 2])

    # No decimal point or exponent notation: fall back to arithmetic.
    value = float(x)
    # Non-finite values cannot be truncated; floats this large are already
    # whole numbers, so there is no fractional part to remove.
    if not math.isfinite(value) or abs(value) >= 1e16:
        return value
    return math.trunc(value * 10) / 10

# Bucket both SST coordinate columns to one decimal so they can serve as
# join keys, then preview the result.
for coord in ('latitude', 'longitude'):
    sst_df[coord] = sst_df[coord].apply(truncate_one_decimal)
sst_df.head()

In [None]:
# Reuse truncate_one_decimal from the SST cell above rather than redefining
# it here: a second copy silently shadows the first and lets the two tables
# drift onto different bucketing rules if only one copy is ever edited.
for coord in ('latitude', 'longitude'):
    npp_df[coord] = npp_df[coord].apply(truncate_one_decimal)

In [None]:
# Preview the NPP table after its coordinates were truncated.
npp_df.head()

In [None]:
# Inner-join the two tables on timestamp and (truncated) position: only
# rows present in both tables with identical key values are kept.
# NOTE(review): because coordinates were truncated to one decimal, several
# original grid points may share a key, so the join can multiply rows.
# Confirm the merged row count is what you expect.
join_keys = ['time', 'latitude', 'longitude']
merged_df = sst_df.merge(npp_df, how='inner', on=join_keys)


In [None]:
# Print a preview of the merged table followed by its (rows, columns).
for summary in (merged_df.head(), merged_df.shape):
    print(summary)


# Machine Learning Model

### Regression to predict NPP (Net Primary Productivity) using SST and other variables.

In [None]:
# 1. Select Features (X) and Target (y):
#    the model predicts 'npp' from the three remaining columns.

target = 'npp'
features = ['sst', 'zeu', 'zooc']

y = merged_df[target]
X = merged_df[features]


In [None]:
# 2. Train/Test Split
#    Hold out 20% of the rows for evaluation; the fixed random_state makes
#    the split reproducible across runs.

from sklearn.model_selection import train_test_split

splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits


In [None]:
# 3. Train a Regression Model
#    Ordinary least-squares fit on the training portion only.

from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X=X_train, y=y_train)


In [None]:
# 4. Evaluate the Model
#    Score the held-out rows with R² and mean absolute error.

from sklearn.metrics import r2_score, mean_absolute_error

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("R² Score:", r2)
print("MAE:", mae)


The R² Score (R-squared), also known as the coefficient of determination, tells you how well your machine learning model explains the variability of the target variable.

Higher R² = better model fit.

In [None]:
# 5. Visualize Actual vs Predicted
#    Each point is one held-out row; the dashed red diagonal marks where
#    predicted == actual.

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, alpha=0.5)

# Ideal line spans the range of the actual values.
low, high = min(y_test), max(y_test)
ax.plot([low, high], [low, high], color='red', linestyle='--')

ax.set_xlabel("Actual NPP")
ax.set_ylabel("Predicted NPP")
ax.set_title("Actual vs Predicted NPP")
ax.axis('equal')  # same scale on both axes so the diagonal sits at 45°
plt.show()



Explanation:

Points near the red line = good predictions.

Points below the line = model underpredicted NPP.

Points above the line = model overpredicted NPP.

Saturation effect: Predictions start to flatten near 1000, even as actual NPP continues increasing.

This may indicate the model is not capturing high-NPP behavior well (e.g., it's biased toward the mean or lacks nonlinear expressiveness).



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations between the three features and the target,
# drawn as an annotated heatmap.
corr_matrix = merged_df[['sst', 'zeu', 'zooc', 'npp']].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap: SST, ZEU, ZOOC vs NPP")
fig.tight_layout()
plt.show()


### Key Observations:

---

**1. Strong Negative Correlation Between ZEU and NPP (−0.91)**  
→ This suggests that when euphotic depth (ZEU) increases (i.e., light penetrates deeper), surface productivity (NPP) tends to decrease.

*Justification:*  
- High ZEU often means clearer, nutrient-poor waters → less phytoplankton growth (lower NPP).  
- In tropical regions like Barbados, sunlight is abundant year-round, so **light isn’t the limiting factor**.  
- **Nutrient availability** (e.g., nitrogen, phosphorus from deep water) becomes the main driver of productivity.  

So:  
- More light (deeper ZEU) ≠ more productivity  
- More nutrients → more productivity  
- Hence, negative correlation between ZEU and NPP

---

**2. Positive Correlation Between ZOOC and NPP (+0.32)**  
→ Where there's more primary productivity (NPP), there’s generally more zooplankton feeding on it.

*Supports the trophic link:*  
- Higher NPP → more food for zooplankton → higher zooplankton biomass

---

**3. Weak Negative Correlation Between SST and NPP (−0.23)**  
→ Warmer sea surface temperatures are weakly associated with lower productivity (a correlation, not by itself evidence of causation).

*Matches established findings:*  
- Warmer waters = less nutrient mixing → reduced phytoplankton growth  
- Especially relevant in stratified tropical waters like the Caribbean

📄 Reference: [Nature article on SST and productivity](https://www.nature.com/articles/s41598-018-20560-5#:~:text=In%20the%20tropics%2C%20thermal%20stratification,layer%2C%20ultimately%20limiting%20phytoplankton%20growth.)

---
