# Prediction Model for Net Primary Productivity (NPP)

In [None]:
import xarray as xr
import matplotlib.pyplot as plt
from google.colab import auth
from google.cloud import bigquery
import pandas as pd


In [None]:
!pip install copernicusmarine
# Step 1: Install the Copernicus Marine Toolbox
import copernicusmarine

# Step 2: Login with Your Copernicus Marine Credentials
copernicusmarine.login(username="samikshakhare@gmail.com", password="Counter@1")


### Extract data from 2011 to 2023 from Copernicus

In [None]:
# Product page:
# https://data.marine.copernicus.eu/product/GLOBAL_MULTIYEAR_BGC_001_033/description

# Area of interest: small ocean patch ~6 km east of Crane Beach.
# Sargassum is a surface-floating alga, so only the 0-5 m depth range is requested.
# The time range ends on 2023-12-31 so the extracted "historical" data matches
# the 2011-2023 range used in the plot labels; the models forecast from 2024 on.

copernicusmarine.subset(
    dataset_id="cmems_mod_glo_bgc_my_0.083deg-lmtl_PT1D-i",
    variables=[
        "net_primary_productivity_of_biomass_expressed_as_carbon_in_sea_water",
        "euphotic_zone_depth",
        "mass_content_of_zooplankton_expressed_as_carbon_in_sea_water",
    ],
    start_datetime="2011-01-01T00:00:00",
    end_datetime="2023-12-31T23:59:59",
    minimum_longitude=-59.50,
    maximum_longitude=-59.44,
    minimum_latitude=13.10,
    maximum_latitude=13.12,
    minimum_depth=0,
    maximum_depth=5,
    # Written to copernicus-data/npp_data.nc relative to the working directory.
    output_filename="npp_data.nc",
    output_directory="copernicus-data",
)


In [None]:
import xarray as xr

# Location of the NetCDF file produced by the Copernicus subset download.
nc_path = '/content/copernicus-data/npp_data.nc'

# Load it as an xarray Dataset; later cells read it through `ds`.
ds = xr.open_dataset(nc_path)

# Print the Dataset summary to inspect its contents.
print(ds)

In [None]:
# Select the three variables by the short names they have in the dataset.
selected_vars = ['npp', 'zeu', 'zooc']
npp_var = ds[selected_vars]

# Flatten to a table: coordinates become ordinary columns after reset_index(),
# and any row containing a missing value is discarded.
df = npp_var.to_dataframe().reset_index().dropna()

# Show the first rows of the resulting DataFrame.
df.head()

In [None]:
# Display the (rows, columns) size of the cleaned DataFrame.
df.shape

In [None]:
# Only NPP is modelled below, so the other two variables are removed.
# drop() returns a new DataFrame; `df` itself is left untouched.
unused_columns = ['zeu', 'zooc']
npp_df_cleaned = df.drop(unused_columns, axis=1)

# Show the first rows of the reduced table.
npp_df_cleaned.head()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

In [None]:
# [Disabled] Earlier experiment: RandomForestRegressor trained on daily date
# features (year, month, dayofyear) of npp_df_cleaned. Kept commented out for reference.

# # Step 1: Prepare date features if not already added
# npp_df_cleaned['time'] = pd.to_datetime(npp_df_cleaned['time'])
# npp_df_cleaned['year'] = npp_df_cleaned['time'].dt.year
# npp_df_cleaned['month'] = npp_df_cleaned['time'].dt.month
# npp_df_cleaned['dayofyear'] = npp_df_cleaned['time'].dt.dayofyear

# # Step 2: Train the model
# X = npp_df_cleaned[['year', 'month', 'dayofyear']]
# y = npp_df_cleaned['npp']
# model = RandomForestRegressor(n_estimators=100, random_state=42)
# model.fit(X, y)

# # Step 3: Create future dates (2024–2030)
# future_dates = pd.date_range(start='2024-01-01', end='2030-12-31', freq='D')
# future_df = pd.DataFrame({'time': future_dates})
# future_df['year'] = future_df['time'].dt.year
# future_df['month'] = future_df['time'].dt.month
# future_df['dayofyear'] = future_df['time'].dt.dayofyear

# # Step 4: Predict NPP for future dates
# X_future = future_df[['year', 'month', 'dayofyear']]
# future_df['predicted_npp'] = model.predict(X_future)

# # Step 5: Calculate average NPP for each year
# avg_npp_historic = npp_df_cleaned.groupby('year')['npp'].mean().reset_index()
# avg_npp_future = future_df.groupby('year')['predicted_npp'].mean().reset_index()

# # Step 6: Plot historical and predicted average NPP
# plt.figure(figsize=(10, 6))
# plt.plot(avg_npp_historic['year'], avg_npp_historic['npp'], label='Avg NPP (2011–2023)', marker='o')
# plt.plot(avg_npp_future['year'], avg_npp_future['predicted_npp'], label='Predicted Avg NPP (2024–2030)', marker='o', linestyle='--')
# plt.xlabel('Year')
# plt.ylabel('Average NPP')
# plt.title('Average Annual NPP: Historical (2011–2023) vs Predicted (2024–2030)')
# plt.grid(True)
# plt.legend()
# plt.tight_layout()
# plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Requires npp_df_cleaned from the earlier cells.
# Add a calendar-year column derived from the (datetime-converted) time column.
npp_df_cleaned['time'] = pd.to_datetime(npp_df_cleaned['time'])
npp_df_cleaned['year'] = npp_df_cleaned['time'].dt.year

# Step 1: One row per year holding that year's mean NPP.
annual_npp = npp_df_cleaned.groupby('year', as_index=False)['npp'].mean()

# Step 2: Fit a straight line of mean NPP against year.
X = annual_npp[['year']]
y = annual_npp['npp']
model = LinearRegression().fit(X, y)

# Step 3: Evaluate the fitted line for each year 2024..2030.
future_years = pd.DataFrame({'year': np.arange(2024, 2031)})
future_predictions = model.predict(future_years)

# Step 4: Store the predictions next to their years and stack them under
# the historical rows.
future_years['npp'] = future_predictions
combined_df = pd.concat([annual_npp, future_years])

# Step 5: Draw historical means (solid) and predictions (dashed) together.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    annual_npp['year'],
    annual_npp['npp'],
    marker='o',
    label='Avg NPP (2011–2023)',
)
ax.plot(
    future_years['year'],
    future_years['npp'],
    marker='o',
    linestyle='--',
    label='Predicted Avg NPP (2024–2030)',
)
ax.set_title('Linear Model: Average Annual NPP (2011–2030)')
ax.set_xlabel('Year')
ax.set_ylabel('Average NPP')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline

# Requires npp_df_cleaned from the earlier cells.
# Ensure 'time' is datetime
npp_df_cleaned['time'] = pd.to_datetime(npp_df_cleaned['time'])

# Step 1: Calculate the yearly average NPP of the historical data
npp_df_cleaned['year'] = npp_df_cleaned['time'].dt.year
annual_npp = npp_df_cleaned.groupby('year')['npp'].mean().reset_index()

# Step 2: Train a Polynomial Regression Model (degree 3)
# The degree was previously 44, which contradicts the intended degree 3 and,
# with only one data point per year, yields far more coefficients than
# observations. The year is standardised first because raw powers of values
# around 2000 make the least-squares problem badly conditioned.
POLY_DEGREE = 3
X = annual_npp[['year']]
y = annual_npp['npp']
model = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=POLY_DEGREE),
    LinearRegression(),
)
model.fit(X, y)

# Step 3: Predict for future years 2024–2030
future_years = pd.DataFrame({'year': list(range(2024, 2031))})
future_predictions = model.predict(future_years)

# Step 4: Plot historical and predicted NPP
plt.figure(figsize=(10, 6))
plt.plot(annual_npp['year'], annual_npp['npp'], marker='o', label='Avg NPP (2011–2023)')
plt.plot(future_years['year'], future_predictions, marker='o', linestyle='--', label='Predicted Avg NPP (2024–2030)')
plt.title("Polynomial Model: Average Annual NPP (2011–2030)")
plt.xlabel("Year")
plt.ylabel("Average NPP")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
