In [1]:
import pathlib

import ee
import geopandas as gpd
import numpy as np
import pandas as pd

import agrigee_lite as agl

np.set_printoptions(suppress=True)
np.set_printoptions(precision=2)
pd.set_option('display.float_format', '{:.2f}'.format)

# pandas show all columns
pd.set_option('display.max_columns', None)

%load_ext autoreload
%autoreload 2

In [2]:
sat = agl.sat.Sentinel2()

In [3]:
ee.Initialize(opt_url="https://earthengine-highvolume.googleapis.com", project="ee-paulagibrim")

In [4]:
gdf = gpd.read_parquet("data/mt_crops.parquet")

gdf["start_date"] = pd.to_datetime(gdf.year.apply(lambda year: f"{year-1}-10-01"))
gdf["end_date"] = pd.to_datetime(gdf.year.apply(lambda year: f"{year}-10-01"))

In [5]:
gdf = gdf[gdf.start_date >= sat.startDate].reset_index(drop=True)

In [6]:
gdf = gdf[gdf.ha>=6.25].reset_index(drop=True)

In [7]:
df = agl.get.multiple_sits(gdf.iloc[0:1000], sat)

Simplifying clusters: 100%|██████████| 1/1 [00:00<00:00,  1.91it/s]
Downloading multiple sits: 100%|██████████| 1000/1000 [02:14<00:00,  7.46it/s]


In [12]:
df.dtypes

indexnum              int64
timestamp    datetime64[ns]
blue                float16
green               float16
red                 float16
re1                 float16
re2                 float16
re3                 float16
nir                 float16
re4                 float16
swir1               float16
swir2               float16
dtype: object

In [10]:
from tqdm.std import tqdm

In [None]:
grouped = df.groupby('indexnum')

max_seq_len = grouped.size().max()
num_samples = len(grouped)
num_bands = len(df.columns) - 2

X = np.zeros((num_samples, max_seq_len, num_bands), dtype=np.float16)
T = np.zeros((num_samples, max_seq_len), dtype='datetime64[D]')

for idx, group in tqdm(grouped):
    group_sorted = group.sort_values('timestamp')
    seq_len = len(group_sorted)

    X[idx, :seq_len, :] = group_sorted.drop(columns=['timestamp', 'indexnum']).to_numpy(dtype=np.float16)
    T[idx, :seq_len] = group_sorted['timestamp'].to_numpy().astype('datetime64[D]')

100%|██████████| 1000/1000 [00:00<00:00, 2636.31it/s]


In [24]:
T

array([['2019-10-04', '2019-10-19', '2019-10-24', ..., '1970-01-01',
        '1970-01-01', '1970-01-01'],
       ['2020-10-03', '2020-10-08', '2020-10-23', ..., '1970-01-01',
        '1970-01-01', '1970-01-01'],
       ['2021-10-03', '2021-10-08', '2021-10-13', ..., '1970-01-01',
        '1970-01-01', '1970-01-01'],
       ...,
       ['2020-10-03', '2020-10-08', '2020-10-13', ..., '1970-01-01',
        '1970-01-01', '1970-01-01'],
       ['2021-10-03', '2021-10-08', '2021-10-13', ..., '1970-01-01',
        '1970-01-01', '1970-01-01'],
       ['2022-10-03', '2022-10-08', '2022-11-02', ..., '1970-01-01',
        '1970-01-01', '1970-01-01']], dtype='datetime64[D]')

In [20]:
np.zeros(5, dtype='datetime64[D]')

array(['1970-01-01', '1970-01-01', '1970-01-01', '1970-01-01',
       '1970-01-01'], dtype='datetime64[D]')

In [None]:
agl.__version__

In [None]:
agl.vis.multiple_sits(gdf[gdf.crop_name=="Soybean"].sample(10, random_state=42).reset_index(drop=True), "ndvi", agl.sat.Sentinel2(bands=["red", "nir"]))

In [None]:
gdf = gdf.sample(10000, random_state=42).reset_index(drop=True)

In [None]:
agl.get.multiple_sits(gdf, sat, chunksize=1000).sort_values("indexnum", kind="stable")

In [None]:
agl.get.multiple_sits(gdf.sample(100).reset_index(drop=True), sat, chunksize=1000, reducers=["median", "std"])

In [None]:
row = gdf.iloc[0]
agl.vis.sits(row.geometry, row.start_date, row.end_date, agl.sat.Sentinel2(bands=["nir", "red"]), "evi2")

In [None]:
print()

In [None]:
# gdf = gpd.read_parquet("data_new/BA.parquet")

# gdf.crop_class.value_counts()

In [None]:
results = gdf[filter(lambda x: x.startswith("s2sr"), gdf.columns.to_list())]

In [None]:
from agrigee_lite.misc import wide_to_long_dataframe

In [None]:
gdf.crop_class.value_counts()

In [None]:
agl.vis.multiple_sits(gdf[gdf.crop_class=="Soybean"].sample(50, random_state=25).reset_index(drop=True), "evi2", sat)

In [None]:
agl.vis.multiple_sits(gdf[gdf.crop_class=="Sugar Cane"].sample(50, random_state=25).reset_index(drop=True), "evi2", sat)

In [None]:
agl.vis.multiple_sits(gdf[gdf.crop_class=="Forest Plantation"].sample(50, random_state=25).reset_index(drop=True), "evi2", sat)

In [None]:
agl.vis.multiple_sits(gdf[gdf.crop_class=="Sugar Cane"].sample(50, random_state=25).reset_index(drop=True), "evi2", sat)

In [None]:
agl.vis.multiple_sits(gdf[gdf.crop_class=="Other Temporary Crops"].sample(50, random_state=25).reset_index(drop=True), "evi2", sat)

In [None]:
agl.vis.multiple_sits(gdf[gdf.crop_class=="Cotton"].sample(50, random_state=25).reset_index(drop=True), "evi2", sat)

In [None]:
gdf.crop_class.value_counts()

In [None]:
gdf[gdf.crop_class=="Sugar Cane"].sample(1, random_state=42)

In [None]:
gdf[gdf.crop_class=="Mosaic of Uses"].sample(1, random_state=42)

In [None]:
gdf = gpd.read_parquet("data/mt_crops.parquet")

In [None]:
gdf.crop_name.value_counts()

In [None]:
for input_file in pathlib.Path("data").glob("*.parquet"):
    gdf = gpd.read_parquet(input_file)
    results = agl.get.multiple_sits(gdf, sat, subsampling_max_pixels=1000, initial_concurrency=40)
    gdf = gdf.join(results)
    gdf.to_parquet(f"data_new/{gdf.stem}.parquet")

In [None]:
gdf = gpd.read_parquet("MG_crops.parquet")

gdf.sample(5, random_state=42)

In [None]:
gdf["start_date"] = pd.to_datetime(gdf["start_date"])
gdf["end_date"] = pd.to_datetime(gdf["end_date"])

In [None]:
sampled_gdf = gdf.sample(250, random_state=42).reset_index(drop=True)

results = agl.get.multiple_sits(sampled_gdf, agl.sat.Landsat8(bands=["red"]), date_types=["timestamp"], force_redownload=True)

In [None]:
sampled_gdf_w_results = sampled_gdf.join(results)

In [None]:
sampled_gdf_w_results[sampled_gdf_w_results.l8sr_observations==0].ha.max()