In [7]:
import os
import sys
import sqlite3
import numpy as np
import pandas as pd
import geopandas as gp
import plotly.express as px
import matplotlib.pyplot as plt
import scipy.stats as scs

import pyet
import pyeto
import spei

In [8]:
sys.path.append('/home/pooya/w/DroughtMonitoringIran/')

DATA_PATH = "./assets/data/GEE_CSV_Exports/"
DATABASE_PATH = "./database/database.db"

### PET MODIS

In [9]:
# Read All Stations Except for Babolsar, Nowshahr, Bandar-e-anzali
file_name = "MOD16A2GF_Monthly_PET_2000_2024.csv"
pet_modise = pd.read_csv(filepath_or_buffer=DATA_PATH + file_name, na_values=-999)
pet_modise = pet_modise[["St_ID", "date", "mean"]]

pet_modise.rename(
    columns={
        "St_ID": "Station_ID",
        "date": "Date",
        "mean": "PET_MOD16A2GF",
    },
    inplace=True
)

pet_dataset = pet_modise.copy()

pet_dataset['Date'] = pd.to_datetime(pet_dataset['Date'])
pet_dataset.sort_values(by=["Station_ID", "Date"], inplace=True)
pet_dataset.reset_index(drop=True, inplace=True)

expanded_rows = []

for _, row in pet_dataset.iterrows():
    for i in range(8):
        expanded_rows.append({
            'Station_ID': row['Station_ID'],
            'Date': row['Date'] - pd.Timedelta(days=i),
            'PET_MOD16A2GF': row['PET_MOD16A2GF'] / 8
        })

pet_daily_dataset = pd.DataFrame(expanded_rows)
pet_daily_dataset["Date"] = pd.to_datetime(pet_daily_dataset["Date"])
pet_daily_dataset = pet_daily_dataset.sort_values(['Station_ID', 'Date']).reset_index(drop=True)

pet_daily_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211968 entries, 0 to 211967
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   Station_ID     211968 non-null  int64         
 1   Date           211968 non-null  datetime64[ns]
 2   PET_MOD16A2GF  211752 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 4.9 MB


### Daily to Monthly

In [10]:
tmp = pet_daily_dataset.copy()
tmp["Date"] = tmp["Date"].dt.to_period("M").astype(str)

pet_monthly_dataset = tmp.groupby(["Station_ID", "Date"])["PET_MOD16A2GF"].sum(min_count=25).reset_index()
pet_monthly_dataset["Date"] = pd.to_datetime(pet_monthly_dataset["Date"]) + pd.offsets.MonthEnd(0)

print(pet_monthly_dataset.info())
pet_monthly_dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6936 entries, 0 to 6935
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Station_ID     6936 non-null   int64         
 1   Date           6936 non-null   datetime64[ns]
 2   PET_MOD16A2GF  6903 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 162.7 KB
None


Unnamed: 0,Station_ID,Date,PET_MOD16A2GF
0,40709,1999-12-31,
1,40709,2000-01-31,44.9250
2,40709,2000-02-29,72.2375
3,40709,2000-03-31,127.6375
4,40709,2000-04-30,165.7000
...,...,...,...
6931,99361,2023-08-31,239.2750
6932,99361,2023-09-30,176.6500
6933,99361,2023-10-31,123.0125
6934,99361,2023-11-30,98.8000


In [11]:
df = pet_monthly_dataset.copy()
df['Month'] = df['Date'].dt.month
monthly_avg = df.groupby(['Station_ID', 'Month'])[['PET_MOD16A2GF']].mean(numeric_only=True)

def replace_with_avg(row):
    if pd.isna(row['PET_MOD16A2GF']):
        return monthly_avg.loc[(row['Station_ID'], row['Month']), 'PET_MOD16A2GF']
    else:
        return row['PET_MOD16A2GF']

df['PET_MOD16A2GF'] = df.apply(lambda row: replace_with_avg(row), axis=1)

df.drop(columns=['Month'], inplace=True)

pet_monthly_dataset = df.copy()

pet_monthly_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6936 entries, 0 to 6935
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Station_ID     6936 non-null   int64         
 1   Date           6936 non-null   datetime64[ns]
 2   PET_MOD16A2GF  6936 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 162.7 KB


In [12]:
# Water-Year: 1385-86 to 1402-03

min_date = '2006-09-01' # 18 Years - 24 Stations - Start: 2006-09-23
max_date = '2024-06-30' # End: 2024-09-21

pet_monthly_dataset = pet_monthly_dataset\
    .query("Date >= @min_date and Date <= @max_date")

pet_monthly_dataset

Unnamed: 0,Station_ID,Date,PET_MOD16A2GF
81,40709,2006-09-30,125.7250
82,40709,2006-10-31,80.7750
83,40709,2006-11-30,44.0000
84,40709,2006-12-31,35.0250
85,40709,2007-01-31,60.4000
...,...,...,...
6931,99361,2023-08-31,239.2750
6932,99361,2023-09-30,176.6500
6933,99361,2023-10-31,123.0125
6934,99361,2023-11-30,98.8000


In [13]:
conn = sqlite3.connect(DATABASE_PATH)

pet_monthly_dataset.to_sql('gee_pet_monthly', conn, if_exists='replace', index=False)

conn.commit()

conn.close()