In [None]:
import os
import warnings
import logging
import sys
import pickle
import numpy as np
import pandas as pd
import geopandas as gpd
import dotenv
import pyet
from linearmodels.panel import PanelOLS, RandomEffects
from linearmodels.panel import compare



from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from linearmodels.panel import PanelOLS
import statsmodels.api as sm

# Read variables from the .env file via python-dotenv
dotenv.load_dotenv()

# Root logger reports INFO-level records and above
logging.basicConfig(level=logging.INFO)

# Silence all Python warnings for the rest of the session
warnings.filterwarnings("ignore")

# pandas display settings: no column limit, two-decimal floats, untruncated cell text
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.float_format', '{:.2f}'.format),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

In [7]:
# Deserialize the pickled dataset into `data`.
# NOTE: pickle.load can execute arbitrary code; only use it on a trusted file.
with open('../../data/Data_Iran_ETo_1983_2022.pkl', 'rb') as pickle_file:
    data = pickle.load(pickle_file)
logging.info("Data loaded from pickle file.")

INFO:root:Data loaded from pickle file.


In [8]:
# Display the loaded frame; the rendered output shows only its first and last rows.
data

Unnamed: 0,date,station_id,station_name,region_id,region_name,lat,lon,station_elevation,tmax,tmin,tm,umax,umin,um,ffm,sshn,rrr24,Penman,FAO-56,Priestley-Taylor,Kimberly-Penman,Thom-Oliver,Blaney-Criddle,Hamon,Romanenko,Linacre,Haude,Turc,Jensen-Haise,Mcguinness-Bordne,Hargreaves,FAO-24,Abtew,Makkink,Oudin,0
0,1983-01-01 12:00:00,40752,Karaj,ALKK,Alborz,35.81,50.95,1292.90,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1983-01-02 12:00:00,40752,Karaj,ALKK,Alborz,35.81,50.95,1292.90,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1983-01-03 12:00:00,40752,Karaj,ALKK,Alborz,35.81,50.95,1292.90,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1983-01-04 12:00:00,40752,Karaj,ALKK,Alborz,35.81,50.95,1292.90,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1983-01-05 12:00:00,40752,Karaj,ALKK,Alborz,35.81,50.95,1292.90,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6252647,2022-12-26 12:00:00,99440,Kahak,QOQM,Qom,34.40,50.87,1403.10,10.50,1.30,4.60,81.00,44.00,66.38,2.00,6.40,0.00,0.97,1.23,0.83,1.56,1.11,0.62,0.60,2.01,1.44,1.15,0.87,0.75,0.97,1.07,1.11,2.10,1.32,0.66,
6252648,2022-12-27 12:00:00,99440,Kahak,QOQM,Qom,34.40,50.87,1403.10,7.40,-1.40,3.20,90.00,57.00,77.12,2.12,0.70,0.40,0.74,0.85,0.69,1.11,0.76,0.49,0.55,1.23,1.54,0.64,0.38,0.30,0.83,0.98,0.32,1.04,0.62,0.56,
6252649,2022-12-28 12:00:00,99440,Kahak,QOQM,Qom,34.40,50.87,1403.10,6.60,1.90,2.60,92.00,55.00,73.25,2.50,2.80,0.01,0.81,0.98,0.72,1.37,0.83,0.43,0.53,1.29,1.09,0.70,0.41,0.38,0.77,0.70,0.57,1.43,0.84,0.52,
6252650,2022-12-29 12:00:00,99440,Kahak,QOQM,Qom,34.40,50.87,1403.10,11.50,-3.20,3.10,64.00,37.00,51.38,2.75,9.00,0.00,1.14,1.67,0.80,2.41,1.32,0.48,0.55,2.45,1.35,1.78,0.76,0.75,0.82,1.27,1.55,2.60,1.55,0.56,


In [9]:
# Keep only stations with at least 20 * 365 non-missing FAO-56 values.
# A station is identified by its (region_id, station_id) pair.
min_rows = 20 * 365
station_keys = ['region_id', 'station_id']

# Per-station count of non-null FAO-56 values, broadcast back onto every row.
# This selects the same rows (in the same order) as
# groupby(...).filter(lambda x: x['FAO-56'].count() >= min_rows), but avoids
# calling a Python function once per group.
fao56_counts = data.groupby(station_keys)['FAO-56'].transform('count')
df = data.loc[fao56_counts >= min_rows]

# Log scalar counts: interpolating the Series returned by .nunique() printed a
# multi-line repr (including its dtype footer) into the log message.
n_stations = len(df[station_keys].drop_duplicates())
n_regions = df['region_id'].nunique()
logging.info(f"Filtered valid stations. Remaining stations: {n_stations} (regions: {n_regions})")
df.info()

INFO:root:Filtered valid stations. Remaining stations: region_id      31
station_id    146
dtype: int64


<class 'pandas.core.frame.DataFrame'>
Index: 2132914 entries, 0 to 6194215
Data columns (total 36 columns):
 #   Column             Dtype         
---  ------             -----         
 0   date               datetime64[ns]
 1   station_id         object        
 2   station_name       object        
 3   region_id          object        
 4   region_name        object        
 5   lat                float64       
 6   lon                float64       
 7   station_elevation  float64       
 8   tmax               float64       
 9   tmin               float64       
 10  tm                 float64       
 11  umax               float64       
 12  umin               float64       
 13  um                 float64       
 14  ffm                float64       
 15  sshn               float64       
 16  rrr24              float64       
 17  Penman             float64       
 18  FAO-56             float64       
 19  Priestley-Taylor   float64       
 20  Kimberly-Penman    float64   