<div style="color:green">
    <center>
        <h1><b>IRIMO Dataset</b></h1>
    </center>
</div>

<div style="color:orange">
        <h3><b>Import Libraries</b></h3>
</div>

In [1]:
import os
import sys
import pickle
import warnings
import logging
import sqlite3
import glob
import zipfile
import numpy as np
import pandas as pd
import geopandas as gpd
from tabulate import tabulate
from tqdm import tqdm
from pyproj import CRS
import plotly.express as px
import plotly.graph_objects as go
import contextily as ctx
import dotenv
import rasterio
import rasterio.merge
from rasterio.mask import mask
import matplotlib.pyplot as plt
from matplotlib.colors import LightSource

# Load variables defined in a .env file into the process environment
dotenv.load_dotenv()

# Emit log records of level INFO and above
logging.basicConfig(level=logging.INFO)

# Silence all warnings for the rest of the session
warnings.filterwarnings("ignore")

# Pandas display: no column limit, floats with two decimals, untruncated cell text
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.float_format', '{:.2f}'.format),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

<div style="color:orange">
        <h3><b>Load Data</b></h3>
</div>

In [2]:
# Resolve the input locations from environment variables
DATA_FOLDER_PATH, GEO_DATA_PATH, DATA_NAME = (
    os.getenv(variable_name)
    for variable_name in ('DATA_FOLDER_PATH', 'GEO_DATA_PATH', 'DATA_NAME')
)

# Stop early when any required setting is missing or empty
if not all((DATA_FOLDER_PATH, DATA_NAME, GEO_DATA_PATH)):
    logging.error("Environment variables DATA_FOLDER_PATH, DATA_NAME, or GEO_DATA_PATH are not set.")
    sys.exit(1)

# The dataset is read from <DATA_FOLDER_PATH>/<DATA_NAME>.parquet
DATA_PATH = os.path.join(DATA_FOLDER_PATH, f"{DATA_NAME}.parquet")

# Read the Parquet file with pandas; any failure is logged and ends the run
try:
    data = pd.read_parquet(DATA_PATH)
    logging.info(f"Data loaded successfully from {DATA_PATH}")
except Exception as load_error:
    logging.error(f"Error loading data from {DATA_PATH}: {load_error}")
    sys.exit(1)

INFO:root:Data loaded successfully from C:\Users\Pooya\Dropbox\IRIMO\Export\Mazandaran_Monthly_ETo_1951_2025.parquet


In [3]:
# Where the companion *_count column is 0, replace the value with NaN
# (presumably *_count is the number of observations behind the monthly value — TODO confirm)
for variable in ('rrr24', 'evt'):
    data.loc[data[f'{variable}_count'] == 0, variable] = np.nan

In [4]:
# Summarize the frame: index range, per-column non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13455 entries, 0 to 20630
Data columns (total 65 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   year               13455 non-null  int32         
 1   month              13455 non-null  int32         
 2   region_id          13455 non-null  object        
 3   region_name        13455 non-null  object        
 4   station_id         13455 non-null  object        
 5   station_name       13455 non-null  object        
 6   lat                13455 non-null  float64       
 7   lon                13455 non-null  float64       
 8   station_elevation  13455 non-null  float64       
 9   tmax               5790 non-null   float64       
 10  tmax_count         13455 non-null  int64         
 11  tmin               5791 non-null   float64       
 12  tmin_count         13455 non-null  int64         
 13  tm                 5790 non-null   float64       
 14  tm_count   

In [5]:
# Distinct station identifiers present in the dataset
data["station_id"].unique()

array(['99361', '99309', '40736', '99357', '99306', '99299', '40737',
       '40760', '99348', '40734', '99360', '40732', '40759', '40788',
       '40735'], dtype=object)

<div style="color:orange">
        <h3><b>Data Cleansing</b></h3>
</div>

<div style="color:red">
        <span><b>Show the columns of the dataset</b></span>
</div>

In [6]:
# Print the column names as a one-column GitHub-style table;
# the first row acts as the table header
column_rows = [["Column Name"]]
column_rows.extend([column] for column in data.columns)
print(tabulate(tabular_data=column_rows, headers="firstrow", tablefmt="github"))

| Column Name       |
|-------------------|
| year              |
| month             |
| region_id         |
| region_name       |
| station_id        |
| station_name      |
| lat               |
| lon               |
| station_elevation |
| tmax              |
| tmax_count        |
| tmin              |
| tmin_count        |
| tm                |
| tm_count          |
| umax              |
| umax_count        |
| umin              |
| umin_count        |
| um                |
| um_count          |
| ffm               |
| ffm_count         |
| sshn              |
| sshn_count        |
| pm                |
| pm_count          |
| p0m               |
| p0m_count         |
| ewm               |
| ewm_count         |
| radglo24          |
| radglo24_count    |
| td_m              |
| td_m_count        |
| twet_m            |
| twet_m_count      |
| tsoil_m           |
| tsoil_m_count     |
| ewsm              |
| ewsm_count        |
| evt               |
| evt_count         |
| rrr24   

In [7]:
# Remove fully duplicated rows in place and renumber the index from 0 in the same call
data.drop_duplicates(inplace=True, ignore_index=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13455 entries, 0 to 13454
Data columns (total 65 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   year               13455 non-null  int32         
 1   month              13455 non-null  int32         
 2   region_id          13455 non-null  object        
 3   region_name        13455 non-null  object        
 4   station_id         13455 non-null  object        
 5   station_name       13455 non-null  object        
 6   lat                13455 non-null  float64       
 7   lon                13455 non-null  float64       
 8   station_elevation  13455 non-null  float64       
 9   tmax               5790 non-null   float64       
 10  tmax_count         13455 non-null  int64         
 11  tmin               5791 non-null   float64       
 12  tmin_count         13455 non-null  int64         
 13  tm                 5790 non-null   float64       
 14  tm_cou

<div style="color:orange">
        <h3><b>Find Duplicate Stations</b></h3>
</div>

In [8]:
# One row per distinct (region_id, region_name, station_name, station_id) combination
d = data.loc[:, ['region_id', 'region_name', 'station_name', 'station_id']].drop_duplicates(ignore_index=True)

# Rows whose (region_name, station_name) pair appears more than once;
# keep=False lists every member of a clash, not just the repeats
d.loc[d.duplicated(subset=['region_name', 'station_name'], keep=False)]

Unnamed: 0,region_id,region_name,station_name,station_id


<div style="color:orange">
        <h3><b>Extract GeoInfo</b></h3>
</div>

<div style="color:red">
        <span><b>Stations</b></span>
</div>

In [9]:
# Station catalogue: one row per distinct combination of identifiers,
# coordinates and elevation, indexed 0..n-1
geo_info = (
    data[['region_id', 'region_name', 'station_name', 'station_id', 'lat', 'lon', 'station_elevation']]
    .drop_duplicates(ignore_index=True)
)
geo_info

Unnamed: 0,region_id,region_name,station_name,station_id,lat,lon,station_elevation
0,MASA,Mazandaran,Alasht,99361,36.07,52.84,1805.0
1,MASA,Mazandaran,Amol,99309,36.48,52.47,23.7
2,MASA,Mazandaran,Babolsar,40736,36.7,52.64,-21.0
3,MASA,Mazandaran,Baladeh,99357,36.2,51.8,2120.0
4,MASA,Mazandaran,Bandar-E-Amirabad,99306,36.86,53.39,-20.0
5,MASA,Mazandaran,Galugah,99299,36.74,53.84,-10.0
6,MASA,Mazandaran,Gharakhil,40737,36.49,52.11,14.7
7,MASA,Mazandaran,Kiyasar,40760,36.25,53.55,1294.3
8,MASA,Mazandaran,Kojur,99348,36.39,51.73,1550.0
9,MASA,Mazandaran,Nowshahr,40734,36.66,51.47,-20.9


In [10]:
# Build one point geometry per station (x = lon, y = lat)
geo_info_gpd = gpd.GeoDataFrame(
    data=geo_info,
    geometry=gpd.points_from_xy(geo_info["lon"], geo_info["lat"])
)

# Declare the coordinates as EPSG:4326 (geographic lat/lon)
geo_info_gpd.crs = CRS("EPSG:4326")

# sort_values returns a new frame, so the sorted result must be assigned back.
# The previous form called reset_index(inplace=True) on that temporary and the
# sort was silently discarded.
geo_info_gpd = (
    geo_info_gpd
    .sort_values(by=["region_name", "station_name"])
    .reset_index(drop=True)
)

geo_info_gpd

Unnamed: 0,region_id,region_name,station_name,station_id,lat,lon,station_elevation,geometry
0,MASA,Mazandaran,Alasht,99361,36.07,52.84,1805.0,POINT (52.84 36.07)
1,MASA,Mazandaran,Amol,99309,36.48,52.47,23.7,POINT (52.47 36.48)
2,MASA,Mazandaran,Babolsar,40736,36.7,52.64,-21.0,POINT (52.64 36.7)
3,MASA,Mazandaran,Baladeh,99357,36.2,51.8,2120.0,POINT (51.8 36.2)
4,MASA,Mazandaran,Bandar-E-Amirabad,99306,36.86,53.39,-20.0,POINT (53.39 36.86)
5,MASA,Mazandaran,Galugah,99299,36.74,53.84,-10.0,POINT (53.84 36.74)
6,MASA,Mazandaran,Gharakhil,40737,36.49,52.11,14.7,POINT (52.11 36.49)
7,MASA,Mazandaran,Kiyasar,40760,36.25,53.55,1294.3,POINT (53.55 36.25)
8,MASA,Mazandaran,Kojur,99348,36.39,51.73,1550.0,POINT (51.73 36.39)
9,MASA,Mazandaran,Nowshahr,40734,36.66,51.47,-20.9,POINT (51.47 36.66)


<div style="color:orange">
        <h3><b>Work on DEM Files</b></h3>
</div>


In [11]:
# Folder layout under GEO_DATA_PATH/SRTM: zipped tiles are globbed from the top
# level, extracted tiles go to HGT/, GeoTIFF products go to DEM/
SRTM_ZIP_DIR = f"{GEO_DATA_PATH}/SRTM"
HGT_DIR = f"{SRTM_ZIP_DIR}/HGT"
DEM_DIR = f"{SRTM_ZIP_DIR}/DEM"

# Create the working folders when they do not exist yet
for output_dir in (HGT_DIR, DEM_DIR):
    os.makedirs(output_dir, exist_ok=True)

# Province boundary shapefile and the two DEM products written by later cells
OSTAN_SHP = f"{GEO_DATA_PATH}/Iran_1400/Ostan.shp"
MERGED_DEM = os.path.join(DEM_DIR, "SRTM_30m_merged.tif")
CLIPPED_DEM = os.path.join(DEM_DIR, "Mazandaran_SRTM_30m.tif")

In [12]:
# Collect the zipped SRTM tiles and report how many were found
zip_files = glob.glob(os.path.join(SRTM_ZIP_DIR, "*.hgt.zip"))
print(f"تعداد فایل zip پیدا شد: {len(zip_files)}")

# Unpack every archive into the HGT folder
for archive_path in zip_files:
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(path=HGT_DIR)

# List the .hgt tiles now present in the HGT folder and report the count
hgt_files = glob.glob(os.path.join(HGT_DIR, "*.hgt"))
print(f"تعداد فایل HGT پیدا شد: {len(hgt_files)}")

تعداد فایل zip پیدا شد: 10


تعداد فایل HGT پیدا شد: 10


In [13]:
# Mosaic all HGT tiles into a single GeoTIFF (MERGED_DEM).
# Fail with a clear message instead of an obscure merge error when no tiles exist.
if not hgt_files:
    raise FileNotFoundError(f"No .hgt tiles found in {HGT_DIR}")

src_files_to_mosaic = []
try:
    # Open every tile; the datasets must stay open while they are merged
    for fp in hgt_files:
        src_files_to_mosaic.append(rasterio.open(fp))

    mosaic, out_transform = rasterio.merge.merge(src_files_to_mosaic)

    # Start from the first raster's metadata and adapt it to the mosaic
    out_meta = src_files_to_mosaic[0].meta.copy()
    out_meta.update({
        "driver": "GTiff",
        "height": mosaic.shape[1],
        "width": mosaic.shape[2],
        "transform": out_transform,
        # Band count follows the mosaic array rather than a hard-coded 1
        "count": mosaic.shape[0],
        "dtype": mosaic.dtype
    })

    with rasterio.open(MERGED_DEM, "w", **out_meta) as dest:
        dest.write(mosaic)
finally:
    # Always release the tile handles, even when merging or writing fails
    for s in src_files_to_mosaic:
        s.close()

print("DEM یکپارچه ذخیره شد:", MERGED_DEM)

DEM یکپارچه ذخیره شد: ../assets/geo_data/SRTM/DEM\SRTM_30m_merged.tif


In [14]:
# Province (ostan) boundaries used as the clipping mask
ostan_gpd = gpd.read_file(OSTAN_SHP)

# NOTE(review): every polygon in the shapefile is used as the mask, although the
# output file is named for Mazandaran only — confirm whether the layer should be
# filtered to that province first.
with rasterio.open(MERGED_DEM) as src:
    # Bring the boundaries into the DEM's CRS before masking
    dem_crs = src.crs
    ostan_dem_crs = ostan_gpd.to_crs(dem_crs)
    shapes = list(ostan_dem_crs.geometry)

    # crop=True shrinks the output to the extent of the shapes
    out_image, out_transform = mask(src, shapes, crop=True)
    out_meta = src.meta.copy()

# Adapt the metadata to the clipped array
out_meta.update(
    height=out_image.shape[1],
    width=out_image.shape[2],
    transform=out_transform,
)

with rasterio.open(CLIPPED_DEM, "w", **out_meta) as dest:
    dest.write(out_image)

print("DEM کلیپ‌شده ذخیره شد:", CLIPPED_DEM)

DEM کلیپ‌شده ذخیره شد: ../assets/geo_data/SRTM/DEM\Mazandaran_SRTM_30m.tif


<div style="color:red">
        <span><b>Map</b></span>
</div>

In [15]:
# Interactive map: station markers and province outlines drawn over an
# OpenTopoMap raster basemap.

# Read the province boundaries (same path as OSTAN_SHP defined earlier)
ostan_gpd = gpd.read_file(
    filename=f"{GEO_DATA_PATH}/Iran_1400/Ostan.shp"
)

# Reproject to EPSG:4326 when the layer declares a different CRS;
# a layer without any CRS is left untouched
if ostan_gpd.crs is not None and ostan_gpd.crs.to_epsg() != 4326:
    ostan_gpd = ostan_gpd.to_crs(4326)

# String id per province, referenced by the choropleth through featureidkey="properties.id"
ostan_gpd["id"] = ostan_gpd.index.astype(str)
ostan_json = ostan_gpd.__geo_interface__

# Work on a copy so the hover column is not added to geo_info_gpd
geo_info_map = geo_info_gpd.copy()
# HTML snippet shown when hovering a station marker
geo_info_map['Hover_Info'] = geo_info_map.apply(
    lambda row: f"""
    Station Name: <b>{row['station_name']}</b><br>
    Station ID: <b>{row['station_id']}</b><br>
    Province: <b>{row['region_name']}</b><br>
    Latitude: <b>{row['lat']}</b><br>
    Longitude: <b>{row['lon']}</b><br>
    Elevation: <b>{row['station_elevation']} m</b><br>
    """,
    axis=1
)

# Station markers, coloured by region; Hover_Info is passed as custom_data so the
# hovertemplate set further down can reference it as customdata[0]
fig = px.scatter_mapbox(
    geo_info_map,
    lat=geo_info_map.geometry.y,
    lon=geo_info_map.geometry.x,
    color="region_name",
    hover_name=geo_info_map['station_name'],
    hover_data={'station_id': True, 'station_name': False, 'Hover_Info': False},
    custom_data=['Hover_Info'],
    zoom=7,
    height=600,
    width=1000
)

# Province polygons: constant z and no colour scale, so they act as a uniform
# semi-transparent overlay with black outlines; hover is disabled for this trace
fig.add_trace(
    go.Choroplethmapbox(
        geojson=ostan_json,
        locations=ostan_gpd["id"],
        z=[1] * len(ostan_gpd),
        featureidkey="properties.id",
        showscale=False,
        marker=dict(
            opacity=0.2,
            line=dict(width=1.2, color="black")
        ),
        hoverinfo="skip"
    )
)

fig.update_layout(
    # Custom map style with a single raster layer fed by OpenTopoMap tiles
    mapbox={
        "style": {
            "version": 8,
            "sources": {
                "elev_src": {
                    "type": "raster",
                    "tiles": [
                        "https://tile.opentopomap.org/{z}/{x}/{y}.png"
                    ],
                    "tileSize": 256
                }
            },
            "layers": [
                {
                    "id": "elevation-only",
                    "type": "raster",
                    "source": "elev_src"
                }
            ]
        }
    },
    # No outer margins: the map fills the whole figure
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    hoverlabel=dict(
        bgcolor="white",
        font_size=16,
        font_family="Arial",
        align="left"
    ),
    # Legend anchored near the top-left corner of the map
    legend=dict(
        yanchor="top",
        y=0.98,
        xanchor="left",
        x=0.01,
        title="",
        traceorder="normal",
        title_font_family="Times New Roman",
        font=dict(
            family="Times New Roman",
            size=14,
            color="black"
        ),
        bgcolor="snow",
        bordercolor="Black",
        borderwidth=0.5
    )
)

# Only update the station markers (scattermapbox), not the choropleth
fig.update_traces(
    hovertemplate="%{customdata[0]}",
    marker=dict(size=20),
    selector=dict(type="scattermapbox")  # restricts the update to the station trace
)

fig.show()


<div style="color:red">
        <span><b>Export GeoInfo</b></span>
</div>

In [16]:
# Export the station layer as GeoJSON and as an ESRI Shapefile.
# Files are written under GEO_DATA_PATH as read from the environment in the
# Load Data cell; the earlier hard-coded reassignment of GEO_DATA_PATH here
# silently overrode that validated setting and has been removed.

# Short attribute names for the exported layer
# (presumably to stay within the Shapefile 10-character field-name limit — TODO confirm)
geo_info_to_file = geo_info_gpd.rename(
    columns={
        "region_name": "Region",
        "station_id": "St_ID",
        "station_name": "St_Name",
        "lat": "St_Lat",
        "lon": "St_Lon",
        "station_elevation": "St_Ele",
    }
)

geo_info_to_file.to_file(
    filename=f'{GEO_DATA_PATH}/MazandaranStationsIRIMO.geojson',
    driver='GeoJSON',
    index=False
)

geo_info_to_file.to_file(
    filename=f'{GEO_DATA_PATH}/MazandaranStationsIRIMO.shp',
    driver='ESRI Shapefile',
    index=False
)

INFO:pyogrio._io:Created 15 records
INFO:pyogrio._io:Created 15 records


<div style="color:orange">
        <h3><b>Database</b></h3>
</div>

In [19]:
# Build "<year>-<month>-01" strings, parse them, then roll each date forward to
# the last day of its month (MonthEnd(0) leaves dates already on a month end unchanged)
month_start_text = data["year"].astype(str) + "-" + data["month"].astype(str) + "-01"
data["date"] = pd.to_datetime(month_start_text) + pd.offsets.MonthEnd(0)
data

Unnamed: 0,year,month,region_id,region_name,station_id,station_name,lat,lon,station_elevation,tmax,tmax_count,tmin,tmin_count,tm,tm_count,umax,umax_count,umin,umin_count,um,um_count,ffm,ffm_count,sshn,sshn_count,pm,pm_count,p0m,p0m_count,ewm,ewm_count,radglo24,radglo24_count,td_m,td_m_count,twet_m,twet_m_count,tsoil_m,tsoil_m_count,ewsm,ewsm_count,evt,evt_count,rrr24,rrr24_count,date,Penman,Penman-Monteith,ASCE-PM,FAO-56,Priestley-Taylor,Kimberly-Penman,Thom-Oliver,Blaney-Criddle,Hamon,Romanenko,Linacre,Turc,Jensen-Haise,Mcguinness-Bordne,Hargreaves,FAO-24,Abtew,Makkink,Oudin
0,1951,1,MASA,Mazandaran,99361,Alasht,36.07,52.84,1805.00,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,1951-01-31,,,,,,,,,,,,,,,,,,,
1,1951,2,MASA,Mazandaran,99361,Alasht,36.07,52.84,1805.00,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,1951-02-28,,,,,,,,,,,,,,,,,,,
2,1951,3,MASA,Mazandaran,99361,Alasht,36.07,52.84,1805.00,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,1951-03-31,,,,,,,,,,,,,,,,,,,
3,1951,4,MASA,Mazandaran,99361,Alasht,36.07,52.84,1805.00,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,1951-04-30,,,,,,,,,,,,,,,,,,,
4,1951,5,MASA,Mazandaran,99361,Alasht,36.07,52.84,1805.00,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,,0,1951-05-31,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13450,2025,5,MASA,Mazandaran,40735,Siahbisheh,36.23,51.30,1855.40,22.35,31,10.65,31,16.51,31,81.48,31,35.16,31,56.75,31,2.54,31,7.90,31,1010.27,31,815.66,31,9.86,31,2126.08,24,6.23,31,11.12,31,9.32,31,19.57,31,147.90,31,37.41,31,2025-05-31,131.04,145.09,145.50,145.50,136.53,119.44,133.06,99.97,117.24,192.01,135.54,118.02,130.88,159.02,135.75,129.23,142.22,120.13,108.18
13451,2025,6,MASA,Mazandaran,40735,Siahbisheh,36.23,51.30,1855.40,21.96,30,11.50,30,16.73,30,88.63,30,48.67,30,68.79,30,2.34,30,6.20,30,1009.17,30,814.88,30,11.89,30,1814.59,27,8.96,30,12.28,30,9.93,30,19.70,30,114.80,30,12.81,30,2025-06-30,117.37,122.33,122.68,122.68,128.04,109.54,115.36,103.08,123.83,142.01,131.42,105.36,116.40,162.29,130.50,103.82,125.07,106.05,110.40
13452,2025,7,MASA,Mazandaran,40735,Siahbisheh,36.23,51.30,1855.40,25.22,29,15.04,31,20.12,29,87.39,31,46.10,31,67.74,31,2.12,31,5.54,31,1005.78,31,814.43,31,14.72,31,1686.18,22,12.30,31,15.36,31,13.68,31,24.33,31,159.10,31,17.20,31,2025-07-31,122.83,132.63,132.71,132.71,132.79,114.06,127.20,119.10,153.34,179.71,164.84,111.58,132.51,190.40,143.48,106.32,121.51,108.72,129.52
13453,2025,8,MASA,Mazandaran,40735,Siahbisheh,36.23,51.30,1855.40,27.47,30,16.04,31,21.76,30,82.10,31,37.42,31,59.35,31,2.38,31,7.30,29,1007.41,31,816.65,31,14.01,31,1837.62,13,11.19,31,15.60,31,14.29,31,26.85,31,200.30,31,9.20,31,2025-08-31,132.17,152.33,152.23,152.23,136.99,121.67,142.22,115.78,149.96,232.97,183.01,122.99,152.72,185.58,145.12,126.03,130.76,119.73,126.24


<div style="color:red">
        <span><b>Write to Database</b></span>
</div>

In [20]:
# Persist the monthly records and the station catalogue to SQLite.
# Existing tables with the same names are replaced.
DATABASE_PATH = "../database/database.db"
conn = sqlite3.connect(DATABASE_PATH)

try:
    data.to_sql('ground_data_monthly', conn, if_exists='replace', index=False)
    geo_info.to_sql('ground_data_geoinfo', conn, if_exists='replace', index=False)
    conn.commit()
finally:
    # Close the connection even when a write fails, so the database file is not left open
    conn.close()