<div style="color:green">
    <center>
        <h1><b>IRIMO Dataset</b></h1>
    </center>
</div>

<div style="color:orange">
        <h3><b>Import Libraries</b></h3>
</div>

In [175]:
import os
import sys
import pickle
import warnings
import logging
import sqlite3
import numpy as np
import pandas as pd
import geopandas as gpd
from tabulate import tabulate
from tqdm import tqdm
from pyproj import CRS
import plotly.express as px
import dotenv

# Load environment variables from a .env file so that the os.getenv() lookups
# in the "Load Data" cell (DATA_FOLDER_PATH, DATA_NAME, GEOINFO_NAME) can resolve.
dotenv.load_dotenv()

# Logging: configure the root logger at INFO level
logging.basicConfig(level=logging.INFO)

# Warnings: silence every warning for the rest of the session
warnings.filterwarnings("ignore")

# Pandas display: no column cap, two-decimal floats, untruncated cell text
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.max_colwidth = None

<div style="color:orange">
        <h3><b>Load Data</b></h3>
</div>

In [176]:
# Resolve the dataset location from environment variables
DATA_FOLDER_PATH = os.getenv('DATA_FOLDER_PATH')
DATA_NAME = os.getenv('DATA_NAME')
GEOINFO_NAME = os.getenv('GEOINFO_NAME')

# Name exactly which variables are unset/empty so the log is actionable
_missing_env_vars = [
    name
    for name, value in (
        ('DATA_FOLDER_PATH', DATA_FOLDER_PATH),
        ('DATA_NAME', DATA_NAME),
        ('GEOINFO_NAME', GEOINFO_NAME),
    )
    if not value
]
if _missing_env_vars:
    logging.error(f"Required environment variables are not set: {', '.join(_missing_env_vars)}")
    sys.exit(1)

# Construct full file paths (the names in the environment carry no extension)
DATA_PATH = os.path.join(DATA_FOLDER_PATH, DATA_NAME + '.parquet')
GEOINFO_PATH = os.path.join(DATA_FOLDER_PATH, GEOINFO_NAME + '.geojson')


def _load_or_exit(reader, path, label):
    """Read ``path`` with ``reader``; log the outcome and exit with status 1 on failure.

    Parameters
    ----------
    reader : callable
        One-argument loader, called as ``reader(path)``.
    path : str
        File to load.
    label : str
        Human-readable dataset name used in the log messages.

    Returns
    -------
    Whatever ``reader`` returns.

    Raises
    ------
    SystemExit
        With code 1 if ``reader`` raises any ``Exception``.
    """
    try:
        loaded = reader(path)
    except Exception as e:
        # logging.exception keeps the traceback that a plain error() call drops
        logging.exception(f"Error loading {label.lower()} from {path}: {e}")
        sys.exit(1)
    logging.info(f"{label} loaded successfully from {path}")
    return loaded


# Parquet via pandas, GeoJSON via geopandas
data = _load_or_exit(pd.read_parquet, DATA_PATH, "Data")
geoinfo = _load_or_exit(gpd.read_file, GEOINFO_PATH, "Geoinfo")

INFO:root:Data loaded successfully from C:/Users/Pooya/Dropbox/IRIMO/Export/Iran_Data_1951_2025\IRIMO_Data.parquet
INFO:root:Geoinfo loaded successfully from C:/Users/Pooya/Dropbox/IRIMO/Export/Iran_Data_1951_2025\IRIMO_GeoInfo.geojson


In [177]:
# Print a structural summary of the freshly loaded frame (entries, columns, dtypes).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4928061 entries, 0 to 4928060
Data columns (total 55 columns):
 #   Column             Dtype         
---  ------             -----         
 0   station_id         category      
 1   station_name       category      
 2   region_id          category      
 3   region_name        category      
 4   lat                float64       
 5   lon                float64       
 6   station_elevation  float64       
 7   date               datetime64[ns]
 8   ff_max             float64       
 9   dd_max             float64       
 10  ffm                float64       
 11  tmax               float64       
 12  tmin               float64       
 13  tm                 float64       
 14  pm                 float64       
 15  p0m                float64       
 16  ewm                float64       
 17  rrr24              float64       
 18  nm                 float64       
 19  nmax               float64       
 20  essmax             float

<div style="color:orange">
        <h3><b>Data Cleansing</b></h3>
</div>

<div style="color:red">
        <span><b>Show the columns of the dataset</b></span>
</div>

In [178]:
# One-column GitHub-style table with a row per dataset column
column_rows = [[column_name] for column_name in data.columns]
print(tabulate(column_rows, headers=["Column Name"], tablefmt="github"))

| Column Name       |
|-------------------|
| station_id        |
| station_name      |
| region_id         |
| region_name       |
| lat               |
| lon               |
| station_elevation |
| date              |
| ff_max            |
| dd_max            |
| ffm               |
| tmax              |
| tmin              |
| tm                |
| pm                |
| p0m               |
| ewm               |
| rrr24             |
| nm                |
| nmax              |
| essmax            |
| umax              |
| umin              |
| um                |
| sshn              |
| radglo24          |
| evt               |
| td_m              |
| twet_m            |
| ss24              |
| tsoil_m           |
| tsoil_min         |
| ewsm              |
| vvmin             |
| ff_gust_max       |
| dd_gust_max       |
| pmin              |
| rrrtmax           |
| nsn               |
| nrasn             |
| ngf               |
| nshra             |
| ndu               |
| nbdu    

<div style="color:red">
        <span><b>Show all provinces</b></span>
</div>

In [179]:
# One-column GitHub-style table with a row per distinct region_name value
province_rows = [[province] for province in data.region_name.unique()]
print(tabulate(province_rows, headers=["Provinces Name"], tablefmt="github"))

| Provinces Name            |
|---------------------------|
| Airforce                  |
| Alborz                    |
| Ardebil                   |
| Azarbayjan-E-Gharbi       |
| Azarbayjan-E-Sharghi      |
| Bushehr                   |
| Chaharmahal Va Bakhtiari  |
| Esfahan                   |
| Fars                      |
| Gilan                     |
| Golestan                  |
| Hamedan                   |
| Hormozgan                 |
| Ilam                      |
| Kerman                    |
| Kermanshah                |
| Khohgiluyeh Va Boyerahmad |
| Khorasan Razavi           |
| Khuzestan                 |
| Kordestan                 |
| Lorestan                  |
| Markazi                   |
| Mazandaran                |
| North Khorasan            |
| Qazvin                    |
| Qom                       |
| Semnan                    |
| Sistan Va Baluchestan     |
| South Khorasan            |
| Tehran                    |
| Yazd                      |
| Zanjan  

<div style="color:red">
        <span><b>Filter Data</b></span>
</div>

In [180]:
# Regions to keep (matched against the raw `region_name` column)
selected_regions = ["Mazandaran"]

# Columns to keep, in output order
selected_columns = [
    'region_id', 'region_name',
    'station_id', 'station_name',
    'lat', 'lon', 'station_elevation',
    'date',
    'tmax', 'tmin', 'tm',
    'rrr24',
]

# Row filter by region, then column subset, then a fresh 0..n-1 index
in_selected_regions = data["region_name"].isin(selected_regions)
data = (
    data.loc[in_selected_regions]
    .filter(items=selected_columns)
    .copy()
    .reset_index(drop=True)
)

def remove_unused_if_cat(s):
    """Drop unused categories from a categorical Series; return others unchanged.

    Parameters
    ----------
    s : pandas.Series
        Any Series (used with ``DataFrame.apply``, which passes one column at a time).

    Returns
    -------
    pandas.Series
        A new Series without unused categories when ``s`` is categorical,
        otherwise ``s`` itself.
    """
    # isinstance on the dtype replaces pd.api.types.is_categorical_dtype,
    # which is deprecated in recent pandas releases.
    if isinstance(s.dtype, pd.CategoricalDtype):
        return s.cat.remove_unused_categories()
    return s

# Run remove_unused_if_cat over every column of the filtered frame
data = data.apply(remove_unused_if_cat)

# Raw field name -> descriptive column name
column_labels = {
    "region_id": "Province_ID",
    "region_name": "Province",
    "station_id": "Station_ID",
    "station_name": "Station_Name",
    "lat": "Station_Latitude",
    "lon": "Station_Longitude",
    "station_elevation": "Station_Elevation",
    "date": "Date",
    "tmax": "Temperature_Maximum",
    "tmin": "Temperature_Minimum",
    "tm": "Temperature",
    "rrr24": "Precipitation",
}
data = data.rename(columns=column_labels)

# Provinces that remain after filtering
province_rows = [[province] for province in data.Province.unique()]
print(tabulate(province_rows, headers=["Provinces Name"], tablefmt="github"))

print("\n")

# Columns that remain after filtering and renaming
column_rows = [[column_name] for column_name in data.columns]
print(tabulate(column_rows, headers=["Column Name"], tablefmt="github"))

| Provinces Name   |
|------------------|
| Mazandaran       |


| Column Name         |
|---------------------|
| Province_ID         |
| Province            |
| Station_ID          |
| Station_Name        |
| Station_Latitude    |
| Station_Longitude   |
| Station_Elevation   |
| Date                |
| Temperature_Maximum |
| Temperature_Minimum |
| Temperature         |
| Precipitation       |


In [181]:
# Remove fully duplicated rows and renumber the index from 0
data = data.drop_duplicates().reset_index(drop=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186076 entries, 0 to 186075
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   Province_ID          186076 non-null  category      
 1   Province             186076 non-null  category      
 2   Station_ID           186076 non-null  category      
 3   Station_Name         186076 non-null  category      
 4   Station_Latitude     186076 non-null  float64       
 5   Station_Longitude    186076 non-null  float64       
 6   Station_Elevation    186076 non-null  float64       
 7   Date                 186076 non-null  datetime64[ns]
 8   Temperature_Maximum  184629 non-null  float64       
 9   Temperature_Minimum  184500 non-null  float64       
 10  Temperature          183250 non-null  float64       
 11  Precipitation        185851 non-null  float64       
dtypes: category(4), datetime64[ns](1), float64(7)
memory usage: 12.1 MB


<div style="color:orange">
        <h3><b>Data Quality Control</b></h3>
</div>

In [182]:
def data_quality_control(
    df,
    required_vars=('Temperature',),
    limits=None,
):
    """Replace out-of-range measurements with NaN.

    For every variable that is both listed in ``required_vars`` and present
    as a column of ``df``, values strictly below the lower bound or strictly
    above the upper bound are set to ``np.nan``. Values equal to a bound are
    kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to check. It is modified IN PLACE and also returned.
    required_vars : iterable of str, default ('Temperature',)
        Names of the variables to check.
    limits : dict[str, tuple[float, float]] or None, default None
        Optional ``{column: (lower, upper)}`` entries that override or extend
        the built-in bounds. With ``None`` the built-in bounds are used as is.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    # Built-in plausibility bounds, expressed in whatever units the columns use
    valid_ranges = {
        'Temperature': (-60, 80),
        'Temperature_Maximum': (-60, 80),
        'Temperature_Minimum': (-60, 80),
        'Precipitation': (0, 1000),
    }
    if limits:
        valid_ranges.update(limits)

    for column, (lower, upper) in valid_ranges.items():
        if column in required_vars and column in df.columns:
            out_of_range = (df[column] < lower) | (df[column] > upper)
            df.loc[out_of_range, column] = np.nan
    return df


# Range-check all four measurement columns, then review the non-null counts
qc_variables = ['Temperature', 'Temperature_Maximum', 'Temperature_Minimum', 'Precipitation']
data = data_quality_control(data, required_vars=qc_variables)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186076 entries, 0 to 186075
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   Province_ID          186076 non-null  category      
 1   Province             186076 non-null  category      
 2   Station_ID           186076 non-null  category      
 3   Station_Name         186076 non-null  category      
 4   Station_Latitude     186076 non-null  float64       
 5   Station_Longitude    186076 non-null  float64       
 6   Station_Elevation    186076 non-null  float64       
 7   Date                 186076 non-null  datetime64[ns]
 8   Temperature_Maximum  184629 non-null  float64       
 9   Temperature_Minimum  184500 non-null  float64       
 10  Temperature          183250 non-null  float64       
 11  Precipitation        185851 non-null  float64       
dtypes: category(4), datetime64[ns](1), float64(7)
memory usage: 12.1 MB


<div style="color:orange">
        <h3><b>Fill Missing Dates</b></h3>
</div>

In [183]:
# Distinct station descriptors; any row shown below is a (Province, Station_Name)
# pair that appears with more than one descriptor combination.
station_descriptor_columns = [
    'Province_ID', 'Province', 'Station_Name', 'Station_ID',
    'Station_Latitude', 'Station_Longitude', 'Station_Elevation',
]
d = data[station_descriptor_columns].drop_duplicates().reset_index(drop=True)
d.loc[d.duplicated(subset=['Province', 'Station_Name'], keep=False)]

Unnamed: 0,Province_ID,Province,Station_Name,Station_ID,Station_Latitude,Station_Longitude,Station_Elevation


In [184]:
# Continuous daily calendar from the first to the last observation in `data`
all_dates = pd.date_range(data['Date'].min(), data['Date'].max(), freq='D')

# Columns that identify one station (the grouping key)
STATION_KEY_COLUMNS = ['Province_ID', 'Province', 'Station_ID', 'Station_Name']

# Per-station constants re-filled on the rows that reindexing creates
STATION_CONSTANT_COLUMNS = STATION_KEY_COLUMNS + [
    'Station_Elevation', 'Station_Latitude', 'Station_Longitude',
]


def reindex_station(station_df, dates=None):
    """Expand one station's records onto a full date calendar.

    Rows are de-duplicated on ``Date`` (first occurrence kept), reindexed onto
    ``dates`` so that missing dates appear as rows with NaN measurements, and
    the station descriptor columns are filled on every row with the values
    from the station's first record.

    Parameters
    ----------
    station_df : pandas.DataFrame
        Records of a single station; must contain ``Date`` and every column
        in ``STATION_CONSTANT_COLUMNS``.
    dates : pandas.DatetimeIndex or None, default None
        Target calendar. ``None`` uses the notebook-level ``all_dates``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``dates``, with ``Date`` as a regular column.
    """
    if dates is None:
        dates = all_dates
    station_constants = {
        column: station_df[column].iloc[0] for column in STATION_CONSTANT_COLUMNS
    }
    return (
        station_df.drop_duplicates('Date')
        .set_index('Date')
        .reindex(dates)
        .assign(**station_constants)
        .reset_index()
        .rename(columns={'index': 'Date'})
    )


# Iterate over the groups explicitly instead of using groupby.apply: the
# function reads the grouping columns, which apply() is deprecated from passing
# in. observed=True restricts categorical keys to combinations present in data.
station_frames = []
station_groups = data.groupby(STATION_KEY_COLUMNS, observed=True)
for n, (station_key, station_df) in enumerate(station_groups, start=1):
    print(f"{n}: Reindexing station {station_df['Province'].iloc[0]} - {station_df['Station_Name'].iloc[0]} - {station_df['Station_Elevation'].iloc[0]} - {station_df['Station_Latitude'].iloc[0]} - {station_df['Station_Longitude'].iloc[0]}")
    station_frames.append(reindex_station(station_df))

daily_data = pd.concat(station_frames, ignore_index=True)

1: Reindexing station Mazandaran - Tonekabon - 0.0 - 36.81 - 50.9
2: Reindexing station Mazandaran - Jouybar - -17.0 - 36.69139 - 52.89667
3: Reindexing station Mazandaran - Bisheh Bone - 1440.0 - 36.567 - 53.816
4: Reindexing station Mazandaran - Kelardasht - 1200.0 - 36.55 - 51.157
5: Reindexing station Mazandaran - Babolkenar - 91.0 - 36.35 - 52.717
6: Reindexing station Mazandaran - Dalir - 1995.0 - 36.32306 - 51.10194
7: Reindexing station Mazandaran - Shayadehband Pay - 213.0 - 36.317 - 52.553
8: Reindexing station Mazandaran - Ramsar - -20.0 - 36.905 - 50.684
9: Reindexing station Mazandaran - Nowshahr - -20.9 - 36.661 - 51.466
10: Reindexing station Mazandaran - Siahbisheh - 1855.4 - 36.231 - 51.303
11: Reindexing station Mazandaran - Babolsar - -21.0 - 36.699 - 52.643
12: Reindexing station Mazandaran - Gharakhil - 14.7 - 36.487 - 52.108
13: Reindexing station Mazandaran - Sari - 23.0 - 36.536 - 52.998
14: Reindexing station Mazandaran - Kiyasar - 1294.3 - 36.248 - 53.546
15: 

In [185]:
# Print a structural summary of the reindexed daily frame (row count, non-null counts, dtypes).
daily_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 655032 entries, 0 to 655031
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   Date                 655032 non-null  datetime64[ns]
 1   Province_ID          655032 non-null  object        
 2   Province             655032 non-null  object        
 3   Station_ID           655032 non-null  object        
 4   Station_Name         655032 non-null  object        
 5   Station_Latitude     655032 non-null  float64       
 6   Station_Longitude    655032 non-null  float64       
 7   Station_Elevation    655032 non-null  float64       
 8   Temperature_Maximum  184629 non-null  float64       
 9   Temperature_Minimum  184500 non-null  float64       
 10  Temperature          183250 non-null  float64       
 11  Precipitation        185851 non-null  float64       
dtypes: datetime64[ns](1), float64(7), object(4)
memory usage: 60.0+ MB


In [186]:
# Display the daily calendar that each station was reindexed onto.
all_dates

DatetimeIndex(['1951-01-01 12:00:00', '1951-01-02 12:00:00',
               '1951-01-03 12:00:00', '1951-01-04 12:00:00',
               '1951-01-05 12:00:00', '1951-01-06 12:00:00',
               '1951-01-07 12:00:00', '1951-01-08 12:00:00',
               '1951-01-09 12:00:00', '1951-01-10 12:00:00',
               ...
               '2025-09-12 12:00:00', '2025-09-13 12:00:00',
               '2025-09-14 12:00:00', '2025-09-15 12:00:00',
               '2025-09-16 12:00:00', '2025-09-17 12:00:00',
               '2025-09-18 12:00:00', '2025-09-19 12:00:00',
               '2025-09-20 12:00:00', '2025-09-21 12:00:00'],
              dtype='datetime64[ns]', length=27293, freq='D')

<div style="color:orange">
        <h3><b>Monthly Data</b></h3>
</div>

In [187]:
# Calendar keys for the monthly roll-up (stored on daily_data itself)
date_accessor = daily_data['Date'].dt
daily_data["Year"] = date_accessor.year
daily_data["Month"] = date_accessor.month

monthly_keys = ['Year', 'Month', 'Province_ID', 'Province', 'Station_ID', 'Station_Name']

# Named aggregation yields the final flat column names directly:
# station descriptors -> first value, temperatures -> mean, precipitation -> sum,
# plus a *_count column with the number of non-null daily values per month.
monthly_data = (
    daily_data.groupby(monthly_keys)
    .agg(
        Station_Latitude=('Station_Latitude', 'first'),
        Station_Longitude=('Station_Longitude', 'first'),
        Station_Elevation=('Station_Elevation', 'first'),
        Temperature_Maximum=('Temperature_Maximum', 'mean'),
        Temperature_Maximum_count=('Temperature_Maximum', 'count'),
        Temperature_Minimum=('Temperature_Minimum', 'mean'),
        Temperature_Minimum_count=('Temperature_Minimum', 'count'),
        Temperature=('Temperature', 'mean'),
        Temperature_count=('Temperature', 'count'),
        Precipitation=('Precipitation', 'sum'),
        Precipitation_count=('Precipitation', 'count'),
    )
    .reset_index()
    .round(2)
    .sort_values(by=['Province', 'Station_Name', 'Station_ID', 'Year', 'Month'])
    .reset_index(drop=True)
)

monthly_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21528 entries, 0 to 21527
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Year                       21528 non-null  int32  
 1   Month                      21528 non-null  int32  
 2   Province_ID                21528 non-null  object 
 3   Province                   21528 non-null  object 
 4   Station_ID                 21528 non-null  object 
 5   Station_Name               21528 non-null  object 
 6   Station_Latitude           21528 non-null  float64
 7   Station_Longitude          21528 non-null  float64
 8   Station_Elevation          21528 non-null  float64
 9   Temperature_Maximum        6123 non-null   float64
 10  Temperature_Maximum_count  21528 non-null  int64  
 11  Temperature_Minimum        6124 non-null   float64
 12  Temperature_Minimum_count  21528 non-null  int64  
 13  Temperature                6123 non-null   flo

In [188]:
# Months with no precipitation observations (Precipitation_count == 0) get NaN
# instead of a numeric total.
no_precipitation_obs = monthly_data['Precipitation_count'].eq(0)
monthly_data['Precipitation'] = monthly_data['Precipitation'].mask(no_precipitation_obs)

<div style="color:orange">
        <h3><b>Extract GeoInfo</b></h3>
</div>

<div style="color:red">
        <span><b>Stations</b></span>
</div>

In [189]:
# One row per distinct station descriptor combination found in `data`
station_columns = [
    "Province_ID", "Province",
    "Station_ID", "Station_Name",
    "Station_Latitude", "Station_Longitude", "Station_Elevation",
]
geo_info = data[station_columns].drop_duplicates().reset_index(drop=True)
geo_info

Unnamed: 0,Province_ID,Province,Station_ID,Station_Name,Station_Latitude,Station_Longitude,Station_Elevation
0,MASA,Mazandaran,99361,Alasht,36.07,52.84,1805.0
1,MASA,Mazandaran,99309,Amol,36.48,52.47,23.7
2,MASA,Mazandaran,18420,Babolkenar,36.35,52.72,91.0
3,MASA,Mazandaran,40736,Babolsar,36.7,52.64,-21.0
4,MASA,Mazandaran,99357,Baladeh,36.2,51.8,2120.0
5,MASA,Mazandaran,99306,Bandar-E-Amirabad,36.86,53.39,-20.0
6,MASA,Mazandaran,18389,Bisheh Bone,36.57,53.82,1440.0
7,MASA,Mazandaran,18423,Dalir,36.32,51.1,1995.0
8,MASA,Mazandaran,99299,Galugah,36.74,53.84,-10.0
9,MASA,Mazandaran,40737,Gharakhil,36.49,52.11,14.7


In [190]:
# Build point geometries (x = longitude, y = latitude) for every station and
# tag the frame with EPSG:4326.
geo_info_gpd = gpd.GeoDataFrame(
    data=geo_info,
    geometry=gpd.points_from_xy(geo_info["Station_Longitude"], geo_info["Station_Latitude"]),
    crs=CRS("EPSG:4326"),
)

# Keep the sorted result: the previous form sorted a temporary copy and reset
# its index in place, so geo_info_gpd itself was never reordered.
geo_info_gpd = (
    geo_info_gpd
    .sort_values(by=["Province", "Station_Name"])
    .reset_index(drop=True)
)

geo_info_gpd

Unnamed: 0,Province_ID,Province,Station_ID,Station_Name,Station_Latitude,Station_Longitude,Station_Elevation,geometry
0,MASA,Mazandaran,99361,Alasht,36.07,52.84,1805.0,POINT (52.843 36.071)
1,MASA,Mazandaran,99309,Amol,36.48,52.47,23.7,POINT (52.468 36.479)
2,MASA,Mazandaran,18420,Babolkenar,36.35,52.72,91.0,POINT (52.717 36.35)
3,MASA,Mazandaran,40736,Babolsar,36.7,52.64,-21.0,POINT (52.643 36.699)
4,MASA,Mazandaran,99357,Baladeh,36.2,51.8,2120.0,POINT (51.801 36.198)
5,MASA,Mazandaran,99306,Bandar-E-Amirabad,36.86,53.39,-20.0,POINT (53.386 36.856)
6,MASA,Mazandaran,18389,Bisheh Bone,36.57,53.82,1440.0,POINT (53.816 36.567)
7,MASA,Mazandaran,18423,Dalir,36.32,51.1,1995.0,POINT (51.10194 36.32306)
8,MASA,Mazandaran,99299,Galugah,36.74,53.84,-10.0,POINT (53.837 36.738)
9,MASA,Mazandaran,40737,Gharakhil,36.49,52.11,14.7,POINT (52.108 36.487)


<div style="color:red">
        <span><b>Map</b></span>
</div>

In [191]:
def _station_hover_html(row):
    """Return the HTML tooltip text for one station row."""
    return f"""
    Station Name: <b>{row['Station_Name']}</b><br>
    Station ID: <b>{row['Station_ID']}</b><br>
    Province: <b>{row['Province']}</b><br>
    Latitude: <b>{row['Station_Latitude']}</b><br>
    Longitude: <b>{row['Station_Longitude']}</b><br>
    Elevation: <b>{row['Station_Elevation']} m</b><br>
    """


# Work on a copy so the tooltip column does not end up in geo_info_gpd
geo_info_map = geo_info_gpd.copy()
geo_info_map['Hover_Info'] = geo_info_map.apply(_station_hover_html, axis=1)

# One marker per station, coloured by province
fig = px.scatter_mapbox(
    geo_info_map,
    lat=geo_info_map.geometry.y,
    lon=geo_info_map.geometry.x,
    color="Province",
    hover_name=geo_info_map['Station_Name'],
    hover_data={'Station_ID': True, 'Station_Name': False, 'Hover_Info': False},
    custom_data=['Hover_Info'],
    zoom=6,
    height=600,
    width=1000,
)

# Base map, margins, tooltip look and legend box
legend_font = {"family": "Times New Roman", "size": 14, "color": "black"}
fig.update_layout(
    mapbox_style="open-street-map",
    margin=dict(r=0, t=0, l=0, b=0),
    hoverlabel={
        "bgcolor": "white",
        "font_size": 16,
        "font_family": "Arial",
        "align": "left",
    },
    legend={
        "yanchor": "top",
        "y": 0.98,
        "xanchor": "left",
        "x": 0.01,
        "title": "",
        "traceorder": "normal",
        "title_font_family": "Times New Roman",
        "font": legend_font,
        "bgcolor": "snow",
        "bordercolor": "Black",
        "borderwidth": 0.5,
    },
)

# The tooltip shows only the prebuilt Hover_Info HTML (first custom_data entry)
fig.update_traces(
    marker={"size": 20},
    hovertemplate="%{customdata[0]}",
)

# Show the plot
fig.show()

<div style="color:red">
        <span><b>Export GeoInfo</b></span>
</div>

In [192]:
GEO_DATA_PATH = "./assets/geo_data"

# Create the output directory up front so a fresh checkout does not fail on write
os.makedirs(GEO_DATA_PATH, exist_ok=True)

# Shorten the station column names for export; "Province" keeps its name.
# NOTE(review): "Province_ID" is 11 characters and is not shortened here, while
# shapefile (DBF) field names are limited to 10 — presumably the driver
# truncates it; confirm and add a short alias if downstream readers need one.
geo_info_to_file = geo_info_gpd.rename(
    columns={
        "Station_ID": "St_ID",
        "Station_Name": "St_Name",
        "Station_Latitude": "St_Lat",
        "Station_Longitude": "St_Lon",
        "Station_Elevation": "St_Ele",
    }
)

# Write the same layer once per output format
for file_extension, driver_name in (("geojson", "GeoJSON"), ("shp", "ESRI Shapefile")):
    geo_info_to_file.to_file(
        filename=f'{GEO_DATA_PATH}/StationsIRIMO.{file_extension}',
        driver=driver_name,
        index=False,
    )

INFO:pyogrio._io:Created 24 records


INFO:pyogrio._io:Created 24 records


<div style="color:orange">
        <h3><b>Database</b></h3>
</div>

<div style="color:red">
        <span><b>Write to Database</b></span>
</div>

In [193]:
DATABASE_PATH = "./database/database.db"

# sqlite3.connect does not create missing parent directories
os.makedirs(os.path.dirname(DATABASE_PATH), exist_ok=True)

# Prepare the daily table without mutating in place, so the cell can be re-run:
#   * drop the helper Year/Month columns if they are present,
#   * truncate Date to midnight (same result as the strftime('%Y-%m-%d') round-trip),
#   * keep a clean RangeIndex WITHOUT adding an 'index' column — the previous
#     reset_index(inplace=True) wrote a spurious 'index' column to the table
#     and raised on a second run.
daily_data = (
    daily_data
    .drop(columns=['Year', 'Month'], errors='ignore')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']).dt.normalize())
    .reset_index(drop=True)
)

conn = sqlite3.connect(DATABASE_PATH)
try:
    daily_data.to_sql('ground_data_daily', conn, if_exists='replace', index=False)
    monthly_data.to_sql('ground_data_monthly', conn, if_exists='replace', index=False)
    geo_info.to_sql('ground_data_geoinfo', conn, if_exists='replace', index=False)
    conn.commit()
finally:
    # Always release the file handle, even if one of the writes fails
    conn.close()