# Load Libraries

In [1]:
import os
import warnings
import logging
import sys
import pickle
import numpy as np
import pandas as pd
import geopandas as gpd
import dotenv

# Pull settings from a .env file into the process environment.
dotenv.load_dotenv()

# Emit INFO-level (and above) messages from the root logger.
logging.basicConfig(level=logging.INFO)

# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

# pandas display preferences: show all columns, two-decimal floats,
# and never truncate cell contents.
_display_options = {
    'display.max_columns': None,
    'display.float_format': '{:.2f}'.format,
    'display.max_colwidth': None,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

# Load Data

In [2]:
# Resolve the dataset locations from environment variables.
DATA_FOLDER_PATH = os.getenv('DATA_FOLDER_PATH')
DATA_NAME = os.getenv('DATA_NAME')
GEOINFO_NAME = os.getenv('GEOINFO_NAME')

if not DATA_FOLDER_PATH or not DATA_NAME or not GEOINFO_NAME:
    logging.error("Environment variables DATA_FOLDER_PATH, DATA_NAME, or GEOINFO_NAME are not set.")
    # Raise a real exception rather than calling sys.exit(1): inside a notebook
    # kernel SystemExit is reported as a terse "To exit: use 'exit'..." warning,
    # which hides what actually went wrong.
    _missing_vars = [
        _var_name
        for _var_name, _var_value in (
            ('DATA_FOLDER_PATH', DATA_FOLDER_PATH),
            ('DATA_NAME', DATA_NAME),
            ('GEOINFO_NAME', GEOINFO_NAME),
        )
        if not _var_value
    ]
    raise EnvironmentError(
        "Missing or empty environment variable(s): " + ", ".join(_missing_vars)
    )

# Build the full file paths: <folder>/<name>.parquet and <folder>/<name>.geojson
DATA_PATH = os.path.join(DATA_FOLDER_PATH, DATA_NAME + '.parquet')
GEOINFO_PATH = os.path.join(DATA_FOLDER_PATH, GEOINFO_NAME + '.geojson')

# Load the tabular data (Parquet, via pandas).
try:
    df = pd.read_parquet(DATA_PATH)
except Exception as e:
    logging.error(f"Error loading data from {DATA_PATH}: {e}")
    # Re-raise so the original traceback is shown and "Run All" stops here.
    raise
else:
    logging.info(f"Data loaded successfully from {DATA_PATH}")

# Load the geographic information (GeoJSON, via geopandas).
try:
    geoinfo = gpd.read_file(GEOINFO_PATH)
except Exception as e:
    logging.error(f"Error loading geoinfo from {GEOINFO_PATH}: {e}")
    # Re-raise for the same reason as above: keep the root cause visible.
    raise
else:
    logging.info(f"Geoinfo loaded successfully from {GEOINFO_PATH}")

INFO:root:Data loaded successfully from /mnt/c/Users/Pooya/Dropbox/IRIMO/Export/Iran_Data_1982_2023/Iran_Data_1982_2023.parquet
INFO:root:Geoinfo loaded successfully from /mnt/c/Users/Pooya/Dropbox/IRIMO/Export/Iran_Data_1982_2023/Iran_GeoInfo_1982_2023.geojson


In [3]:
# Print a summary of the loaded frame (index range, columns, dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3613873 entries, 0 to 3613872
Data columns (total 55 columns):
 #   Column             Dtype         
---  ------             -----         
 0   station_id         category      
 1   station_name       category      
 2   region_id          category      
 3   region_name        category      
 4   lat                float64       
 5   lon                float64       
 6   station_elevation  float64       
 7   date               datetime64[ns]
 8   vvmin              float64       
 9   ff_max             float64       
 10  ffm                float64       
 11  ff_gust_max        float64       
 12  dd_gust_max        float64       
 13  tmax               float64       
 14  tmin               float64       
 15  tm                 float64       
 16  pmin               float64       
 17  pm                 float64       
 18  p0m                float64       
 19  ewm                float64       
 20  dj18               float

In [10]:
# Station / region identification and location columns.
_station_columns = [
    'station_id',
    'station_name',
    'region_id',
    'region_name',
    'lat',
    'lon',
    'station_elevation',
]

# Observed variables retained for the analysis subset.
_variable_columns = [
    'tmax',
    'tmin',
    'tm',
    'umax',
    'umin',
    'um',
    'ffm',
    'sshn',
    'rrr24',
]

# Final column order: station columns, then the date, then the variables.
selected_columns = _station_columns + ['date'] + _variable_columns

# Complete-case subset: keep only rows with no missing value in any selected
# column. Selecting with a list yields a new frame, so `df` is left untouched.
sample = df[selected_columns].dropna()

In [11]:
# Print a summary of the complete-case subset (row count, dtypes, memory usage).
sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2637122 entries, 90 to 3613804
Data columns (total 17 columns):
 #   Column             Dtype         
---  ------             -----         
 0   station_id         category      
 1   station_name       category      
 2   region_id          category      
 3   region_name        category      
 4   lat                float64       
 5   lon                float64       
 6   station_elevation  float64       
 7   date               datetime64[ns]
 8   tmax               float64       
 9   tmin               float64       
 10  tm                 float64       
 11  umax               float64       
 12  umin               float64       
 13  um                 float64       
 14  ffm                float64       
 15  sshn               float64       
 16  rrr24              float64       
dtypes: category(4), datetime64[ns](1), float64(12)
memory usage: 296.8 MB


In [12]:
# Calculate the number of complete-case records per station.
#
# `station_id` is categorical, so value_counts() also reports stations that
# have no rows left in `sample` (they appear with count == 0).
#
# The column names are set explicitly: before pandas 2.0,
# value_counts().reset_index() produced columns named "index"/"station_id"
# rather than "station_id"/"count", which would make the "count" lookup below
# fail with a KeyError.
DAYS_PER_YEAR = 365  # approximation; leap days are ignored

station_record_counts = (
    sample['station_id']
    .value_counts()
    .rename_axis('station_id')
    .reset_index(name='count')
)

# Approximate length of record in whole years.
# NOTE(review): presumably one row per station per day — confirm.
station_record_counts['year'] = station_record_counts['count'] // DAYS_PER_YEAR

station_record_counts


Unnamed: 0,station_id,count,year
0,40706,20904,57
1,40712,19385,53
2,40858,18258,50
3,40703,17203,47
4,40738,15581,42
...,...,...,...
443,19958,0,0
444,40884,0,0
445,40885,0,0
446,88102,0,0
