# Load Libraries

In [1]:
import os
import warnings
import logging
import sys
import pickle
import numpy as np
import pandas as pd
import geopandas as gpd
import dotenv

# Load environment variables from a .env file; the os.getenv() lookups for the
# data folder and file names later in this notebook require them to be set
dotenv.load_dotenv()

# Root logger: emit messages at INFO level and above
logging.basicConfig(level=logging.INFO)

# Silence every warning category for the rest of the session
warnings.simplefilter("ignore")

# pandas display: show all columns, two-decimal floats, untruncated cell text
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.max_colwidth = None

# Load Data

In [2]:
# Define file paths using environment variables
DATA_FOLDER_PATH = os.getenv('DATA_FOLDER_PATH')
DATA_NAME = os.getenv('DATA_NAME')
GEOINFO_NAME = os.getenv('GEOINFO_NAME')

# Fail fast and report exactly which settings are missing (unset or empty).
# An exception is raised instead of calling sys.exit(): in a notebook it stops
# execution just the same, but keeps a readable error and traceback.
missing_env_vars = [
    name
    for name, value in (
        ('DATA_FOLDER_PATH', DATA_FOLDER_PATH),
        ('DATA_NAME', DATA_NAME),
        ('GEOINFO_NAME', GEOINFO_NAME),
    )
    if not value
]
if missing_env_vars:
    message = f"Environment variables not set: {', '.join(missing_env_vars)}"
    logging.error(message)
    raise RuntimeError(message)

# Construct full file paths (the env vars hold base names without extension)
DATA_PATH = os.path.join(DATA_FOLDER_PATH, DATA_NAME + '.parquet')
GEOINFO_PATH = os.path.join(DATA_FOLDER_PATH, GEOINFO_NAME + '.geojson')

# Load datasets. Failures are logged with the offending path and then
# re-raised so the original exception type and traceback are preserved.
try:
    # Read Parquet file with pandas
    data = pd.read_parquet(DATA_PATH)
except Exception as e:
    logging.error(f"Error loading data from {DATA_PATH}: {e}")
    raise
else:
    logging.info(f"Data loaded successfully from {DATA_PATH}")

try:
    # Read GeoJSON file with geopandas
    geoinfo = gpd.read_file(GEOINFO_PATH)
except Exception as e:
    logging.error(f"Error loading geoinfo from {GEOINFO_PATH}: {e}")
    raise
else:
    logging.info(f"Geoinfo loaded successfully from {GEOINFO_PATH}")

INFO:root:Data loaded successfully from /mnt/c/Users/Pooya/Dropbox/IRIMO/Export/Khorasan_Data_1950_2025/Khorasan_Data_1950_2025.parquet
INFO:root:Geoinfo loaded successfully from /mnt/c/Users/Pooya/Dropbox/IRIMO/Export/Khorasan_Data_1950_2025/Khorasan_GeoInfo.geojson


In [3]:
# Summarize the loaded frame: row count, column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 531359 entries, 0 to 531358
Data columns (total 54 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   station_id         531359 non-null  category      
 1   station_name       531359 non-null  category      
 2   region_id          531359 non-null  category      
 3   region_name        531359 non-null  category      
 4   lat                531359 non-null  float64       
 5   lon                531359 non-null  float64       
 6   station_elevation  531359 non-null  float64       
 7   date               531359 non-null  datetime64[ns]
 8   vvmin              375407 non-null  float64       
 9   ff_max             398375 non-null  float64       
 10  dd_max             398288 non-null  float64       
 11  ffm                398375 non-null  float64       
 12  ff_gust_max        17687 non-null   float64       
 13  dd_gust_max        17669 non-null   float64 

# Select Columns

In [4]:
# Columns carried into the working frame, in output order.
# Grouping notes are inferred from the column names only -- confirm against
# the dataset's data dictionary.
selected_columns = [
    # station identifiers and location
    'station_id', 'station_name', 'region_id', 'region_name',
    'lat', 'lon', 'station_elevation',
    # observation date
    'date',
    # measurement columns
    'tmax', 'tmin', 'tm',
    'umax', 'umin', 'um',
    'ffm', 'sshn', 'rrr24',
]

# Independent copy, so later edits to df never write back into `data`.
df = data.loc[:, selected_columns].copy()
# Rows with missing values are kept; the drop below is left disabled.
# df.dropna(subset=selected_columns, inplace=True)

In [5]:
# Check the column selection: retained columns with their non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 531359 entries, 0 to 531358
Data columns (total 17 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   station_id         531359 non-null  category      
 1   station_name       531359 non-null  category      
 2   region_id          531359 non-null  category      
 3   region_name        531359 non-null  category      
 4   lat                531359 non-null  float64       
 5   lon                531359 non-null  float64       
 6   station_elevation  531359 non-null  float64       
 7   date               531359 non-null  datetime64[ns]
 8   tmax               522407 non-null  float64       
 9   tmin               511729 non-null  float64       
 10  tm                 506243 non-null  float64       
 11  umax               394479 non-null  float64       
 12  umin               394473 non-null  float64       
 13  um                 394479 non-null  float64 

# Filter Data

In [6]:
# Optional date-range filter, currently disabled; uncomment to restrict the period.
# df = df.query('date >= "1983-01-01" and date <= "2022-12-31"')

# Renumber the rows 0..n-1. Reassignment is used instead of inplace=True so
# the step is an explicit transformation and stays chainable.
df = df.reset_index(drop=True)

In [7]:
# Calculate number of records per station.
# rename_axis() + reset_index(name=...) pin the output columns to
# 'station_id' and 'count' explicitly. A bare value_counts().reset_index()
# only yields a 'count' column on pandas >= 2.0; older versions name the
# columns differently, which would break the lookup below.
station_record_counts = (
    df['station_id']
    .value_counts()
    .rename_axis('station_id')
    .reset_index(name='count')
)

# Approximate record length in whole years, taking 365 records per year
# (assumes one record per station per day, leap days ignored -- confirm).
station_record_counts['year'] = station_record_counts['count'] // 365

station_record_counts


Unnamed: 0,station_id,count,year
0,40745,27272,74
1,40743,25902,70
2,40809,25628,70
3,40762,24294,66
4,40791,21758,59
...,...,...,...
78,19062,243,0
79,18606,235,0
80,18604,228,0
81,51202,101,0


# Export Data

In [8]:
# Serialize the working DataFrame to a pickle file in the data folder
with open('../../data/Khorasan_Data_1950_2025.pkl', mode='wb') as pickle_file:
    pickle.dump(df, pickle_file)