In [1]:
import numpy as np
import glob, os, sys
import copy
import pandas as pd
import datetime

In [2]:
def show_progress_bar(iteration, total, prefix='', suffix='', length=50, fill='█'):
    """
    Display a progress bar in the terminal.

    Parameters:
    - iteration (int): Current iteration (starting from 0).
    - total (int): Total number of iterations.
    - prefix (str): Prefix string (e.g., 'Progress:').
    - suffix (str): Suffix string (e.g., 'Complete').
    - length (int): Character length of the progress bar.
    - fill (str): Character to represent progress.
    """
    iteration += 1
    percent = f"{100 * (iteration / float(total)):.1f}"  # Calculate progress percentage
    filled_length = int(length * iteration // total)    # Calculate the filled length
    bar = fill * filled_length + '-' * (length - filled_length)  # Construct the bar
    # Print the bar
    sys.stdout.write(f'\r{prefix} |{bar}| {percent}% {suffix}')
    sys.stdout.flush()
    # Print a new line when the process is complete
    if iteration == total:
        print()

In [3]:
def read_correct_dates(rmm_file_name):
    nanalysis2write = 40
    nforecasts = 7
    date = os.path.basename(rmm_file_name).split('.')[2]
    df = pd.read_csv(rmm_file_name)

    # Define the 40th date
    target_date = pd.to_datetime(date, format="%Y%m%d")
    
    # Generate 47 continuous dates ensuring 40th element is "20240706"
    date_range = pd.date_range(start=target_date - pd.Timedelta(days=nanalysis2write-1), periods=nanalysis2write+nforecasts, freq="D")
    
    # Convert to YYYYMMDD format
    date_list = date_range.strftime("%Y%m%d").tolist()
    # Extract year, month, and day from date_list
    df["year"] = [int(date[:4]) for date in date_list]
    df["month"] = [int(date[4:6]) for date in date_list]
    df["day"] = [int(date[6:]) for date in date_list]
    
    # Convert 'year', 'month', 'day' to datetime (keep as datetime, not string)
    df['date'] = pd.to_datetime(df[['year', 'month', 'day']])
    return df
    

In [4]:
date = datetime.datetime(2024, 12, 1)
date_label = date.strftime("%Y%m%d")
date_label

'20241201'

In [29]:
# Generate date range from 2023-09-01 to 2025-03-10
date_range = pd.date_range(start="2023-09-01", end="2025-03-10", freq="D")
date_range

DatetimeIndex(['2023-09-01', '2023-09-02', '2023-09-03', '2023-09-04',
               '2023-09-05', '2023-09-06', '2023-09-07', '2023-09-08',
               '2023-09-09', '2023-09-10',
               ...
               '2025-03-01', '2025-03-02', '2025-03-03', '2025-03-04',
               '2025-03-05', '2025-03-06', '2025-03-07', '2025-03-08',
               '2025-03-09', '2025-03-10'],
              dtype='datetime64[ns]', length=557, freq='D')

In [5]:
data_dir = "/data/users/prince.xavier/SALMON/processed_SEA_data/mogreps/mjo/rmms/"
mogreps_mjo_archive_dir = os.path.join(data_dir, date_label)
mogreps_mjo_archive_dir

'/data/users/prince.xavier/SALMON/processed_SEA_data/mogreps/mjo/rmms/20241201'

In [6]:
rmm_file_names = glob.glob(os.path.join(data_dir, '*/createdPCs.15sn*.txt'))
rmm_file_names.sort()
rmm_file_names

In [12]:
# Initialize an empty list to store DataFrames
all_analysis_dfs = []
all_fcast_dfs = []
# Loop through each file
for i, file in enumerate(rmm_file_names):
    show_progress_bar(i+1, len(rmm_file_names))
    try:
        df = read_correct_dates(file)
        # Filter analysis data
        analysis_df = df.loc[df['label'] == 'analysis'].copy()  # Use .copy() to avoid SettingWithCopyWarning
        # Add lead column
        # Append to list
        all_analysis_dfs.append(analysis_df)

        # Filter forecast data
        fcast_df = df.loc[df['label'] == 'forecast'].copy()  # Use .copy() to avoid SettingWithCopyWarning
        # Add lead column
        fcast_df['lead'] = range(1, len(fcast_df) + 1)  
        # Append to list
        all_fcast_dfs.append(fcast_df)
    except:
        print(f'Problem in {file}')
        pass
print(len([df for df in all_analysis_dfs if df.empty]))
all_analysis_dfs = pd.concat(all_analysis_dfs, ignore_index=True)
all_fcast_dfs = pd.concat(all_fcast_dfs, ignore_index=True)

 |██████████████████████████████████████████████████| 100.0% 
 |██████████████████████████████████████████████████| 100.0% 0


In [13]:
all_analysis_dfs = all_analysis_dfs.drop_duplicates(subset=['date'], keep='first').reset_index(drop=True)

In [14]:
all_analysis_dfs

Unnamed: 0,year,month,day,rmm1,rmm2,phase,amp,label,date
0,2023,6,4,-1.154809,-1.090104,1,1.588052,analysis,2023-06-04
1,2023,6,5,-1.138868,-1.444139,2,1.839173,analysis,2023-06-05
2,2023,6,6,-0.962857,-1.840564,2,2.077202,analysis,2023-06-06
3,2023,6,7,-0.711015,-2.088823,2,2.206518,analysis,2023-06-07
4,2023,6,8,-0.137634,-2.044666,2,2.049293,analysis,2023-06-08
...,...,...,...,...,...,...,...,...,...
532,2025,3,14,-0.712349,-1.747279,2,1.886909,analysis,2025-03-14
533,2025,3,15,-0.672843,-1.404093,2,1.556983,analysis,2025-03-15
534,2025,3,16,-0.120758,-1.226622,2,1.232552,analysis,2025-03-16
535,2025,3,17,0.397550,-1.246329,3,1.308198,analysis,2025-03-17


In [15]:
all_analysis_dfs.loc[all_analysis_dfs.date == '2024-01-01']

Unnamed: 0,year,month,day,rmm1,rmm2,phase,amp,label,date
94,2024,1,1,0.258919,-1.175519,3,1.203696,analysis,2024-01-01


In [18]:
all_fcast_dfs.iloc[:10]

Unnamed: 0,year,month,day,rmm1,rmm2,phase,amp,label,date,lead
0,2023,7,14,1.22213,-1.142745,4,1.673161,forecast,2023-07-14,1
1,2023,7,15,1.303424,-0.852376,4,1.557388,forecast,2023-07-15,2
2,2023,7,16,1.527604,-0.492205,4,1.604943,forecast,2023-07-16,3
3,2023,7,17,1.487815,-0.529729,4,1.579305,forecast,2023-07-17,4
4,2023,7,18,1.514093,-0.37167,4,1.559043,forecast,2023-07-18,5
5,2023,7,19,1.370675,-0.115577,4,1.37554,forecast,2023-07-19,6
6,2023,7,20,1.208573,0.127391,5,1.215269,forecast,2023-07-20,7
7,2023,7,14,1.157585,-1.434117,3,1.843013,forecast,2023-07-14,1
8,2023,7,15,1.474871,-1.049398,4,1.810106,forecast,2023-07-15,2
9,2023,7,16,1.703644,-0.799136,4,1.88176,forecast,2023-07-16,3


In [34]:
# Filter all_analysis_dfs to keep only rows where 'date' exists in fc_dfs['date']
lead_time = 4
fc_dfs = all_fcast_dfs.loc[(all_fcast_dfs.lead == lead_time)]
print(len(fc_dfs))
matched_analysis_df = []
cols = all_analysis_dfs.columns
for index, row in fc_dfs.iterrows():
    mdf = all_analysis_dfs.loc[all_analysis_dfs['date'] == row['date']]
    if mdf.empty:
        mdf[cols] = None
    matched_analysis_df.append(mdf)
    
print(len(matched_analysis_df))

14364
14364


In [35]:
matched_analysis_df = pd.concat(matched_analysis_df, keys=range(len(matched_analysis_df)), ignore_index=True)

In [37]:
len(matched_analysis_df), len(fc_dfs)
#matched_analysis_df.iloc[10], fc_dfs.iloc[10]

(14148, 14364)

In [110]:
# day 3 forecast
t = 2
fc_data = fcast_df.iloc[t]
fc_date = pd.to_datetime(fc_data['date'])
fc_data

year           2024
month            12
day               3
rmm1       2.199826
rmm2       0.955104
phase             5
amp         2.39822
label      forecast
date     2024-12-03
Name: 41, dtype: object

In [111]:
an_file_name = glob.glob(os.path.join(data_dir, fc_date.strftime('%Y%m%d'), '*.txt'))[0]
an_file_name
adf = pd.read_csv(an_file_name)
adf['date'] = pd.to_datetime(adf[['year', 'month', 'day']]).dt.strftime('%Y-%m-%d')

In [112]:
adf.loc[pd.to_datetime(adf['date'], format='%Y-%m-%d') == pd.to_datetime(fc_date, format='%Y-%m-%d')]

Unnamed: 0,year,month,day,rmm1,rmm2,phase,amp,label,date
39,2024,12,3,2.453316,0.843059,5,2.59413,analysis,2024-12-03
