In [1]:
import os
import glob
import pandas as pd
import random

In [4]:
# Build an inventory of every CSV under ../datasets/daep/ (recursively) and
# cache each file as a pickle in ../pkl/.

# glob returns paths in arbitrary, filesystem-dependent order; sorting keeps
# the row order of csv_df stable between runs and machines.
csv_files = sorted(glob.glob('../datasets/daep/**/*.csv', recursive=True))

# Create the output directory once, up front, rather than on every iteration.
# This also guarantees the directory exists when no CSV files are found.
pkl_dir = '../pkl'
os.makedirs(pkl_dir, exist_ok=True)

# One record per CSV: source file name, row count, and the pickle written for it
results = []

for file_path in csv_files:
    file_name = os.path.basename(file_path)

    # Swap only the trailing extension. A plain str.replace('.csv', '.pkl')
    # would also rewrite '.csv' occurring in the middle of a name and would
    # leave an upper-case '.CSV' suffix untouched.
    pkl_name = os.path.splitext(file_name)[0] + '.pkl'

    # Read the CSV file to get the record count
    df = pd.read_csv(file_path)
    record_count = len(df)

    # Record the pickle name here, from the same value used to write the file,
    # so the inventory can never disagree with what is on disk.
    results.append({
        'csv_name': file_name,
        'record_count': record_count,
        'pkl_name': pkl_name,
    })

    # Convert Timestamp column to datetime if it exists
    if 'Timestamp' in df.columns:
        df['Timestamp'] = pd.to_datetime(df['Timestamp'])

    # NOTE(review): pickles are keyed by base name only, so two CSVs with the
    # same name in different subdirectories would overwrite each other here.
    pickle_file = os.path.join(pkl_dir, pkl_name)
    df.to_pickle(pickle_file)

# Passing the columns explicitly keeps the frame well-formed (same three
# columns, zero rows) even when no CSV files were found.
csv_df = pd.DataFrame(results, columns=['csv_name', 'record_count', 'pkl_name'])

display(csv_df)

Unnamed: 0,csv_name,record_count,pkl_name
0,Additional_Information_by_Country.csv,49,Additional_Information_by_Country.pkl
1,Australia_AEMO_2006.csv,17519,Australia_AEMO_2006.pkl
2,Australia_AEMO_2007.csv,17520,Australia_AEMO_2007.pkl
3,Australia_AEMO_2008.csv,17568,Australia_AEMO_2008.pkl
4,Australia_AEMO_2009.csv,17520,Australia_AEMO_2009.pkl
...,...,...,...
501,USA_SPP_2021.csv,8759,USA_SPP_2021.pkl
502,USA_SPP_2022.csv,8759,USA_SPP_2022.pkl
503,USA_SPP_2023.csv,8759,USA_SPP_2023.pkl
504,USA_SPP_2024.csv,8778,USA_SPP_2024.pkl


In [5]:
# Persist the file inventory table (csv_df) as a pickle.
files_list_path = '../pkl/files_list.pkl'
csv_df.to_pickle(files_list_path)

In [12]:
# Spot-check the inventory by displaying a random contiguous window of rows.
sample_size = 10

# Clamp the largest admissible start index at 0: with fewer than sample_size
# rows an unclamped bound would be negative and random.randint would raise
# ValueError. In that case the slice below just returns every available row.
last_start = max(len(csv_df) - sample_size, 0)
start_idx = random.randint(0, last_start)

display(csv_df.iloc[start_idx:start_idx + sample_size])

Unnamed: 0,csv_name,record_count,pkl_name
142,France_ENTSO-E_2020.csv,8784,France_ENTSO-E_2020.pkl
143,France_ENTSO-E_2021.csv,8760,France_ENTSO-E_2021.pkl
144,France_ENTSO-E_2022.csv,8760,France_ENTSO-E_2022.pkl
145,France_ENTSO-E_2023.csv,8760,France_ENTSO-E_2023.pkl
146,France_ENTSO-E_2024.csv,6672,France_ENTSO-E_2024.pkl
147,Germany_ENTSO-E_2015.csv,8664,Germany_ENTSO-E_2015.pkl
148,Germany_ENTSO-E_2016.csv,8784,Germany_ENTSO-E_2016.pkl
149,Germany_ENTSO-E_2017.csv,8760,Germany_ENTSO-E_2017.pkl
150,Germany_ENTSO-E_2018.csv,15384,Germany_ENTSO-E_2018.pkl
151,Germany_ENTSO-E_2019.csv,35040,Germany_ENTSO-E_2019.pkl
