### Assumptions for all CSV files making up a data set:  
* each file contains one day of measurements with `1[s]` resolution (starting no earlier than `00:00:00` and ending no later than `23:59:59`),
* values should be `comma separated`,
* files share the same name prefix (e.g. `analysis`),
* file names end (before the extension) with a 6-digit date indicating the day of measurements in `YYMMDD` format (e.g. `230517` for 2023-05-17),
* each file name consists of the `prefix` followed by the `date`, with the `.csv` extension at the end (e.g. `analysis230517.csv`),
* comment lines start with a `#` sign,
* file should include a header row containing column names,
* the measurement timestamp column should be in UTC datetime format (e.g. `2023-05-17T01:15:27Z`),
* the frequency measurement column should hold float numbers expressed in [Hz] (e.g. `224999.865`).


In [None]:
# INPUTS #

# Folder names. `data_folder` is where the CSV files are searched for;
# `output_folder` is not used by the cells shown here (presumably the
# destination for results -- confirm against later cells).
data_folder = "data"
output_folder = "output"

# Common file-name prefix of the data set and the names of the two CSV
# columns that are loaded (timestamp and frequency).
data_filename_prefix = "analysis"
data_time_column_name = "UTC"
data_frequency_column_name = "Freq"

In [None]:
# ADDITIONAL MODULES AND DEPENDENCIES INSTALLATION #

!{sys.executable} -m pip install pandas

In [None]:
# IMPORTS #

from datetime import datetime
from pathlib import Path
import pandas as pd

In [None]:
# CSV FILE LIST CREATION #

# Glob pattern tail: six digits (the YYMMDD date) followed by the extension.
data_filename_suffix = '[0-9]' * 6 + '.csv'

# The data folder is looked up next to (not inside) the current working directory.
data_set_directory = Path.cwd().parent / data_folder
data_set_file_paths = sorted(
    data_set_directory.glob(f"{data_filename_prefix}{data_filename_suffix}")
)

print(f"Data set is made of {len(data_set_file_paths)} files.")

In [None]:
# LIST OF DATE-PATH TUPLES CREATION #

# The last six characters of each file stem are the YYMMDD date; pair the
# parsed date with the path it came from, keeping the order of the file list.
date_path = [
    (datetime.strptime(csv_path.stem[-6:], '%y%m%d'), csv_path)
    for csv_path in data_set_file_paths
]

In [None]:
# LOAD ITERATIVELY THE DATA FROM CSV FILES #

data_columns = [data_time_column_name, data_frequency_column_name]

# Read every file first and concatenate once at the end. Growing a DataFrame
# with pd.concat inside the loop copies all previously loaded rows on each
# iteration, and concatenating onto an empty object-dtype frame is deprecated
# in recent pandas versions.
loaded_frames = [
    pd.read_csv(
        path,
        comment='#',  # skip comment lines in the CSV files
        usecols=data_columns,
        parse_dates=[data_time_column_name],
        dtype={data_frequency_column_name: float},
    )
    for _, path in date_path
]

if loaded_frames:
    data = pd.concat(loaded_frames, axis=0)
else:
    # No files found: keep an empty frame with the expected columns so the
    # statements below still run.
    data = pd.DataFrame(columns=data_columns)

# Index the records by their timestamp; the frequency stays as the only column.
data.set_index(data_time_column_name, inplace=True)
data_amount = len(data.index)

print("CSV files provided " + str(data_amount) + " records for the dates range: " + str(data.index.min()) + " - " + str(data.index.max()))

In [None]:
# RESAMPLE DATAPOINTS TO 1[s] INTERVAL #

# Lowercase 's' is the supported alias for seconds; the uppercase 'S' alias
# is deprecated in recent pandas releases.
resample_interval = '1s'

# Put the data on a regular 1-second grid and fill the missing points by
# interpolation.
resampled_data = data.resample(resample_interval).interpolate()

# Align the result to a 1-second range spanning the first to the last loaded
# record (relies on the loaded data being in chronological order).
start_time = data.index[0]
end_time = data.index[-1]
data_points_times = pd.date_range(start_time, end_time, freq=resample_interval)
resampled_data = resampled_data.reindex(data_points_times)
resampled_data_amount = len(resampled_data.index)

print("Loaded data was resampled (just in case) to 1[s] and all the gaps in measurements were filled using interpolation.")
print("Ended up with " + str(resampled_data_amount) + " records (" + str(resampled_data_amount - data_amount) + " more).")



In [None]:
#https://gist.github.com/salticus/a462912dfff90c9bded954c48f916f64
#https://docs.python.org/3/library/glob.html