This Jupyter Notebook contains code to convert .tsf files to .npz files and load them.
 
The main steps are:
1. Import necessary libraries and define the `convert_tsf_to_dataframe` function to read and convert .tsf files into pandas DataFrames.
2. Define a dictionary `files` that maps file numbers to their respective .tsf file names.
3. Define the `standardize_series` function to z-score standardize a time series (subtract its mean, then divide by its standard deviation).
4. Define the `process_and_save_tsf` function to process the .tsf files, standardize the series, and save them as .npz files.
5. Iterate over the `files` dictionary, process each .tsf file, and save the output as .npz files.
6. Define the `load_npz` function to load and print the contents of a .npz file.
7. Load and print the contents of the 'weather_dataset.npz' file.

In [None]:
from datetime import datetime
from distutils.util import strtobool

import numpy as np
import pandas as pd

def _parse_tsf_bool(token):
    """Parse a boolean meta-data flag such as ``true`` / ``false``.

    Accepts the same tokens as the former ``distutils.util.strtobool`` so the
    parser does not depend on ``distutils`` (removed in Python 3.12).
    """
    normalized = token.lower()
    if normalized in ("y", "yes", "t", "true", "on", "1"):
        return True
    if normalized in ("n", "no", "f", "false", "off", "0"):
        return False
    raise ValueError(f"invalid truth value {normalized!r}")


def _parse_tsf_attribute(raw_value, col_type):
    """Convert one attribute field of a data line according to its declared type."""
    if col_type == "numeric":
        return int(raw_value)
    if col_type == "string":
        return str(raw_value)
    if col_type == "date":
        return datetime.strptime(raw_value, "%Y-%m-%d %H-%M-%S")
    # Only numeric, string and date attributes are supported; extend as required.
    raise Exception("Invalid attribute type.")


def _parse_tsf_series(series_text, replace_missing_vals_with):
    """Convert the comma separated series field of a data line into a list of values."""
    if not series_text:
        # str.split never returns an empty list, so emptiness must be tested on
        # the raw text. ValueError is what float("") raised here before, and it
        # is still an Exception like the parser's other errors.
        raise ValueError(
            "A given series should contains a set of comma separated numeric values. At least one numeric value should be there in a series. Missing values should be indicated with ? symbol"
        )

    tokens = series_text.split(",")

    # Test the raw tokens rather than the substituted values: otherwise a
    # numeric replacement (e.g. 0) makes an all-zero series look "all missing".
    if all(token == "?" for token in tokens):
        raise Exception(
            "All series values are missing. A given series should contains a set of comma separated numeric values. At least one numeric value should be there in a series."
        )

    return [
        replace_missing_vals_with if token == "?" else float(token)
        for token in tokens
    ]


def convert_tsf_to_dataframe(
    full_file_path_and_name,
    replace_missing_vals_with="NaN",
    value_column_name="series_value",
):
    """Read a .tsf file into a pandas DataFrame plus its meta-data.

    The file is read line by line (decoded as cp1252). Blank lines and lines
    starting with ``#`` are ignored, lines starting with ``@`` are meta-data,
    and every other line is a data line of colon separated attribute values
    followed by a comma separated series.

    Args:
        full_file_path_and_name: Path of the .tsf file to read.
        replace_missing_vals_with: Value stored in place of every ``?`` in a
            series. Note that the default is the *string* ``"NaN"``, not a
            float; pass ``float("nan")`` to keep series purely numeric.
        value_column_name: Name of the DataFrame column holding the series.

    Returns:
        A tuple ``(loaded_data, frequency, forecast_horizon,
        contain_missing_values, contain_equal_length)``. ``loaded_data`` has
        one row per series, one column per ``@attribute`` and one column
        (``value_column_name``) with the series values as a pandas array.
        The remaining items come from the ``@frequency``, ``@horizon``,
        ``@missing`` and ``@equallength`` tags and are ``None`` when the tag
        is absent.

    Raises:
        Exception: If the file is empty, the attribute section or ``@data``
            tag is missing or misplaced, a meta-data line is malformed, a data
            line has the wrong number of fields, an attribute type is
            unsupported, or a series contains only missing values.
        ValueError: If a series is empty or a number, boolean flag or date
            cannot be parsed.
    """
    col_names = []
    col_types = []
    all_data = {}
    all_series = []
    line_count = 0
    frequency = None
    forecast_horizon = None
    contain_missing_values = None
    contain_equal_length = None
    found_data_tag = False

    with open(full_file_path_and_name, "r", encoding="cp1252") as file:
        for line in file:
            line = line.strip()
            if not line:
                continue

            # Only non-blank lines count towards the "Empty file." check.
            line_count += 1

            if line.startswith("@"):  # Meta-data
                if line.startswith("@data"):
                    if not col_names:
                        raise Exception(
                            "Missing attribute section. Attribute section must come before data."
                        )
                    found_data_tag = True
                    continue

                line_content = line.split(" ")

                if line.startswith("@attribute"):
                    # Attributes have both a name and a type.
                    if len(line_content) != 3:
                        raise Exception("Invalid meta-data specification.")
                    col_names.append(line_content[1])
                    col_types.append(line_content[2])
                    all_data[line_content[1]] = []
                    continue

                # Every other tag carries exactly one value.
                if len(line_content) != 2:
                    raise Exception("Invalid meta-data specification.")

                if line.startswith("@frequency"):
                    frequency = line_content[1]
                elif line.startswith("@horizon"):
                    forecast_horizon = int(line_content[1])
                elif line.startswith("@missing"):
                    contain_missing_values = _parse_tsf_bool(line_content[1])
                elif line.startswith("@equallength"):
                    contain_equal_length = _parse_tsf_bool(line_content[1])
                continue

            if line.startswith("#"):  # Comment
                continue

            # Anything else is a data line.
            if not col_names:
                raise Exception(
                    "Missing attribute section. Attribute section must come before data."
                )
            if not found_data_tag:
                raise Exception("Missing @data tag.")

            full_info = line.split(":")
            if len(full_info) != (len(col_names) + 1):
                raise Exception("Missing attributes/values in series.")

            *attribute_fields, series_text = full_info

            numeric_series = _parse_tsf_series(
                series_text, replace_missing_vals_with
            )
            all_series.append(pd.Series(numeric_series).array)

            for col_name, col_type, raw_value in zip(
                col_names, col_types, attribute_fields
            ):
                all_data[col_name].append(
                    _parse_tsf_attribute(raw_value, col_type)
                )

    if line_count == 0:
        raise Exception("Empty file.")
    if not col_names:
        raise Exception("Missing attribute section.")
    if not all_series:
        raise Exception("Missing series information under data section.")

    all_data[value_column_name] = all_series
    loaded_data = pd.DataFrame(all_data)

    return (
        loaded_data,
        frequency,
        forecast_horizon,
        contain_missing_values,
        contain_equal_length,
    )

In [None]:
# Datasets to convert. Keys are the 1-based position of each file name,
# stored as strings ("1" .. "18").
files = {
    str(position): tsf_name
    for position, tsf_name in enumerate(
        (
            "bitcoin_dataset_without_missing_values.tsf",
            "covid_deaths_dataset.tsf",
            "electricity_hourly_dataset.tsf",
            "electricity_weekly_dataset.tsf",
            "fred_md_dataset.tsf",
            "hospital_dataset.tsf",
            "kaggle_web_traffic_dataset_without_missing_values.tsf",
            "kdd_cup_2018_dataset_without_missing_values.tsf",
            "london_smart_meters_dataset_without_missing_values.tsf",
            "nn5_daily_dataset_without_missing_values.tsf",
            "oikolab_weather_dataset.tsf",
            "pedestrian_counts_dataset.tsf",
            "rideshare_dataset_without_missing_values.tsf",
            "temperature_rain_dataset_without_missing_values.tsf",
            "traffic_hourly_dataset.tsf",
            "traffic_weekly_dataset.tsf",
            "weather_dataset.tsf",
            "wind_farms_minutely_dataset_without_missing_values.tsf",
        ),
        start=1,
    )
}

In [None]:
def standardize_series(data):
    """Return ``data`` centred on its mean and scaled by its standard deviation.

    The population standard deviation (``ddof=0``) is used, and 1e-7 is added
    to it so that a constant series does not cause a division by zero.
    """
    center = np.mean(data)
    scale = np.std(data, ddof=0) + 1e-7
    return (data - center) / scale

def process_and_save_tsf(file_path, output_path):
    """Convert one .tsf file into an .npz archive of standardized series.

    The file is parsed with ``convert_tsf_to_dataframe``, every series is
    passed through ``standardize_series``, and the results are written with
    ``np.savez`` using each row's ``series_name`` as the array name. The
    meta-data returned by the parser is not used.

    Args:
        file_path: Path of the .tsf file to read.
        output_path: Destination passed to ``np.savez``.
    """
    frame, _, _, _, _ = convert_tsf_to_dataframe(
        file_path, replace_missing_vals_with="NaN", value_column_name="series_value"
    )

    frame['series_value'] = frame['series_value'].apply(standardize_series)

    # One archive entry per row; a repeated series name keeps the last row.
    arrays_by_name = dict(zip(frame['series_name'], frame['series_value']))
    np.savez(output_path, **arrays_by_name)

In [None]:
# Convert every dataset listed in `files`, writing each archive under the
# same base name with an .npz extension.
# NOTE(review): the two directory prefixes look like placeholders to be
# replaced with real locations - confirm before running.
for file_number, file_name in files.items():
    npz_name = file_name.replace(".tsf", ".npz")
    file_path = f'YOUR/SOURCE/DATASETS/{file_name}'
    output_path = f'YOUR/TARGET/DATASETS/{npz_name}'
    process_and_save_tsf(file_path, output_path)

In [None]:
def load_npz(file_path):
    """Print the names and contents of every array stored in an .npz file.

    Args:
        file_path: Path of the .npz archive, passed to ``np.load``.
    """
    with np.load(file_path) as archive:
        names = list(archive.keys())
        print("Available arrays:", names)
        for name in names:
            values = archive[name]
            print(f"{name}: {values}")

# Print the contents of the converted weather dataset.
# load_npz expects the archive path itself. Indexing the path string with
# 'T1' raised TypeError (string indices must be integers) before the file
# was ever opened.
file_path = 'weather_dataset.npz'
load_npz(file_path)