In [42]:
import pandas as pd
from typing import List, Tuple, Dict, Any
import numpy as np
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import os
import sys
import random
import matplotlib.pyplot as plt

In [43]:
# Name of the processed dataset; reused for the output folder and the output file names
dataset_name = "m4_yearly_sampled_series"

In [44]:
# Raw input file (TSF format), read relative to the notebook's working directory
input_fname = "m4_yearly_dataset.tsf"
# Folder that receives the processed artefacts for this dataset
output_dir = f'./../../processed/{dataset_name}/'
# Target path of the processed CSV
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')
# Target path of the figure (PNG)
outp_fig_fname = os.path.join(output_dir, f'{dataset_name}.png')

# Read Data

In [45]:
def data_line_generator(filepath: str):
    """
    Lazily stream the data section of a TSF-style file.

    Everything up to and including the first line that reads exactly '@data'
    (after stripping surrounding whitespace) is discarded. Each later line is
    stripped and yielded, except lines that are empty after stripping.

    Args:
    filepath (str): Path of the file to read.

    Yields:
    str: The next non-empty, whitespace-stripped data line.
    """
    with open(filepath, 'r') as handle:
        in_data_section = False
        for raw_line in handle:
            stripped = raw_line.strip()
            if not in_data_section:
                # Still in the header: the only thing of interest is the marker itself
                in_data_section = stripped == "@data"
                continue
            if stripped:
                yield stripped

In [46]:
def generate_timestamps(start_timestamp_str: str, time_steps: int) -> List[str]:
    """
    Build a run of timestamps spaced exactly one year apart.

    Args:
        start_timestamp_str (str): First timestamp, formatted as 'YYYY-MM-DD HH-MM-SS'.
        time_steps (int): How many yearly timestamps to produce (the first one included).

    Returns:
        List[str]: `time_steps` timestamps in the same string format, the first being
        the start timestamp itself and each subsequent one a year later.
    """
    # The same layout is used for parsing the input and rendering the output
    layout = "%Y-%m-%d %H-%M-%S"
    first = datetime.strptime(start_timestamp_str, layout)

    return [
        (first + relativedelta(years=offset)).strftime(layout)
        for offset in range(time_steps)
    ]

# Quick check: five yearly steps starting from 1979
test_timestamps = generate_timestamps(
    start_timestamp_str="1979-01-01 12-00-00",
    time_steps=5,
)
test_timestamps

['1979-01-01 12-00-00',
 '1980-01-01 12-00-00',
 '1981-01-01 12-00-00',
 '1982-01-01 12-00-00',
 '1983-01-01 12-00-00']

# Find most common start and end years of series
Each series in the M4 yearly dataset has its own start year and end year. We want to find the start-end combinations shared by the largest number of series, and we will pick our sample from those series.

In [47]:
# Fresh generator over the data lines of the input file; the counting function in this cell consumes it
data_gen = data_line_generator(input_fname)

def find_counts_for_series_start_year_and_length(max_series=10000, lines=None):
    """
    Count how many series share each (start timestamp, number of values) pair.

    Each data line is split on ':'. The second field is taken as the series
    start timestamp and the last field as the comma-separated values. Lines
    that cannot be parsed are reported on stdout and skipped.

    Args:
        max_series (int): Maximum number of data lines to consume.
        lines (iterable of str, optional): Source of data lines. When omitted,
            the module-level `data_gen` generator is consumed, so existing
            calls behave as before.

    Returns:
        dict: Maps (start timestamp string, number of values) to the number of
        lines that had that combination.
    """
    source = data_gen if lines is None else iter(lines)
    counts = {}
    for _ in range(max_series):
        # Reset on every pass so the error report below can never show a stale
        # line (or hit an unbound name) when reading the next line itself fails.
        line = None
        try:
            line = next(source)
            fields = line.split(":")
            key = (fields[1], len(fields[-1].split(",")))
            counts[key] = counts.get(key, 0) + 1
        except StopIteration:
            # Source exhausted before max_series lines were read
            break
        except Exception as e:
            # If any other exception occurs, print the error and the problematic line
            print("An error occurred:", e)
            print("Problematic line:", line)
    return counts

# Tally (start timestamp, length) combinations over at most 10,000 data lines
timestamp_lookup_dict = find_counts_for_series_start_year_and_length(max_series=10000)

In [48]:
# Largest number of series sharing a single (start timestamp, length) combination
max_count = max(timestamp_lookup_dict.values())
max_count
# Some start-end year combination occurs 2237 times

2237

In [49]:
# Collect every (start timestamp, length) key whose count equals the maximum.
# Ties are possible, so the result is a list of keys rather than a single key.
max_count_tuple = [
    key
    for key in timestamp_lookup_dict
    if timestamp_lookup_dict[key] == max_count
]
max_count_tuple

[('1969-01-01 12-00-00', 46)]

In [50]:
# Use the first winning combination: the year is the first four characters of
# its start timestamp, the step count is the second element of the key.
max_count_start_year, max_count_num_steps = (
    max_count_tuple[0][0][:4],
    max_count_tuple[0][1],
)
print(max_count_start_year, max_count_num_steps)

1969 46


We will now filter data to contain only series which start in the year 1969, and which have length 46 (i.e. they end in the year 2014). 

## Filter Series for start 1969 and end 2014

In [51]:
# Re-open the input: a generator can only be consumed once, so the filtering pass needs a fresh one
data_gen = data_line_generator(input_fname)

def process_data(max_series, lines=None, start_year=None, num_steps=None):
    """
    Build a long-format frame of all series matching the target start year and length.

    Each data line is split on ':'. The first field is the series name, the
    second the start timestamp, and the last the comma-separated values. A
    series is kept only when the first four characters of its start timestamp
    equal the target year and it has exactly the target number of values.
    Lines that cannot be parsed or converted are reported on stdout and skipped.

    Args:
        max_series (int): Maximum number of data lines to consume.
        lines (iterable of str, optional): Source of data lines. Defaults to
            the module-level `data_gen` generator.
        start_year (str, optional): Four-character target start year. Defaults
            to the module-level `max_count_start_year`.
        num_steps (int, optional): Target number of values per series. Defaults
            to the module-level `max_count_num_steps`.

    Returns:
        pd.DataFrame: Columns 'year' (timestamp string), 'series' (name) and
        'value' (float), one row per time step of every kept series. Each
        series keeps its own 0-based index. An empty frame with these columns
        is returned when no series matches.
    """
    source = data_gen if lines is None else iter(lines)
    target_year = max_count_start_year if start_year is None else start_year
    target_steps = max_count_num_steps if num_steps is None else num_steps

    frames = []
    # Timestamps depend only on (start, length), so compute them once per combination
    timestamp_cache = {}
    for _ in range(max_series):
        # Reset on every pass so the error report below never shows a stale line
        line = None
        try:
            line = next(source)
            fields = line.split(":")
            series_name = fields[0]
            series_start_datetime = fields[1]
            series_vals = fields[-1].split(",")
            num_time_steps = len(series_vals)

            # filter for target start year and num of years
            if series_start_datetime[:4] != target_year or num_time_steps != target_steps:
                continue

            cache_key = (series_start_datetime, num_time_steps)
            if cache_key not in timestamp_cache:
                timestamp_cache[cache_key] = generate_timestamps(*cache_key)

            df = pd.DataFrame({'year': timestamp_cache[cache_key]})
            df['series'] = series_name
            df['value'] = series_vals
            # Raises for non-numeric entries, which routes the series to the handler below
            df['value'] = df['value'].astype(float)
            frames.append(df)
        except StopIteration:
            # Source exhausted before max_series lines were read
            break
        except Exception as e:
            # If any other exception occurs, print the error and the problematic line
            print("An error occurred:", e)
            print("Problematic line:", line)

    if not frames:
        # pd.concat raises on an empty list; return a well-formed empty result instead
        return pd.DataFrame(columns=['year', 'series', 'value'])
    return pd.concat(frames)
        
# Parse at most 10,000 data lines, keeping the series that match the target start year and length
all_data = process_data(max_series=10000)
# Preview followed by the overall (rows, columns) shape, one per line
print(all_data.head(), all_data.shape, sep="\n")

                  year series  value
0  1969-01-01 12-00-00   T825  366.8
1  1970-01-01 12-00-00   T825  369.8
2  1971-01-01 12-00-00   T825  382.9
3  1972-01-01 12-00-00   T825  417.4
4  1973-01-01 12-00-00   T825  459.0
(102948, 3)


In [52]:
# Largest number of rows held by any single series
all_data.groupby('series').size().max()

46

# Sample 100 Series

In [53]:
# Distinct series names, in order of first appearance (the order feeds the seeded sampling)
unique_series = all_data['series'].drop_duplicates().tolist()
print(len(unique_series), unique_series[-10:], sep="\n")

2238
['T7531', 'T7532', 'T7533', 'T7534', 'T7535', 'T7536', 'T7537', 'T7538', 'T7539', 'T7540']


In [54]:
# Seed the generator so the same 100 series are drawn on every run
random.seed(123)
sampled_series = random.sample(unique_series, k=100)
print(len(sampled_series), sampled_series[-10:], sep="\n")

100
['T4656', 'T4362', 'T5178', 'T6992', 'T5481', 'T5881', 'T5953', 'T6867', 'T6806', 'T842']


# Final Data

In [55]:
# Keep only the rows of the sampled series and renumber the index from zero
final_data = (
    all_data
    .loc[all_data['series'].isin(sampled_series)]
    .reset_index(drop=True)
)

In [56]:
# Convert year to 'YYYY-MM-DD' form by keeping the first 10 characters of each timestamp.
# The vectorised .str accessor replaces a per-row Python lambda.
final_data['year'] = final_data['year'].str[:10]

In [57]:
# Inspect the distinct values of the year column after truncation
final_data['year'].unique()

array(['1969-01-01', '1970-01-01', '1971-01-01', '1972-01-01',
       '1973-01-01', '1974-01-01', '1975-01-01', '1976-01-01',
       '1977-01-01', '1978-01-01', '1979-01-01', '1980-01-01',
       '1981-01-01', '1982-01-01', '1983-01-01', '1984-01-01',
       '1985-01-01', '1986-01-01', '1987-01-01', '1988-01-01',
       '1989-01-01', '1990-01-01', '1991-01-01', '1992-01-01',
       '1993-01-01', '1994-01-01', '1995-01-01', '1996-01-01',
       '1997-01-01', '1998-01-01', '1999-01-01', '2000-01-01',
       '2001-01-01', '2002-01-01', '2003-01-01', '2004-01-01',
       '2005-01-01', '2006-01-01', '2007-01-01', '2008-01-01',
       '2009-01-01', '2010-01-01', '2011-01-01', '2012-01-01',
       '2013-01-01', '2014-01-01'], dtype=object)

In [58]:
# Column-name configuration used by the epoch and export steps
series_col = "series"      # identifier of each time series
epoch_col = 'epoch'        # integer time index added in the next step
epoch_label = "year_num"   # NOTE(review): not referenced in the visible cells — confirm where it is consumed
time_col = 'year'          # timestamp column the epochs are derived from
value_col = 'value'        # observed series values
exog_cols = []             # no exogenous feature columns are defined here

# Add Epochs

In [59]:
# Give every distinct timestamp its 0-based rank in sorted order and store that
# rank as the epoch column. The guard skips this when the column already
# exists, so re-running the cell does not fail on a duplicate insert.
if epoch_col not in final_data:
    unique_times = sorted(final_data[time_col].unique().tolist())
    times_to_epoch = dict(zip(unique_times, range(len(unique_times))))
    final_data.insert(
        loc=1,
        column=epoch_col,
        value=final_data[time_col].map(times_to_epoch),
    )
final_data.head()

Unnamed: 0,year,epoch,series,value
0,1969-01-01,0,T842,205.0142
1,1970-01-01,1,T842,231.7463
2,1971-01-01,2,T842,255.7048
3,1972-01-01,3,T842,277.5605
4,1973-01-01,4,T842,301.3527


# Save Main Data File

In [60]:
# Columns written to the output file: series id, epoch, value, then any exogenous columns
all_cols = [series_col, epoch_col, value_col, *exog_cols]

In [61]:
# Create the output folder first so the write also succeeds on a fresh checkout
os.makedirs(output_dir, exist_ok=True)
# Write only the selected columns, without the DataFrame index
final_data[all_cols].to_csv(outp_fname, index=False)