In [17]:
import pandas as pd
from typing import List, Tuple, Dict, Any
import numpy as np
from datetime import datetime, timedelta
import os
import sys
import matplotlib.pyplot as plt

In [18]:
dataset_name = "air_quality_kdd_2018"

In [19]:
input_fname = "kdd_cup_2018_dataset_without_missing_values.tsf"
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')
outp_fig_fname = os.path.join(output_dir, f'{dataset_name}.png')

In [20]:
def data_line_generator(filepath: str):
    """
    Generator function that yields each line of data after the '@data' marker in the file.

    Args:
    filepath (str): The path to the file from which to read the data.

    Yields:
    str: The next line of data from the file.
    """
    with open(filepath, 'r') as file:
        # Iterate over the file until the '@data' marker is found
        for line in file:
            if line.strip() == "@data":
                break

        # Yield each line of data after the '@data' marker
        for line in file:
            if not line.strip():  # Skip any empty lines that may be present
                continue
            yield line.strip()

In [27]:
def generate_timestamps(start_timestamp_str: str, time_steps: int) -> List[str]:
    """
    Generates a list of hourly timestamps starting with the given timestamp and incrementing by 1 hour.

    Args:
        start_timestamp_str (str): The starting timestamp in string format 'YYYY-MM-DD HH-MM-SS'.
        time_steps (int): The number of hourly time steps to generate.

    Returns:
        List[str]: A list of timestamps in string format incremented hourly.
    """
    # Convert the start timestamp string to a datetime object
    start_timestamp = datetime.strptime(start_timestamp_str, "%Y-%m-%d %H-%M-%S")
    
    # Generate a list of timestamps
    timestamps = [start_timestamp + timedelta(hours=i) for i in range(time_steps)]
    
    # Convert the timestamps back to strings
    timestamp_strs = [timestamp.strftime("%Y-%m-%d %H:%M:%S") for timestamp in timestamps]
    
    return timestamp_strs

# Test the function with a start timestamp and 5 time steps
test_timestamps = generate_timestamps("2017-01-01 14-00-00", 5)
test_timestamps

['2017-01-01 14:00:00',
 '2017-01-01 15:00:00',
 '2017-01-01 16:00:00',
 '2017-01-01 17:00:00',
 '2017-01-01 18:00:00']

In [28]:
data_gen = data_line_generator(input_fname)

def process_data():
    # Print the first few lines to ensure the generator is working as expected
    df_list = []
    timestamp_lookup_dict = {}
    for i in range(500):
        try:
            line = next(data_gen)
            fields = line.split(":")
            # we will only use Beijing data. London data is of different length than Beijing
            if fields[1] != 'Beijing': 
                continue
            num_time_steps = len(fields[-1].split(","))
            timestamp_lookup = (fields[4], num_time_steps)
            if timestamp_lookup not in timestamp_lookup_dict:
                timestamp_lookup_dict[timestamp_lookup] = generate_timestamps(*timestamp_lookup)
            timestamps_list = timestamp_lookup_dict[timestamp_lookup]
            #print(i, fields[:5], timestamp_lookup, len(timestamps_list))
            df = pd.DataFrame({'timestamp': timestamps_list})
            df['station'] = fields[2]
            df['air_quality_measurement'] = fields[3]
            df['value'] = fields[-1].split(",")
            df['value'] = df['value'].astype(float)
            df_list.append(df)
#             print(df.shape)
#             sys.exit()
        except StopIteration: 
            break
        except Exception as e:
            # If any other exception occurs, print the error and the problematic line
            print("An error occurred:", e)
            print("Problematic line:", line)
    all_data = pd.concat(df_list)
    return all_data
        
all_data = process_data()
print(all_data.head())
print(all_data.shape)

0 ['T1', 'Beijing', 'aotizhongxin_aq', 'PM2.5', '2017-01-01 14-00-00'] ('2017-01-01 14-00-00', 10898) 10898


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# Pivot metrics so we can use some as exogenous
pivoted_data = all_data.pivot_table(
    index=['timestamp', 'station'],
    columns='air_quality_measurement',
    values='value', aggfunc=sum).reset_index()
print(pivoted_data.head())
print(pivoted_data.shape)

# Remove Series with Constant Values
Station **zhiwuyuan_aq** has constant values near the end of the series. This causes issue with metric calculations.
We will remove this series. 

In [None]:
pivoted_data = pivoted_data[pivoted_data['station'] != 'zhiwuyuan_aq']

In [None]:
series_col = "station"
epoch_col = 'timestamp'
epoch_label = "timestamp"
time_col = 'timestamp'
value_col = 'PM2.5'
exog_cols = ['PM10', 'NO2', 'CO', 'O3', 'SO2']

# Convert to Numeric

In [None]:
pivoted_data = pivoted_data.round({'PM2.5': 20, 'PM10': 2, 'NO2': 2, 'CO': 2, 'O3': 2, 'SO2': 2})

In [None]:
pivoted_data.head()

In [None]:
pivoted_data.info()

# Convert datetime to correct format. 

In [None]:
# Convert the datetime column to the desired string format
# pivoted_data[time_col] = pivoted_data[time_col].dt.strftime('%Y-%m-%d %H-%M-%S')
# pivoted_data.head()

# Save Main Data File

In [None]:
if time_col: 
    all_cols = [series_col, time_col, value_col] + exog_cols
else: 
    all_cols = [series_col, value_col] + exog_cols
    
pivoted_data.sort_values(by=[series_col, epoch_col], inplace=True)
pivoted_data[all_cols].to_csv(outp_fname, index=False, float_format="%.1f")

In [None]:
pivoted_data.tail()