In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
def generate_fake_data_series(
    length=200000,
    baseline=50,
    baseline_noise_std=0.1,
    section_noise_std=0.4,
    sin_noise_freqs=[10, 30, 50],
    sin_noise_amplitudes=[5, 5, 5],
    peak_vars=[4, 70, 8],
):
    """Generate a synthetic 1-D signal containing labelled valley/peak events.

    The signal is a constant baseline plus Gaussian white noise plus a sum of
    sinusoids.  On top of that, "sections" are inserted one after another; each
    section holds a negative Gaussian bump (valley) followed by a positive
    Gaussian bump (peak), with extra white noise added in the stretch between
    the two.  All random draws use NumPy's global RNG (``np.random``), so call
    ``np.random.seed`` beforehand for reproducible output.

    Parameters
    ----------
    length : int
        Number of samples in the generated series.
    baseline : float
        Constant offset of the signal.
    baseline_noise_std : float
        Standard deviation of the white noise added to every sample.
    section_noise_std : float
        Standard deviation of the extra white noise added between the valley
        and the peak of each section.
    sin_noise_freqs : sequence of float
        One entry per sinusoid.  Despite the name, each value is used as a
        *period* in samples: the component is ``sin(2 * pi * t / value)``.
    sin_noise_amplitudes : float or sequence of float
        Amplitude of each sinusoid.  A scalar is applied to every sinusoid; a
        sequence is indexed in parallel with ``sin_noise_freqs``.
    peak_vars : sequence of 3 numbers
        ``[amplitude, mean_margin, mean_std]``: the height of the peak (and
        depth of the valley), the centre of the +/-30 range from which the
        per-section margin ("peak size") is drawn, and the centre of the +/-4
        range from which the per-section Gaussian std is drawn.

    Returns
    -------
    time_series : numpy.ndarray
        Float array of shape ``(length,)`` with the generated signal.
    labels : numpy.ndarray
        Float array of shape ``(length,)``; 1 within +/-6 std of every valley
        and peak centre, 0 elsewhere.
    """
    # The default lists above are only ever read, never mutated.

    # Use integer-valued sample positions.  np.linspace(0, length, length) has
    # a step of length / (length - 1), which makes the Gaussian centres (which
    # are computed as array indices) drift away from the index-based labels.
    time_points = np.arange(length, dtype=float)

    baseline_noise = np.random.normal(0, baseline_noise_std, time_points.shape)

    # Accept a scalar amplitude (shared by all sinusoids) or any indexable
    # sequence (list, tuple, ndarray) with one amplitude per sinusoid.
    if np.ndim(sin_noise_amplitudes) == 0:
        amplitudes = [sin_noise_amplitudes] * len(sin_noise_freqs)
    else:
        amplitudes = sin_noise_amplitudes
    baseline_sin_noise = sum(
        amplitudes[i] * np.sin(2 * np.pi * time_points / sin_noise_freqs[i])
        for i in range(len(sin_noise_freqs))
    )

    time_series = baseline + baseline_noise + baseline_sin_noise
    labels = np.zeros(len(time_points))

    # Sections start after the first 1000 samples and no new section is
    # started within the last 5000 samples.
    base_index = 1000
    while base_index < len(time_points) - 5000:
        distance_between_peaks = np.random.randint(500, 1000)

        # Jitter the section start (this may overlap the previous section),
        # but never move it to or before index 0.
        shift = int(np.random.normal(0, 50))
        if base_index - shift > 0:
            base_index -= shift

        peak_size = np.random.randint(peak_vars[1] - 30, peak_vars[1] + 30)
        # Guard against a zero (or negative) std, which would divide by zero
        # in the Gaussian below when peak_vars[2] is small.
        peak_std = max(1, np.random.randint(peak_vars[2] - 4, peak_vars[2] + 4))

        # Layout: 3*peak_size margin | valley | distance | peak | 3*peak_size margin
        section_size = 6 * peak_size + distance_between_peaks
        valley_position = base_index + 3 * peak_size
        peak_position = valley_position + distance_between_peaks

        time_section = time_points[base_index : base_index + section_size]
        valley = -peak_vars[0] * np.exp(
            -0.5 * ((time_section - valley_position) / peak_std) ** 2
        )
        peak = peak_vars[0] * np.exp(
            -0.5 * ((time_section - peak_position) / peak_std) ** 2
        )
        time_series[base_index : base_index + section_size] += valley + peak

        # Label +/-6 std around each event centre; clamp the start at 0 so a
        # negative start cannot wrap around to the end of the array.
        for center in (valley_position, peak_position):
            labels[max(0, int(center - 6 * peak_std)) : int(center + 6 * peak_std)] = 1

        # Extra noise between the two events, leaving peak_size samples clear
        # on either side.  The noise length is taken from the actual slice so
        # the two always agree, even for a truncated or empty slice.
        noisy_region = slice(valley_position + peak_size, peak_position - peak_size)
        noisy_length = len(time_series[noisy_region])
        time_series[noisy_region] += np.random.normal(
            0, section_noise_std, noisy_length
        )

        base_index += section_size

    return time_series, labels

In [None]:
# Build a collection of synthetic series, each with its own randomly drawn
# generation parameters.  The parameters are printed and also kept in
# `generation_vars` so every series can be traced back to its settings.

RANDOM_SEED = 42  # fixed seed: re-running this cell regenerates the same dataset
N_SERIES = 20  # number of series to generate
SERIES_LENGTH = 200000  # samples per series

np.random.seed(RANDOM_SEED)

all_data = []
generation_vars = []
all_labels = []
for i in range(N_SERIES):
    baseline_noise_std = np.random.uniform(0.05, 0.3)
    baseline = np.random.randint(10, 100)

    # Three sinusoid periods: an independent random integer in [15, 25) for
    # each, scaled by 1, 3 and 5 respectively.
    sin_noise_freqs = [np.random.randint(15, 25) * x for x in range(1, 6, 2)]
    sin_noise_amplitudes = [np.random.uniform(1, 4) for _ in range(3)]

    # peak_vars = [event amplitude, mean margin ("peak size") around each
    # event, mean Gaussian std].  The valley-to-peak distance is not set here;
    # generate_fake_data_series draws it itself for every section.
    peak_vars = [
        np.random.uniform(6, 12),
        np.random.randint(50, 100),
        np.random.randint(8, 12),
    ]

    section_noise_std = np.random.uniform(0.3, 1)
    print(
        baseline,
        baseline_noise_std,
        section_noise_std,
        sin_noise_freqs,
        sin_noise_amplitudes,
        peak_vars,
    )
    generation_vars.append(
        [
            baseline,
            baseline_noise_std,
            section_noise_std,
            sin_noise_freqs,
            sin_noise_amplitudes,
            peak_vars,
        ]
    )
    data, labels = generate_fake_data_series(
        SERIES_LENGTH,
        baseline,
        baseline_noise_std,
        section_noise_std,
        sin_noise_freqs,
        sin_noise_amplitudes,
        peak_vars,
    )
    all_data.append(data)
    all_labels.append(labels)

In [None]:
# Visual sanity check: for every generated series, plot the first 5000 samples
# of the raw signal, a 100-sample centred rolling mean, and the label mask.
for d, (data, labels) in enumerate(zip(all_data, all_labels)):
    print(generation_vars[d])

    # Derive the x axis from the series itself instead of a hard-coded
    # length, so the plot keeps working if the series length changes.
    time_points = np.linspace(0, len(data), len(data))

    data_smooth = (
        pd.Series(data).rolling(100, center=True, min_periods=1).mean().to_numpy()
    )

    # Draw the labels as a step trace: it sits on the mean of the smoothed
    # signal and is raised by 3 wherever the label is 1.
    smooth_mean = np.mean(data_smooth)
    plotLabels = np.where(labels == 1, smooth_mean + 3, smooth_mean)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.set_title("Generated data noise")
    ax.plot(time_points, data, label="Time Series")
    ax.plot(time_points, data_smooth, label="Time Series Smooth(100)")
    ax.plot(time_points, plotLabels, label="label")
    ax.legend()
    ax.set_xlabel("Time")
    ax.set_ylabel("Value")
    ax.set_xlim(0, 5000)
    plt.show()
    # Release the figure explicitly; this loop creates one figure per series.
    plt.close(fig)

In [None]:
from pathlib import Path

# Write every generated series to its own CSV file and record the relative
# file names in an index file next to the output directory.
directory = "data/labeled_data/"
Path(directory).mkdir(parents=True, exist_ok=True)

fileNames = []
for d, data in enumerate(all_data):
    labels = all_labels[d]

    # Row 0 holds the signal, row 1 the labels.
    labeled_data = np.stack((np.array(data), np.array(labels)))
    print(labeled_data.shape)

    csv_name = f"generate_data{d}_labeled.csv"
    np.savetxt(directory + csv_name, labeled_data, delimiter=",")
    print(f"Saved generated data to {csv_name}")

    # Names in the index are relative to the "data/" directory.
    fileNames.append(f"labeled_data/{csv_name}")

# One file name per line.
with open("data/labeled_data_fileNames.txt", "w") as output:
    output.writelines(name + "\n" for name in fileNames)