In [1]:
import numpy as np
import pandas as pd
import json
import random

In [2]:
# --- Step 1: Read the source CSV and decode the JSON `items` column ---
df = pd.read_csv("100_thermal_data.csv").assign(
    parsed_items=lambda frame: frame['items'].apply(json.loads)
)
# Expand the decoded items into their own frame, one row per CSV record
# (presumably each `items` cell is a JSON object keyed by sensor name — confirm).
sensor_df = pd.DataFrame(df['parsed_items'].tolist())

In [3]:
# --- Step 2: Interpolate sensors to 100,000 rows ---
# Give the source rows integer positions 0..len-1, then lay out 100,000 evenly
# spaced (fractional) positions across that same span.
sensor_df.index = pd.Index(range(len(sensor_df)))
new_index = pd.Index(np.linspace(0, len(sensor_df) - 1, 100000))
# Reindexing straight onto `new_index` would keep only the source rows whose
# integer label coincides exactly with one of the new fractional positions
# (often just the first and last), and method='linear' ignores the index, so
# the interior readings would be thrown away.  Instead, interpolate on the
# union of old and new positions, weighted by index distance, and then keep
# only the new positions.
combined_index = sensor_df.index.union(new_index)
sensor_df_1000 = (
    sensor_df.reindex(combined_index)
    .interpolate(method='index')
    .reindex(new_index)
    .round(2)
    .reset_index(drop=True)
)


In [4]:
# Estimate the per-sensor (column-wise) standard deviation from the original,
# un-interpolated readings.
std_per_sensor = sensor_df.std()
# Inject small zero-mean Gaussian noise scaled to 1/25 of each sensor's std.
# The scale vector broadcasts across rows (one scale per column).  A locally
# seeded generator keeps the synthetic data reproducible across kernel
# restarts without touching NumPy's global random state.
noise_rng = np.random.default_rng(42)
noise = noise_rng.normal(
    loc=0,
    scale=std_per_sensor.to_numpy() / 25,
    size=sensor_df_1000.shape,
)
sensor_df_1000_noisy = (sensor_df_1000 + noise).round(2)

In [5]:
# Overwrite a small, fixed number of rows with anomalies (50 rows, i.e. 0.05%
# of a 100,000-row frame).
num_anomalies = 50
# Locally seeded generators make both the anomaly positions and the spike
# values reproducible, so the list of anomalous rows can be regenerated after
# a kernel restart and re-running this cell rewrites the same rows.
anomaly_rows = random.Random(42).sample(
    range(len(sensor_df_1000_noisy)), num_anomalies
)

# One random spike value per sensor (20–62°C) for every anomalous row, drawn in
# a single call and written with one positional assignment.
spike_values = np.random.default_rng(42).uniform(
    20, 62, size=(num_anomalies, sensor_df_1000_noisy.shape[1])
)
sensor_df_1000_noisy.iloc[anomaly_rows] = np.round(spike_values, 2)

In [6]:
# Display the positional row indices that were sampled for anomaly injection.
anomaly_rows

[20170,
 18808,
 32666,
 21600,
 36430,
 21501,
 10534,
 25649,
 50633,
 87072,
 20140,
 43996,
 34205,
 13372,
 52461,
 88038,
 69788,
 3072,
 2633,
 46783,
 66254,
 81332,
 62893,
 13529,
 58751,
 47299,
 24401,
 12144,
 81854,
 72670,
 87755,
 98929,
 22208,
 17394,
 30194,
 82322,
 2162,
 28267,
 23932,
 16728,
 85865,
 68718,
 22462,
 86218,
 67832,
 14369,
 45863,
 17464,
 81006,
 99811]

In [8]:
# --- Step 3: Determine the time span for the expanded timestamps ---
# Parse the timestamp column, then take the first and last rows as the
# endpoints (the source is presumably ordered latest-first, so the first row is
# the latest time and the last row the oldest — confirm against the CSV).
df['packet_time'] = pd.to_datetime(df['packet_time'])
start_time, end_time = (df['packet_time'].iat[pos] for pos in (0, -1))

In [10]:
# Generate one evenly spaced timestamp per expanded sensor row between start
# and end.  Deriving the count from the sensor frame (instead of repeating the
# literal row count) keeps the timestamps and sensor rows the same length when
# the final DataFrame is assembled.
packet_times = pd.date_range(
    start=start_time,
    end=end_time,
    periods=len(sensor_df_1000_noisy),
)

In [11]:
# --- Step 4: Assemble the expanded DataFrame ---
def _row_to_json(sensor_row):
    """Serialize one row of sensor readings as a JSON object string."""
    return json.dumps(sensor_row.to_dict())


# Re-encode every sensor row as JSON and pair it with its timestamp.
items_json = sensor_df_1000_noisy.apply(_row_to_json, axis=1)
df_expanded = pd.DataFrame({'packet_time': packet_times, 'items': items_json})

In [12]:
# Attach the constant metadata columns; each value is broadcast to every row.
df_expanded = df_expanded.assign(
    boot_count=6450,
    spacecraft="DEFAULT",
    target="THERMAL",
    packet="THERMAL_HK",
)


In [13]:
# Write the expanded dataset to disk, omitting the positional index column.
output_csv = '100000_thermal_data.csv'
df_expanded.to_csv(output_csv, index=False)