In [3]:
import pandas as pd
import numpy as np
import random

# Seed both RNGs (NumPy's global state and the stdlib `random` module) so
# every run regenerates exactly the same synthetic dataset.
np.random.seed(42)
random.seed(42)

# Parameters
num_rows = 100000
# Experiment IDs are drawn from 1..50000 (upper bound exclusive), so repeats
# across rows are expected.
experiment_ids = np.random.randint(1, 50001, size=num_rows)
sensors = ["Temp_Sensor_1", "Temp_Sensor_2", "Pressure_Sensor", "Flow_Sensor", "pH_Sensor"]
units_all = ["Celsius", "Kelvin", "atm", "kPa", "L/min", "mL/s", "pH", "pH_units"]

# Timestamps: 90,000 one-minute ticks starting 2023-01-01 plus 10,000 ticks
# starting 2028-01-01 (the "future" dates).
timestamps = pd.date_range(start="2023-01-01", periods=90000, freq="min").tolist()
extra_times = pd.date_range(start="2028-01-01", periods=10000, freq="min").tolist()
all_times = timestamps + extra_times
random.shuffle(all_times)
# np.random.choice samples WITH replacement by default, so the final column
# is out of order and contains repeated timestamps.
all_times = np.random.choice(all_times, size=num_rows)

# Readings: uniform floats in [0.1, 100) rounded to 3 decimals; roughly 5% of
# them are stored as strings to simulate inconsistent typing in the logs.
readings = np.random.uniform(0.1, 100, size=num_rows).round(3)
mask_strings = np.random.choice([True, False], size=num_rows, p=[0.05, 0.95])
reading_values = [str(val) if string_flag else val for val, string_flag in zip(readings, mask_strings)]

# Sensor IDs: each sensor is picked with probability 0.19, and 5% are missing.
# The pool must be an object-dtype array: a plain list mixing str and NaN is
# coerced by NumPy to a unicode array, which would turn NaN into the literal
# text 'nan' instead of a real missing value.
sensor_pool = np.array(sensors + [np.nan], dtype=object)
sensor_ids = np.random.choice(
    sensor_pool, size=num_rows, p=[0.19, 0.19, 0.19, 0.19, 0.19, 0.05]
)

# Units are drawn independently of the sensor, so sensor/unit pairs are
# deliberately inconsistent.
units = np.random.choice(units_all, size=num_rows)

# Overwrite 50 distinct rows with the sentinel outlier -9999. Passing the int
# `num_rows` is equivalent to sampling from np.arange(num_rows).
outlier_indices = np.random.choice(num_rows, size=50, replace=False)
for idx in outlier_indices:
    reading_values[idx] = -9999

# Assemble the generated columns into a single frame; the key order here
# becomes the column order of the CSV header.
columns = {
    "timestamp": all_times,
    "sensor_id": sensor_ids,
    "experiment_id": experiment_ids,
    "reading_value": reading_values,
    "unit": units,
}
iot_sensor_data_messy = pd.DataFrame(columns)

# Simulate redundant log entries: re-append 500 rows picked with a fixed
# random_state so the same rows are duplicated on every run.
duplicates = iot_sensor_data_messy.sample(n=500, random_state=1)
iot_sensor_data_messy = pd.concat(
    [iot_sensor_data_messy, duplicates],
    ignore_index=True,
)

# Shuffle all rows so the duplicates are not clustered at the end, then
# renumber the index. No random_state is passed here, so pandas falls back
# to NumPy's global RNG state.
shuffled = iot_sensor_data_messy.sample(frac=1)
iot_sensor_data_messy = shuffled.reset_index(drop=True)

# Write the result to disk without the index column and report success.
iot_sensor_data_messy.to_csv("iot_sensor_data_messy.csv", index=False)
print("✅ Messy IoT sensor logs dataset created and saved as 'iot_sensor_data_messy.csv'")

✅ Messy IoT sensor logs dataset created and saved as 'iot_sensor_data_messy.csv'
