In [1]:
import pandas as pd
import numpy as np
import random

# Generate a synthetic, deliberately messy lab-notebook dataset and write it
# to 'lab_notebook_data_messy.csv'.
#
# The mess is intentional and is meant to be cleaned downstream:
#   * quantity / temperature / reaction_time are stored in mixed units, with
#     the unit of each row recorded in the matching *_unit column;
#   * chemical_used and success_status contain real missing values (NaN);
#   * 30 rows carry the sentinel value -999 in reaction_time.
#
# NOTE: the order of the np.random calls below determines the generated data
# for the fixed seed; do not reorder them.

# Set random seed
np.random.seed(42)
random.seed(42)

# Base values
num_rows = 50000
experiment_ids = range(1, num_rows + 1)
chemicals = ["Solvent A", "Solvent B", "Catalyst C", "Reactant D", "Buffer E"]
researchers = ["Sandra", "Lea", "Marion", "Joseph", "Ernest"]
statuses = ["Success", "Failure", "Partial Success"]

# Simulate inconsistent units (grams and milligrams).
# Base values are drawn in grams; rows labelled 'mg' hold the same amount
# expressed in milligrams.
quantities = np.random.uniform(0.1, 50, size=num_rows)
unit_choices = np.random.choice(['g', 'mg'], size=num_rows, p=[0.9, 0.1])
quantity_column = np.where(unit_choices == 'mg', quantities * 1000, quantities)

# Simulate inconsistent temperature formats (°C and °F).
# Base values are drawn in °C; rows labelled 'F' hold the Fahrenheit equivalent.
temperatures_C = np.random.uniform(20, 200, size=num_rows)
temp_units = np.random.choice(['C', 'F'], size=num_rows, p=[0.85, 0.15])
temperature_column = np.where(
    temp_units == 'F', temperatures_C * 1.8 + 32, temperatures_C
)

# Time inconsistencies: some in minutes, some in hours.
# Base values are drawn in minutes, so a row labelled 'hr' must hold
# minutes / 60 (multiplying by 60 would describe a different duration than
# the one drawn, unlike the quantity and temperature columns above).
reaction_time = np.random.randint(5, 300, size=num_rows)
time_units = np.random.choice(['min', 'hr'], size=num_rows, p=[0.9, 0.1])
reaction_time_column = np.where(
    time_units == 'hr', reaction_time / 60, reaction_time
)

# Introduce missing values randomly in chemical_used and success_status.
# The pools are built as object arrays on purpose: a plain list mixing str and
# np.nan is coerced by NumPy to a string array, which would turn the missing
# marker into the literal text 'nan' instead of a real NaN.
chemical_pool = np.array(chemicals + [np.nan], dtype=object)
status_pool = np.array(statuses + [np.nan], dtype=object)
chemicals_with_nans = np.random.choice(
    chemical_pool, size=num_rows, p=[0.18, 0.18, 0.18, 0.18, 0.18, 0.1]
)
statuses_with_nans = np.random.choice(
    status_pool, size=num_rows, p=[0.3, 0.3, 0.3, 0.1]
)

# Create DataFrame
lab_data_messy = pd.DataFrame({
    "experiment_id": experiment_ids,
    "researcher": np.random.choice(researchers, size=num_rows),
    "chemical_used": chemicals_with_nans,
    "quantity": quantity_column,
    "quantity_unit": unit_choices,
    "temperature": temperature_column,
    "temperature_unit": temp_units,
    "pressure_atm": np.random.uniform(0.5, 10, size=num_rows).round(2),
    "reaction_time": reaction_time_column,
    "reaction_time_unit": time_units,
    "success_status": statuses_with_nans,
})

# Plant invalid sentinel values (-999) in reaction_time for 30 distinct rows
outlier_indices = np.random.choice(lab_data_messy.index, size=30, replace=False)
lab_data_messy.loc[outlier_indices, "reaction_time"] = -999

# Save to CSV
lab_data_messy.to_csv("lab_notebook_data_messy.csv", index=False)
print("✅ Messy lab notebook dataset created and saved as 'lab_notebook_data_messy.csv'")

✅ Messy lab notebook dataset created and saved as 'lab_notebook_data_messy.csv'
