In [13]:
# ------Create synthetic dataset: smart lighting system data------
# Import libraries
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from google.colab import files

# Lookup tables that the record generators sample from.

# Kinds of faults a street light can report.
fault_types = [
    'Bulb Failure',
    'Power Outage',
    'Sensor Malfunction',
    'Communication Error',
    'Dimming Issue',
    'Physical Damage',
    'Wiring Fault',
    'Controller Malfunction',
    'Overheating',
]

# Severity labels, listed from least to most severe.
severity_levels = [
    'Low',
    'Medium',
    'High',
    'Critical',
]

# Workflow states a fault ticket can be in.
fault_statuses = [
    'Reported',
    'In Progress',
    'Resolved',
    'Deferred',
]

# Place names a light can be installed at.
locations = [
    'Margaret Drive',
    'Outram Park',
    'Pasir Ris',
    'Jurong West',
    'Orchard Road',
    'Toa Payoh',
    'Canberra',
    'Bugis Junction',
    'Bishan',
    'Chinatown',
]

# Generate light ids
def generate_light_ids(num_lights):
    """Create sequential light IDs and give each one a random location.

    Args:
        num_lights: Number of lights to create. IDs are numbered from 1 and
            zero-padded to at least four digits (``SL0001``, ``SL0002``, ...).

    Returns:
        A tuple ``(light_ids, light_locations)``:
        ``light_ids`` is the list of ID strings in creation order, and
        ``light_locations`` maps each ID to ``{'location_name': <name>}``,
        where the name is drawn at random from the module-level
        ``locations`` list.
    """
    light_ids = [f'SL{number:04d}' for number in range(1, num_lights + 1)]

    # Exactly one random draw per light, taken in ID order.
    light_locations = {
        light_id: {'location_name': random.choice(locations)}
        for light_id in light_ids
    }
    return light_ids, light_locations

# Generate fault record
def generate_fault_record(light_ids, light_locations, start_date, end_date):
    """Build one synthetic fault record for a randomly chosen light.

    Args:
        light_ids: Sequence of light ID strings to pick from.
        light_locations: Mapping of light ID to a dict that holds a
            ``'location_name'`` entry.
        start_date: ``datetime`` marking the start of the sampling window.
        end_date: ``datetime`` marking the end of the sampling window.

    Returns:
        A dict with the keys ``light_id``, ``location_name``, ``fault_type``,
        ``timestamp``, ``severity_level``, ``fault_status``,
        ``maintenance_cost``, ``brightness_level`` and ``power_consumption``.
        ``maintenance_cost`` is ``np.nan`` when the cost is treated as
        missing (roughly 10% of records).
    """
    # Which light failed, and where it is installed.
    chosen_id = random.choice(light_ids)
    placement = light_locations[chosen_id]

    # Categorical fault attributes.
    kind = random.choice(fault_types)
    severity = random.choice(severity_levels)
    status = random.choice(fault_statuses)

    # Uniformly random moment between start_date and end_date.
    window = end_date - start_date
    offset = timedelta(seconds=random.uniform(0, window.total_seconds()))
    occurred_at = start_date + offset

    # About one record in ten has no recorded maintenance cost.
    if random.random() > 0.1:
        cost = round(random.uniform(50, 500), 2)
    else:
        cost = np.nan

    brightness = random.randint(0, 100)  # brightness reading during the fault
    power = round(random.uniform(10, 150), 2)  # power reading during the fault

    return dict(
        light_id=chosen_id,
        location_name=placement['location_name'],
        fault_type=kind,
        timestamp=occurred_at,
        severity_level=severity,
        fault_status=status,
        maintenance_cost=cost,
        brightness_level=brightness,
        power_consumption=power,
    )

# Generate data
def generate_data(num_lights=3000, num_faults=300,
                  start_date=datetime(2020,1,1),
                  end_date=datetime(2020,12, 31),
                  save_csv=True,
                  filename='lighting_data.csv',
                  seed=None):
    """Generate a synthetic smart-lighting fault dataset as a DataFrame.

    Args:
        num_lights: Number of distinct lights to create.
        num_faults: Number of fault records to generate.
        start_date: Start of the window fault timestamps are drawn from.
        end_date: End of the window fault timestamps are drawn from.
        save_csv: When true, write the data to ``filename`` and trigger a
            Colab browser download of that file.
        filename: Path of the CSV file written when ``save_csv`` is true.
        seed: Optional seed for the ``random`` module. Pass a value to get
            the same dataset on every run; ``None`` (the default) leaves the
            global random state untouched.

    Returns:
        A DataFrame with one row per fault, sorted by ``timestamp`` with a
        fresh 0..n-1 index. When ``num_faults`` is 0 the frame is empty and
        has no columns.
    """
    if seed is not None:
        random.seed(seed)

    print(f"Generate {num_faults} fault records for {num_lights} lights.")

    # 1. Generate light ids and locations
    light_ids, light_locations = generate_light_ids(num_lights)

    # 2. Generate the fault records
    data = [
        generate_fault_record(light_ids, light_locations, start_date, end_date)
        for _ in range(num_faults)
    ]

    # 3. Create dataframe
    df = pd.DataFrame(data)
    # With zero records the frame has no 'timestamp' column, so sorting by it
    # would raise KeyError; only sort when there is something to sort.
    if not df.empty:
        df = df.sort_values(by='timestamp').reset_index(drop=True)

    # 4. Save to csv and download
    if save_csv:
        df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")
        files.download(filename)  # triggers auto-download in colab
    return df

# Run the generator with its default arguments and keep the result in `df`.
df = generate_data()
df  # bare expression as the last statement of the cell (shown as cell output in a notebook)


Generate 300 fault records for 3000 lights.
Data saved to lighting_data.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,light_id,location_name,fault_type,timestamp,severity_level,fault_status,maintenance_cost,brightness_level,power_consumption
0,SL2474,Bishan,Sensor Malfunction,2020-01-05 03:42:40.132435,Medium,Reported,454.73,61,83.15
1,SL1721,Bugis Junction,Bulb Failure,2020-01-05 09:52:36.102521,Low,Reported,429.52,72,147.94
2,SL2571,Pasir Ris,Power Outage,2020-01-06 04:22:39.534537,Medium,Reported,309.47,15,98.86
3,SL2270,Margaret Drive,Overheating,2020-01-08 03:35:11.439555,High,Deferred,327.25,31,83.20
4,SL2088,Pasir Ris,Wiring Fault,2020-01-08 22:59:11.263899,Medium,Resolved,,35,131.60
...,...,...,...,...,...,...,...,...,...
295,SL2840,Jurong West,Overheating,2020-12-27 10:38:43.456539,Critical,In Progress,343.56,20,91.85
296,SL0509,Margaret Drive,Controller Malfunction,2020-12-27 17:06:54.589495,Medium,Resolved,,69,92.51
297,SL1054,Bugis Junction,Bulb Failure,2020-12-29 13:21:17.785557,Medium,Reported,302.33,92,63.79
298,SL2690,Bugis Junction,Overheating,2020-12-30 03:54:01.365744,Low,Reported,430.70,60,137.01
