In [10]:
# ------Create synthetic dataset: smart lighting system------
# Import libraries
import pandas as pd # Data manipulation and analysis
import numpy as np # Numerical operations and arrays
import random # Generate random numbers and random selections
from datetime import datetime, timedelta # datetime: format dates and times, timedelta: add or subtract time durations (days or hours)
from google.colab import files # Upload and download files

# Fault categories a record can be tagged with
fault_types = [
    'Power-related',
    'Communication',
    'Sensor-related',
    'Control system',
    'Environmental',
    'Cybersecurity',
]

# Severity labels a record can carry
severity_levels = [
    'Low',
    'Medium',
    'High',
    'Critical',
    'Informational',
]

# Lifecycle stages a fault can be in
fault_statuses = [
    'Detected',
    'Acknowledged',
    'In Progress',
    'Resolved',
    'Closed',
]

# Location names that lights are assigned to
locations = [
    'City Hall',
    'Outram Park',
    'Pasir Ris',
    'Jurong West',
    'Orchard Road',
    'Toa Payoh',
    'Canberra',
    'Bugis Junction',
    'Bishan',
    'Chinatown',
]

def generate_light_ids(num_lights):
    """Create sequential light ids and give each one a random location.

    Ids are the letter 'L' followed by the 1-based index zero-padded to four
    digits (L0001, L0002, ...). Each light's location is drawn independently
    with random.choice from the module-level `locations` list.

    Args:
        num_lights: How many lights to create. Values below 1 produce empty
            results because the index range is empty.

    Returns:
        A tuple (light_ids, light_locations): the list of ids in creation
        order, and a dict mapping each id to {'location_name': <name>}.
    """
    # Build the id -> location mapping in one pass; dicts keep insertion
    # order, so the keys double as the ordered id list.
    light_locations = {
        f'L{index:04d}': {'location_name': random.choice(locations)}
        for index in range(1, num_lights + 1)
    }
    return list(light_locations), light_locations

def generate_fault_records(light_ids, light_locations, start_date, end_date,
                           min_records=100, max_records=300):
    """Build a randomly sized list of synthetic fault records.

    The number of records is drawn with random.randint, so both bounds are
    inclusive. Each record picks a light, a fault type, a severity level and
    a status uniformly at random, plus a timestamp between the two dates.

    Args:
        light_ids: Sequence of light ids to sample from.
        light_locations: Dict mapping each light id to a dict that has a
            'location_name' entry.
        start_date: Datetime at the start of the timestamp window.
        end_date: Datetime at the end of the timestamp window.
        min_records: Smallest number of records to generate (default 100).
        max_records: Largest number of records to generate (default 300).

    Returns:
        A list of dicts, one per generated fault, with the keys light_id,
        location_name, fault_type, timestamp, severity_level, fault_status
        and maintenance_cost.
    """
    record_count = random.randint(min_records, max_records)
    window_seconds = (end_date - start_date).total_seconds()

    def build_record():
        # The order of the random draws below is kept fixed so that a seeded
        # run reproduces the same records.
        chosen_light = random.choice(light_ids)
        place = light_locations[chosen_light]['location_name']

        kind = random.choice(fault_types)
        severity = random.choice(severity_levels)
        status = random.choice(fault_statuses)

        # Random offset into the window; microseconds are dropped (truncated,
        # not rounded) so timestamps have whole-second precision.
        offset = timedelta(seconds=random.uniform(0, window_seconds))
        occurred_at = (start_date + offset).replace(microsecond=0)

        # Roughly 10% of records get a missing cost; the rest get a value
        # between 50 and 500 rounded to two decimals.
        cost = np.nan if random.random() <= 0.1 else round(random.uniform(50, 500), 2)

        return {
            'light_id': chosen_light,
            'location_name': place,
            'fault_type': kind,
            'timestamp': occurred_at,
            'severity_level': severity,
            'fault_status': status,
            'maintenance_cost': cost,
        }

    return [build_record() for _ in range(record_count)]

# Generate synthetic fault data
def generate_data(num_lights=1000, min_faults=100, max_faults=300,
                  start_date=datetime(2020, 1, 1),
                  end_date=datetime(2020, 12, 31),
                  save_csv=True,
                  filename='lighting_data.csv'):
    """Generate a synthetic fault dataset and return it as a dataframe.

    Args:
        num_lights: Number of light ids to create.
        min_faults: Lower bound on the number of fault records.
        max_faults: Upper bound on the number of fault records.
        start_date: Start of the timestamp window.
        end_date: End of the timestamp window. The default is midnight at the
            start of 31 Dec 2020.
        save_csv: When true, write the dataframe to `filename` and trigger a
            Colab download of that file.
        filename: Path of the CSV file written when save_csv is true.

    Returns:
        A pandas dataframe of fault records sorted by timestamp with a fresh
        0-based index. If no records were generated the dataframe is empty.
    """
    # Display how many records will be generated
    print(f"Generating between {min_faults} and {max_faults} fault records for {num_lights} lights.")

    # 1. Generate light ids and random locations for each light
    light_ids, light_locations = generate_light_ids(num_lights)

    # 2. Generate random fault records for the lights
    data = generate_fault_records(light_ids, light_locations, start_date, end_date, min_faults, max_faults)

    # 3. Convert the list of fault records into a pandas dataframe
    df = pd.DataFrame(data)
    # An empty record list yields a dataframe with no columns, so sorting by
    # 'timestamp' would raise KeyError; only sort when there is data.
    if not df.empty:
        df = df.sort_values(by='timestamp').reset_index(drop=True)

    # 4. Save to csv and download
    if save_csv:
        df.to_csv(filename, index=False)  # Save dataframe to csv
        files.download(filename)  # Automatically download the file in colab
    return df

# Run the generator with its default settings (with the defaults this also saves the CSV and triggers the Colab download)
df = generate_data()
df  # Last expression in the cell, so the notebook displays the dataframe

Generating between 100 and 300 fault records for 1000 lights.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,light_id,location_name,fault_type,timestamp,severity_level,fault_status,maintenance_cost
0,L0058,Canberra,Sensor-related,2020-01-03 19:51:36,High,In Progress,247.01
1,L0950,Outram Park,Sensor-related,2020-01-05 00:35:27,Low,Acknowledged,71.50
2,L0759,Orchard Road,Power-related,2020-01-05 13:21:17,High,Closed,112.90
3,L0498,City Hall,Control system,2020-01-07 05:01:16,High,In Progress,340.43
4,L0306,Canberra,Environmental,2020-01-08 10:28:21,Critical,Acknowledged,497.78
...,...,...,...,...,...,...,...
254,L0475,City Hall,Environmental,2020-12-28 13:39:44,Critical,In Progress,459.92
255,L0789,Chinatown,Communication,2020-12-28 13:48:13,Low,Detected,213.63
256,L0731,Pasir Ris,Power-related,2020-12-29 15:40:30,Informational,Detected,376.10
257,L0867,Pasir Ris,Communication,2020-12-30 08:33:29,Informational,Closed,352.90
