# **Smart Lighting System:** Data Cleaning

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from google.colab import files

In [2]:
# Read data file
df = pd.read_csv('lighting_data.csv')

# Display data
df

Unnamed: 0,light_id,location_name,fault_type,timestamp,severity_level,fault_status,maintenance_cost
0,L0226,Pasir Ris,Control system,2020-01-01 01:22:25,High,Acknowledged,343.60
1,L0061,City Hall,Control system,2020-01-01 23:45:18,Low,In Progress,
2,L0455,Pasir Ris,Power-related,2020-01-02 01:33:02,Informational,Resolved,169.20
3,L0924,City Hall,Cybersecurity,2020-01-02 03:34:48,Informational,Resolved,372.45
4,L0134,Canberra,Communication,2020-01-02 08:09:16,Low,In Progress,335.57
...,...,...,...,...,...,...,...
461,L0835,Orchard Road,Sensor-related,2020-12-24 23:46:51,Medium,Acknowledged,482.02
462,L0452,Jurong West,Environmental,2020-12-25 02:25:42,Informational,Detected,277.74
463,L0483,Bishan,Sensor-related,2020-12-25 02:34:36,Critical,In Progress,198.08
464,L0683,Canberra,Control system,2020-12-29 20:00:19,Critical,In Progress,236.10


In [3]:
# Dataset(rows, columns)
df.shape

(466, 7)

In [4]:
# Display first five data
df.head()

Unnamed: 0,light_id,location_name,fault_type,timestamp,severity_level,fault_status,maintenance_cost
0,L0226,Pasir Ris,Control system,2020-01-01 01:22:25,High,Acknowledged,343.6
1,L0061,City Hall,Control system,2020-01-01 23:45:18,Low,In Progress,
2,L0455,Pasir Ris,Power-related,2020-01-02 01:33:02,Informational,Resolved,169.2
3,L0924,City Hall,Cybersecurity,2020-01-02 03:34:48,Informational,Resolved,372.45
4,L0134,Canberra,Communication,2020-01-02 08:09:16,Low,In Progress,335.57


In [5]:
# Display last five data
df.tail()

Unnamed: 0,light_id,location_name,fault_type,timestamp,severity_level,fault_status,maintenance_cost
461,L0835,Orchard Road,Sensor-related,2020-12-24 23:46:51,Medium,Acknowledged,482.02
462,L0452,Jurong West,Environmental,2020-12-25 02:25:42,Informational,Detected,277.74
463,L0483,Bishan,Sensor-related,2020-12-25 02:34:36,Critical,In Progress,198.08
464,L0683,Canberra,Control system,2020-12-29 20:00:19,Critical,In Progress,236.1
465,L0934,Bugis Junction,Cybersecurity,2020-12-29 20:44:12,Informational,Detected,399.61


In [6]:
# Check data types & basic info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466 entries, 0 to 465
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   light_id          466 non-null    object 
 1   location_name     466 non-null    object 
 2   fault_type        466 non-null    object 
 3   timestamp         466 non-null    object 
 4   severity_level    466 non-null    object 
 5   fault_status      466 non-null    object 
 6   maintenance_cost  414 non-null    float64
dtypes: float64(1), object(6)
memory usage: 25.6+ KB


In [7]:
# Find missing values
missing_count = df.isnull().sum()
missing_count

Unnamed: 0,0
light_id,0
location_name,0
fault_type,0
timestamp,0
severity_level,0
fault_status,0
maintenance_cost,52


The maintenance_cost column has 52 missing values.

In [8]:
# Find percentage of missing values
missing_count = df.isna().sum()
total_rows = len(df)
missing_proportion = (missing_count / total_rows) * 100

# Sort descending
missing_proportion = missing_proportion.round(2).sort_values(ascending=False)
print(missing_proportion)

maintenance_cost    11.16
location_name        0.00
light_id             0.00
fault_type           0.00
timestamp            0.00
severity_level       0.00
fault_status         0.00
dtype: float64


Since maintenance cost is a numeric column and may contain outliers, I will apply the median to fill its missing values.

In [9]:
# Find median of 'maintenance_cost' column
median_value = df['maintenance_cost'].median()
median_rounded = round(median_value, 2)
print("Median value:", median_rounded)

Median value: 271.8


In [10]:
# Fill missing values with median
df['maintenance_cost'].fillna(median_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['maintenance_cost'].fillna(median_value, inplace=True)


In [11]:
# Check duplicate rows
num_duplicates = df.duplicated().sum()
print(f"Duplicate rows: {num_duplicates}")

Duplicate rows: 0


In [12]:
# Verify filling to ensure no null values
print(df['maintenance_cost'].isnull().sum())

0


In [13]:
# Convert timestamps to pandas datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Sort by timestamp
df = df.sort_values('timestamp').reset_index(drop=True)

In [14]:
# Verify data type
print(df['timestamp'].dtype)

datetime64[ns]


In [15]:
# Verify sort
is_sorted = df['timestamp'].equals(df['timestamp'].sort_values())
print("Is sorted:", is_sorted)

Is sorted: True


In [16]:
# Standardise categorical values
df['fault_type'] = df['fault_type'].str.strip().str.title()
df['severity_level'] = df['severity_level'].str.strip().str.title()
df['fault_status'] = df['fault_status'].str.strip().str.title()
df['location_name'] = df['location_name'].str.strip().str.title()

In [17]:
# Extract time features
df['year'] = df['timestamp'].dt.year
df['month'] = df['timestamp'].dt.month
df['day'] = df['timestamp'].dt.day
df['day_of_week'] = df['timestamp'].dt.day_name()
df['hour'] = df['timestamp'].dt.hour

In [18]:
filename='lighting_data_cleaned.csv'

# Save to csv and download
df.to_csv(filename, index=False) # Save csv
files.download(filename) # Automatically download the file in colab

# Read final dataset
df

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,light_id,location_name,fault_type,timestamp,severity_level,fault_status,maintenance_cost,year,month,day,day_of_week,hour
0,L0226,Pasir Ris,Control System,2020-01-01 01:22:25,High,Acknowledged,343.60,2020,1,1,Wednesday,1
1,L0061,City Hall,Control System,2020-01-01 23:45:18,Low,In Progress,271.80,2020,1,1,Wednesday,23
2,L0455,Pasir Ris,Power-Related,2020-01-02 01:33:02,Informational,Resolved,169.20,2020,1,2,Thursday,1
3,L0924,City Hall,Cybersecurity,2020-01-02 03:34:48,Informational,Resolved,372.45,2020,1,2,Thursday,3
4,L0134,Canberra,Communication,2020-01-02 08:09:16,Low,In Progress,335.57,2020,1,2,Thursday,8
...,...,...,...,...,...,...,...,...,...,...,...,...
461,L0835,Orchard Road,Sensor-Related,2020-12-24 23:46:51,Medium,Acknowledged,482.02,2020,12,24,Thursday,23
462,L0452,Jurong West,Environmental,2020-12-25 02:25:42,Informational,Detected,277.74,2020,12,25,Friday,2
463,L0483,Bishan,Sensor-Related,2020-12-25 02:34:36,Critical,In Progress,198.08,2020,12,25,Friday,2
464,L0683,Canberra,Control System,2020-12-29 20:00:19,Critical,In Progress,236.10,2020,12,29,Tuesday,20
