# Data Cleaning and Labeling for Safety Observations

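This notebook cleans the raw safety-observation data (dropping duplicate, empty, and 'Proposed Improvement' entries) and labels each remaining entry as 'Safe' or 'Unsafe' based on its type of reporting. Entries whose type of reporting is not recognized receive an empty label.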

In [None]:
import pandas as pd

# Load the 'ACTIVE Cards' sheet from the raw workbook
file_path = "Observations_Raw.xlsx"
excel_data = pd.ExcelFile(file_path)
df = excel_data.parse('ACTIVE Cards')

# Clean column names: collapse every run of whitespace (spaces, tabs, line
# breaks) into a single space so multi-line Excel headers match the names
# selected below. str() guards against non-string headers.
df.columns = [' '.join(str(col).split()) for col in df.columns]

# Keep only necessary columns
df = df[['Type of reporting', 'Observation']]

# Remove 'Proposed Improvement'
df = df[df['Type of reporting'] != 'Proposed Improvement']

# Remove null or empty observations. astype(str) keeps the whitespace check
# working even when the column is not string-typed (e.g. entirely NaN);
# notna() still excludes the missing values themselves.
observations = df['Observation']
has_text = observations.notna() & (observations.astype(str).str.strip() != '')
df = df[has_text]

# Drop duplicate observations last: deduplicating before the filters above
# could keep a 'Proposed Improvement' copy of an observation and discard the
# valid copy, losing the observation entirely once that row was filtered out.
df = df.drop_duplicates(subset='Observation')
# Add Label column
def assign_label(x):
    """Map a 'Type of reporting' value to its safety label.

    Returns "Unsafe" for negative observations and unsafe acts, "Safe" for
    positive observations, and an empty string for any other value.
    Matching is exact (case- and whitespace-sensitive).
    """
    unsafe_types = ("Re ACTIVE (Negative Observation)", "Unsafe Act")
    if any(x == kind for kind in unsafe_types):
        return "Unsafe"
    if x == "Pro ACTIVE (Positive Observation)":
        return "Safe"
    return ""

# Derive the Safe/Unsafe label from each row's reporting type, then
# renumber the rows from zero
labels = df['Type of reporting'].apply(assign_label)
df = df.assign(Label=labels).reset_index(drop=True)

# Report the cleaned row count and preview the first rows
row_count = df.shape[0]
print(f"Total rows after cleaning: {row_count}")
df.head()


In [None]:
# Show the last rows of df to inspect the end of the table
df.tail()