# Generate Cleaned Dataset

This notebook extracts the essential columns from `final_data.csv`, renames four of them for clarity, and saves the result as `final_data_cleaned.csv` for analysis.

**Columns Included:**
- Patient ID
- Visit ID
- Triage End Timestamp
- Doctor Seen Timestamp
- Exit Timestamp
- Doctors On Duty
- Nurses On Duty
- Specialists On Call
- Shift Type
- Triage Level
- Disposition

## Import Required Libraries

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

## Load the Final Data

In [None]:
# Define file paths.
# The data directory can be overridden with the DATATHON_DATA_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original location is used and both paths are unchanged.
DATA_DIR = os.environ.get('DATATHON_DATA_DIR', '/Users/mukeshravichandran/Datathon')
input_file = os.path.join(DATA_DIR, 'final_data.csv')
output_file = os.path.join(DATA_DIR, 'final_data_cleaned.csv')

# Read the final_data.csv into the raw frame used by the later cells.
df = pd.read_csv(input_file)

# Show the raw shape and column names so the selection step can be checked
# against what the file actually contains.
print(f"Original dataset shape: {df.shape}")
print(f"\nColumn names in original data:")
print(df.columns.tolist())

## Select and Clean Columns

In [None]:
# Source columns that receive a more descriptive name in the cleaned output.
column_renames = {
    'Triage End': 'Triage End Timestamp',
    'Doctor Seen': 'Doctor Seen Timestamp',
    'Exit Time': 'Exit Timestamp',
    'Shift': 'Shift Type',
}

# Columns carried over from the raw frame, listed in output order.
columns_needed = [
    'Patient ID', 'Visit ID',
    'Triage End', 'Doctor Seen', 'Exit Time',
    'Doctors On Duty', 'Nurses On Duty', 'Specialists On Call',
    'Shift', 'Triage Level', 'Disposition',
]

# Take an independent copy of the selected columns, then apply the renames.
cleaned_df = df[columns_needed].copy().rename(columns=column_renames)

# Report the resulting shape and the final column names.
cleaned_shape = cleaned_df.shape
cleaned_columns = cleaned_df.columns.tolist()
print(f"Cleaned dataset shape: {cleaned_shape}")
print(f"\nNew column names:")
print(cleaned_columns)

## Data Quality Check

In [None]:
# Quality report for cleaned_df: row preview, dtypes, null counts, statistics.
section_break = "\n" + "="*80

print("First 10 rows of cleaned data:")
print(cleaned_df.head(10))

# Each report is built lazily, right before it is printed, so the sections
# are computed and emitted strictly in order.
quality_reports = [
    ("Data Types:", lambda: cleaned_df.dtypes),
    ("Missing Values:", lambda: cleaned_df.isnull().sum()),
    ("Summary Statistics:", lambda: cleaned_df.describe()),
]
for report_title, build_report in quality_reports:
    print(section_break)
    print(report_title)
    print(build_report())

## Verify Unique Values

In [None]:
# Inspect the distinct values of the categorical columns.
# Note: `unique()` keeps NaN while `nunique()` skips it by default, so an
# unsorted list below may show one more entry (nan) than its reported count.
print("Unique Shift Types:")
print(cleaned_df['Shift Type'].unique())
print(f"Count: {cleaned_df['Shift Type'].nunique()}\n")

# Drop missing values before sorting: NaN cannot be ordered against string
# labels (TypeError) and makes the order of numeric labels unreliable.
# This also keeps the printed list consistent with the count underneath.
print("Unique Triage Levels:")
print(sorted(cleaned_df['Triage Level'].dropna().unique()))
print(f"Count: {cleaned_df['Triage Level'].nunique()}\n")

print("Unique Dispositions:")
print(cleaned_df['Disposition'].unique())
print(f"Count: {cleaned_df['Disposition'].nunique()}")

## Save Cleaned Dataset

In [None]:
# Persist the cleaned frame as CSV; the row index is not written.
cleaned_df.to_csv(output_file, index=False)

# Confirm the destination and report the dimensions of what was written.
row_total = len(cleaned_df)
column_total = len(cleaned_df.columns)
print(f"✅ Cleaned dataset saved to: {output_file}")
print(f"\nTotal rows: {row_total:,}")
print(f"Total columns: {column_total}")

# Size on disk of the file just written, converted from bytes to MB.
size_in_mb = Path(output_file).stat().st_size / (1024**2)
print(f"\nFile size: {size_in_mb:.2f} MB")

## Summary

In [None]:
# Closing recap: dataset dimensions, output column order, destination file.
rule = "="*80
row_total = len(cleaned_df)
column_names = list(cleaned_df.columns)

print("\n" + rule)
print("CLEANED DATASET SUMMARY")
print(rule)
print(f"\n📊 Dataset Info:")
print(f"   Rows: {row_total:,}")
print(f"   Columns: {len(column_names)}")
print(f"\n📋 Columns Included:")
# Number the columns from 1 in the order they appear in the frame.
for position, column_name in enumerate(column_names, start=1):
    print(f"   {position}. {column_name}")
print(f"\n✅ Output file: {output_file}")
print(rule)