<a href="https://colab.research.google.com/github/richardogoma/health-clinic-data-summary-richardogoma/blob/main/Patient_Vitals_Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pathlib import Path

import pandas as pd

# Load the blood-pressure dataset and print its structural summary
# (column names, dtypes, non-null counts).
# NOTE(review): the original notes mention /content/data/BP.csv while the code
# reads /data/BP.csv — confirm which location is correct for this environment.
print("Sample records from /data/BP.csv:")
bp_df = pd.read_csv('/data/BP.csv')
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
bp_df.info()
print("\n")

# Load the vital-signs dataset and print the same structural summary.
# NOTE(review): same /content/data vs /data question as above — confirm.
print("Sample records from /data/five vital signs of normal people.csv:")
vital_signs_df = pd.read_csv('/data/five vital signs of normal people.csv')
vital_signs_df.info()

In [None]:
# Inner-join the two frames (bp_df['SN'] against vital_signs_df['Subject ID'])
# and, in the same chain, give the columns of interest short snake_case names.
merged_df = bp_df.merge(
    vital_signs_df,
    how='inner',
    left_on='SN',
    right_on='Subject ID',
).rename(
    columns={
        'Subject ID': 'patient_id',
        'Age': 'age',
        'Temperature reading 1': 'temperature',
        'Heart rate reading 1': 'heart_rate',
        'Systolic blood pressure reading 1': 'systolic_bp',
        'Oxygen saturation reading 1': 'oxygen_saturation',
    }
)

# Preview the joined, renamed frame.
print("Combined and renamed DataFrame sample records:")
print(merged_df.head())

In [None]:
# Narrow the merged frame to the join key, the patient identifier and the
# renamed vital-sign columns.
selected_columns = [
    'SN',
    'patient_id',
    'age',
    'temperature',
    'heart_rate',
    'systolic_bp',
    'oxygen_saturation',
]
final_df = merged_df[selected_columns]

# Preview the narrowed frame (heading and table emitted by a single print call).
print("DataFrame with only selected columns:", final_df.head(), sep="\n")

In [None]:
# Inspect dtypes and missing values before any type conversion.
print("Current data types of final_df:")
# DataFrame.info() prints its report itself and returns None; calling it
# directly avoids the stray "None" line that print(final_df.info()) produces.
final_df.info()

# Per-column count of missing values for the identifier and vital-sign columns.
print("\nNull counts for relevant columns in final_df:")
print(final_df[['patient_id', 'heart_rate', 'systolic_bp', 'oxygen_saturation']].isnull().sum())

In [None]:
# Restore integer dtypes for columns that hold whole numbers but are currently
# stored as float64.
# Per the original notebook notes, 'patient_id', 'heart_rate', 'systolic_bp' and
# 'oxygen_saturation' were int64 in the raw files, whereas 'temperature' was
# float64 and is therefore intentionally left out of the conversion.

# Work on an independent copy so the assignments below cannot trigger a
# SettingWithCopyWarning on a frame derived from merged_df.
final_df_copy = final_df.copy()

for col in ['patient_id', 'heart_rate', 'systolic_bp', 'oxygen_saturation']:
    column = final_df_copy[col]
    if column.dtype != 'float64':
        continue  # not a float64 column: nothing to convert

    # Convert only when every non-missing value is a whole number. Fractional
    # (or infinite) readings stay as floats instead of raising during the cast.
    present = column.dropna()
    if not (present % 1 == 0).all():
        continue

    # Columns with missing values need the nullable 'Int64' extension dtype
    # (capital 'I'); fully populated columns can use plain NumPy int64.
    target_dtype = 'Int64' if column.isnull().any() else 'int64'

    # Replace the column via df[col] = ... so it adopts the new dtype.
    # df.loc[:, col] = ... writes into the existing float64 column in place on
    # recent pandas versions, which silently discards the integer conversion.
    final_df_copy[col] = column.astype(target_dtype)

final_df = final_df_copy  # Rebind final_df to the converted copy

print("\nData types after conversion:")
# info() prints its own report and returns None, so it is not wrapped in print().
final_df.info()
print("\nUpdated DataFrame with original data types retained (or nullable integers):")
print(final_df.head())

In [None]:
# Remove the 'SN' join key from the frame.
# errors='ignore' keeps this cell idempotent: final_df is rebound here, so
# re-running the cell after 'SN' is already gone would otherwise raise KeyError.
final_df = final_df.drop(columns=['SN'], errors='ignore')

print("DataFrame after dropping 'SN' column:")
print(final_df.head())

In [None]:
# Write the cleaned vitals table to CSV without the DataFrame index.
OUTPUT_CSV = '/content/data/patient_vitals.csv'

# to_csv does not create missing directories; make sure the target folder
# exists so the export does not fail with an OSError on a fresh runtime.
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)

final_df.to_csv(OUTPUT_CSV, index=False)
print(f"Data exported successfully to {OUTPUT_CSV}")