### Healthcare – Patient Data Accuracy

**Task 1**: Patient Record Accuracy Assessment

**Objective**: Achieve high accuracy in patient records.

**Steps**:
1. Examine a sample patient dataset for common inaccuracies.
2. Identify at least three common issues, such as medication errors or misdiagnoses.
3. Propose validation measures to ensure data accuracy at the point of entry.

In [1]:
# Write your code from here
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Simulate a sample patient dataset
np.random.seed(42)
num_patients = 500
start_date = datetime(1950, 1, 1)
end_date = datetime(2020, 1, 1)
dates_of_birth = [start_date + (end_date - start_date) * np.random.rand() for _ in range(num_patients)]
genders = np.random.choice(['Male', 'Female', 'Other'], num_patients, p=[0.45, 0.5, 0.05])
first_names_m = ['John', 'Robert', 'Michael', 'William', 'David']
last_names = ['Smith', 'Johnson', 'Williams', 'Brown', 'Jones']
first_names_f = ['Mary', 'Patricia', 'Linda', 'Barbara', 'Elizabeth']
first_names_o = ['Alex', 'Jamie', 'Riley', 'Jordan', 'Casey']

names = []
for i in range(num_patients):
    if genders[i] == 'Male':
        names.append(f"{np.random.choice(first_names_m)} {np.random.choice(last_names)}")
    elif genders[i] == 'Female':
        names.append(f"{np.random.choice(first_names_f)} {np.random.choice(last_names)}")
    else:
        names.append(f"{np.random.choice(first_names_o)} {np.random.choice(last_names)}")

patient_ids = [f"PID{i+1:05d}" for i in range(num_patients)]
addresses = [f"{np.random.randint(100, 999)} Main St, Anytown" for _ in range(num_patients)]
conditions = [np.random.choice(['Diabetes', 'Hypertension', 'Asthma', None], p=[0.1, 0.15, 0.08, 0.67]) for _ in range(num_patients)]
allergies = [np.random.choice(['Penicillin', 'Latex', 'Peanuts', None], p=[0.05, 0.03, 0.02, 0.9]) for _ in range(num_patients)]
medications = [np.random.choice(['Metformin', 'Lisinopril', 'Albuterol', None], p=[0.08, 0.12, 0.05, 0.75]) for _ in range(num_patients)]
diagnoses = [np.random.choice(['Type 2 Diabetes', 'Essential Hypertension', 'Mild Asthma', None], p=[0.09, 0.14, 0.07, 0.7]) for _ in range(num_patients)]

patient_data = pd.DataFrame({
    'Patient ID': patient_ids,
    'Name': names,
    'Date of Birth': dates_of_birth,
    'Gender': genders,
    'Address': addresses,
    'Medical Condition': conditions,
    'Allergies': allergies,
    'Medication': medications,
    'Diagnosis': diagnoses
})

# Introduce some common inaccuracies
# Typo in medication
patient_data.loc[50, 'Medication'] = 'Metforminn'
# Incorrect date of birth (future date)
patient_data.loc[100, 'Date of Birth'] = datetime(2026, 1, 1)
# Inconsistent gender
patient_data.loc[150, 'Gender'] = 'MALE'
# Missing diagnosis
patient_data.loc[200, 'Diagnosis'] = None
# Implausible age (e.g., born in the future) - already covered by future date
# Wrong allergy (e.g., listed as allergic to a medication they are prescribed)
patient_data.loc[250, ['Allergies', 'Medication']] = ['Penicillin', 'Penicillin']

print("Sample Patient Data with Inaccuracies:")
print(patient_data.head())

Sample Patient Data with Inaccuracies:
  Patient ID              Name              Date of Birth  Gender  \
0   PID00001  Barbara Williams 1976-03-20 20:48:47.684493  Female   
1   PID00002   Elizabeth Smith 2016-07-19 21:54:14.859273  Female   
2   PID00003       David Smith 2001-03-28 21:20:19.129246    Male   
3   PID00004    Patricia Smith 1991-11-27 21:38:06.616231  Female   
4   PID00005  Barbara Williams 1960-12-02 22:17:09.328569  Female   

                Address Medical Condition   Allergies  Medication  \
0  858 Main St, Anytown              None        None   Albuterol   
1  901 Main St, Anytown      Hypertension        None   Metformin   
2  875 Main St, Anytown            Asthma        None  Lisinopril   
3  825 Main St, Anytown              None  Penicillin  Lisinopril   
4  299 Main St, Anytown              None        None        None   

         Diagnosis  
0             None  
1             None  
2  Type 2 Diabetes  
3             None  
4             None  


**Task 2**: Implement Healthcare Data Quality Checks

**Objective**: Maintain accurate health records within a healthcare system.

**Steps**:
1. Develop a validation workflow for patient data.
2. Use appropriate software to automate checks for common errors.

In [2]:
# Write your code from here
import pandas as pd
from datetime import datetime
import numpy as np

# Re-create the sample patient dataset with inaccuracies (as in the previous response)
np.random.seed(42)
num_patients = 500
start_date = datetime(1950, 1, 1)
end_date = datetime(2020, 1, 1)
dates_of_birth = [start_date + (end_date - start_date) * np.random.rand() for _ in range(num_patients)]
genders = np.random.choice(['Male', 'Female', 'Other'], num_patients, p=[0.45, 0.5, 0.05])
first_names_m = ['John', 'Robert', 'Michael', 'William', 'David']
last_names = ['Smith', 'Johnson', 'Williams', 'Brown', 'Jones']
first_names_f = ['Mary', 'Patricia', 'Linda', 'Barbara', 'Elizabeth']
first_names_o = ['Alex', 'Jamie', 'Riley', 'Jordan', 'Casey']
names = []
for i in range(num_patients):
    if genders[i] == 'Male':
        names.append(f"{np.random.choice(first_names_m)} {np.random.choice(last_names)}")
    elif genders[i] == 'Female':
        names.append(f"{np.random.choice(first_names_f)} {np.random.choice(last_names)}")
    else:
        names.append(f"{np.random.choice(first_names_o)} {np.random.choice(last_names)}")
patient_ids = [f"PID{i+1:05d}" for i in range(num_patients)]
addresses = [f"{np.random.randint(100, 999)} Main St, Anytown" for _ in range(num_patients)]
conditions = [np.random.choice(['Diabetes', 'Hypertension', 'Asthma', None], p=[0.1, 0.15, 0.08, 0.67]) for _ in range(num_patients)]
allergies = [np.random.choice(['Penicillin', 'Latex', 'Peanuts', None], p=[0.05, 0.03, 0.02, 0.9]) for _ in range(num_patients)]
medications = [np.random.choice(['Metformin', 'Lisinopril', 'Albuterol', None], p=[0.08, 0.12, 0.05, 0.75]) for _ in range(num_patients)]
diagnoses = [np.random.choice(['Type 2 Diabetes', 'Essential Hypertension', 'Mild Asthma', None], p=[0.09, 0.14, 0.07, 0.7]) for _ in range(num_patients)]
patient_data = pd.DataFrame({
    'Patient ID': patient_ids,
    'Name': names,
    'Date of Birth': dates_of_birth,
    'Gender': genders,
    'Address': addresses,
    'Medical Condition': conditions,
    'Allergies': allergies,
    'Medication': medications,
    'Diagnosis': diagnoses
})
patient_data.loc[50, 'Medication'] = 'Metforminn'
patient_data.loc[100, 'Date of Birth'] = datetime(2026, 1, 1)
patient_data.loc[150, 'Gender'] = 'MALE'
patient_data.loc[200, 'Diagnosis'] = None
patient_data.loc[250, ['Allergies', 'Medication']] = ['Penicillin', 'Penicillin']

def check_medication_typos(df, column='Medication'):
    """Identifies potential medication typos (simple example)."""
    known_medications = ['Metformin', 'Lisinopril', 'Albuterol'] # Add more known medications
    typos = df[~df[column].isin(known_medications) & df[column].notna()]
    return typos

def check_future_dates(df, column='Date of Birth'):
    """Identifies records with future dates."""
    now = datetime.now()
    future_dates = df[pd.to_datetime(df[column]) > now]
    return future_dates

def check_gender_consistency(df, column='Gender'):
    """Standardizes gender entries and identifies inconsistencies."""
    df[column] = df[column].str.lower().str.strip()
    inconsistent = df[~df[column].isin(['male', 'female', 'other', None])]
    return inconsistent

def check_missing_values(df, columns):
    """Identifies records with missing values in specified columns."""
    missing = df[df[columns].isnull().any(axis=1)]
    return missing

def check_allergy_medication_conflict(df, allergy_col='Allergies', medication_col='Medication'):
    """Flags patients allergic to their prescribed medication (simple check)."""
    conflicts = df[df[allergy_col] == df[medication_col] & df[allergy_col].notna() & df[medication_col].notna()]
    return conflicts

# Apply the validation checks
medication_typos = check_medication_typos(patient_data)
future_birth_dates = check_future_dates(patient_data)
inconsistent_genders = check_gender_consistency(patient_data)
missing_diagnosis = check_missing_values(patient_data, columns=['Diagnosis'])
allergy_medication_conflicts = check_allergy_medication_conflict(patient_data)

print("\nPotential Medication Typoos:")
print(medication_typos[['Patient ID', 'Medication']])

print("\nPatients with Future Birth Dates:")
print(future_birth_dates[['Patient ID', 'Date of Birth']])

print("\nInconsistent Gender Entries:")
print(inconsistent_genders[['Patient ID', 'Gender']])

print("\nPatients with Missing Diagnosis:")
print(missing_diagnosis[['Patient ID', 'Diagnosis']])

print("\nPatients with Allergy-Medication Conflicts:")
print(allergy_medication_conflicts[['Patient ID', 'Allergies', 'Medication']])

# Simulate Point-of-Entry Validation for a new patient record
def validate_new_patient(patient_record):
    errors = []
    if pd.isna(patient_record['Name']):
        errors.append("Name is missing.")
    if pd.isna(patient_record['Date of Birth']) or pd.to_datetime(patient_record['Date of Birth']) > datetime.now():
        errors.append("Date of Birth is invalid.")
    if patient_record['Gender'].lower().strip() not in ['male', 'female', 'other']:
        errors.append("Gender is invalid.")
    # Add more validation rules here (e.g., format checks, mandatory fields)
    return errors

new_patient = {
    'Patient ID': 'PID00501',
    'Name': 'Jane Doe',
    'Date of Birth': '2027-01-01',
    'Gender': 'FEMALE ',
    'Address': '123 Oak St',
    'Medical Condition': None,
    'Allergies': 'None',
    'Medication': 'Aspirin',
    'Diagnosis': 'Headache'
}

validation_errors = validate_new_patient(new_patient)
if validation_errors:
    print("\nValidation Errors for New Patient:")
    for error in validation_errors:
        print(f"- {error}")
else:
    print("\nNew Patient Record passed initial validation.")

TypeError: unsupported operand type(s) for &: 'str' and 'bool'