# Data Extraction - Part 1

In [34]:
# Imports

import numpy as np
import pandas as pd
from datetime import datetime

In [35]:
# Load Dataset

df = pd.read_csv('https://raw.githubusercontent.com/renatomaaliw3/public_files/refs/heads/master/Data%20Sets/healthcare_dataset.csv')
df.head(3)

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,31 01 2024,Matthew Smith,Sons and Miller,Blue Cross,18856.28131,328,Urgent,2 02 2024,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,20 08 2019,Samantha Davies,Kim Inc,Medicare,33643.32729,265,Emergency,26 08 2019,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,22 09 2022,Tiffany Mitchell,Cook PLC,Aetna,27955.09608,205,Emergency,7 10 2022,Aspirin,Normal


In [36]:
# Let us fixed first inconsistencies in our data

df['Name'] = df['Name'].str.lower()
df

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,bobby jackson,30,Male,B-,Cancer,31 01 2024,Matthew Smith,Sons and Miller,Blue Cross,18856.281310,328,Urgent,2 02 2024,Paracetamol,Normal
1,leslie terry,62,Male,A+,Obesity,20 08 2019,Samantha Davies,Kim Inc,Medicare,33643.327290,265,Emergency,26 08 2019,Ibuprofen,Inconclusive
2,danny smith,76,Female,A-,Obesity,22 09 2022,Tiffany Mitchell,Cook PLC,Aetna,27955.096080,205,Emergency,7 10 2022,Aspirin,Normal
3,andrew watts,28,Female,O+,Diabetes,18 11 2020,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.782410,450,Elective,18 12 2020,Ibuprofen,Abnormal
4,adrienne bell,43,Female,AB+,Cancer,19 09 2022,Kathleen Hanna,White-White,Aetna,14238.317810,458,Urgent,9 10 2022,Penicillin,Abnormal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55495,elizabeth jackson,42,Female,O+,Asthma,16 08 2020,Joshua Jarvis,Jones-Thompson,Blue Cross,2650.714952,417,Elective,15 09 2020,Penicillin,Abnormal
55496,kyle perez,61,Female,AB-,Obesity,23 01 2020,Taylor Sullivan,Tucker-Moyer,Cigna,31457.797310,316,Elective,1 02 2020,Aspirin,Normal
55497,heather wang,38,Female,B+,Hypertension,13 07 2020,Joe Jacobs DVM,"and Mahoney Johnson Vasquez,",UnitedHealthcare,27620.764720,347,Urgent,10 08 2020,Ibuprofen,Abnormal
55498,jennifer jones,43,Male,O-,Arthritis,25 05 2019,Kimberly Curry,"Jackson Todd and Castro,",Medicare,32451.092360,321,Elective,31 05 2019,Ibuprofen,Abnormal


In [37]:
# Let us fixed first the Dates

df['Date of Admission'] = pd.to_datetime(df['Date of Admission'], errors='coerce', format = "%d %m %Y") # convert to proper format
df['Discharge Date'] = pd.to_datetime(df['Discharge Date'], errors='coerce', format = "%d %m %Y") # convert to proper format
df

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,bobby jackson,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281310,328,Urgent,2024-02-02,Paracetamol,Normal
1,leslie terry,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327290,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,danny smith,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096080,205,Emergency,2022-10-07,Aspirin,Normal
3,andrew watts,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.782410,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrienne bell,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317810,458,Urgent,2022-10-09,Penicillin,Abnormal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55495,elizabeth jackson,42,Female,O+,Asthma,2020-08-16,Joshua Jarvis,Jones-Thompson,Blue Cross,2650.714952,417,Elective,2020-09-15,Penicillin,Abnormal
55496,kyle perez,61,Female,AB-,Obesity,2020-01-23,Taylor Sullivan,Tucker-Moyer,Cigna,31457.797310,316,Elective,2020-02-01,Aspirin,Normal
55497,heather wang,38,Female,B+,Hypertension,2020-07-13,Joe Jacobs DVM,"and Mahoney Johnson Vasquez,",UnitedHealthcare,27620.764720,347,Urgent,2020-08-10,Ibuprofen,Abnormal
55498,jennifer jones,43,Male,O-,Arthritis,2019-05-25,Kimberly Curry,"Jackson Todd and Castro,",Medicare,32451.092360,321,Elective,2019-05-31,Ibuprofen,Abnormal


In [38]:
# Let us add a column on Days Spent in the Hospital

df['Days Spent'] = (df['Discharge Date']) - (df['Date of Admission'])
df['Days Spent'] = df['Days Spent'].astype(str) # Convert Days Spent to string

df['Days Spent (X)'] = df['Days Spent'].str.split().str[0].astype(int) # Split e.g. 2 Days, retain 2, convert to integer
df = df.drop('Days Spent', axis = 1)

df.rename(columns = {'Days Spent (X)': 'Days Spent'}, inplace = True) # rename column
df

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results,Days Spent
0,bobby jackson,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281310,328,Urgent,2024-02-02,Paracetamol,Normal,2
1,leslie terry,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327290,265,Emergency,2019-08-26,Ibuprofen,Inconclusive,6
2,danny smith,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096080,205,Emergency,2022-10-07,Aspirin,Normal,15
3,andrew watts,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.782410,450,Elective,2020-12-18,Ibuprofen,Abnormal,30
4,adrienne bell,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317810,458,Urgent,2022-10-09,Penicillin,Abnormal,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55495,elizabeth jackson,42,Female,O+,Asthma,2020-08-16,Joshua Jarvis,Jones-Thompson,Blue Cross,2650.714952,417,Elective,2020-09-15,Penicillin,Abnormal,30
55496,kyle perez,61,Female,AB-,Obesity,2020-01-23,Taylor Sullivan,Tucker-Moyer,Cigna,31457.797310,316,Elective,2020-02-01,Aspirin,Normal,9
55497,heather wang,38,Female,B+,Hypertension,2020-07-13,Joe Jacobs DVM,"and Mahoney Johnson Vasquez,",UnitedHealthcare,27620.764720,347,Urgent,2020-08-10,Ibuprofen,Abnormal,28
55498,jennifer jones,43,Male,O-,Arthritis,2019-05-25,Kimberly Curry,"Jackson Todd and Castro,",Medicare,32451.092360,321,Elective,2019-05-31,Ibuprofen,Abnormal,6


In [39]:
# Check info or check for missing data

missing = df.isnull().sum().rename('Missing')
missing
# missing.to_csv('missing values.csv', header = True)

Unnamed: 0,Missing
Name,0
Age,0
Gender,0
Blood Type,0
Medical Condition,0
Date of Admission,0
Doctor,0
Hospital,0
Insurance Provider,0
Billing Amount,0


# Extract the following information - Part 1

In [40]:
# 1. How many Insurance Provider are there in the dataset

df['Insurance Provider'].nunique()

5

In [41]:
# 2. Which Medical Condition is the most prevalent?

df['Medical Condition'].value_counts()
# df.groupby('Medical Condition').count()

Unnamed: 0_level_0,count
Medical Condition,Unnamed: 1_level_1
Arthritis,9308
Diabetes,9304
Hypertension,9245
Obesity,9231
Cancer,9227
Asthma,9185


In [42]:
# 3. What is the median value of Billing Amount?

# df['Billing Amount'].median()
df['Billing Amount'].describe().loc['50%']

25538.06938

In [43]:
# 4. What room number is used least for emergency?

df[df['Admission Type'] == 'Emergency']['Room Number'].value_counts().sort_values()

Unnamed: 0_level_0,count
Room Number,Unnamed: 1_level_1
189,24
101,27
398,29
460,29
259,29
...,...
393,63
420,64
287,64
486,67


In [44]:
# 05. Give me the number of Male and Female Cancer Patients.

df.query("`Medical Condition` == 'Cancer'").value_counts('Gender')
# df[df['Medical Condition'] == 'Cancer'].value_counts('Gender')

Unnamed: 0_level_0,count
Gender,Unnamed: 1_level_1
Male,4625
Female,4602


In [45]:
# 05. Give me the number of Male Obesity Patients for
# each Insurance Provider

df.query("(Gender == 'Male') and (`Medical Condition` == 'Obesity')").value_counts('Insurance Provider')

Unnamed: 0_level_0,count
Insurance Provider,Unnamed: 1_level_1
Blue Cross,953
Medicare,946
Aetna,916
UnitedHealthcare,910
Cigna,884


In [46]:
# 06. Give me the average billing amount for Cancer Patients,
# Female, Blood Type of A+ with Inconclusive results

x = df.query("(`Medical Condition` == 'Cancer') and (Gender == 'Female')")
y = x.query("(`Blood Type` == 'A+') and (`Test Results` == 'Inconclusive')")
y['Billing Amount'].mean()

26013.69686546455

## Extract the following information - Part 2

In [47]:
# 01. What Hospital/s receives the most 'Urgent' Admission Type

# Long method

# urgent_admissions = df[df['Admission Type'] == 'Urgent']
# urgent_admissions_by_hospital = urgent_admissions['Hospital'].value_counts()
# urgent_admissions_by_hospital.head(2)

# Alternative

df[df['Admission Type'] == 'Urgent'].value_counts('Hospital').head(3)

Unnamed: 0_level_0,count
Hospital,Unnamed: 1_level_1
Inc Smith,16
Ltd Smith,16
LLC Smith,15


In [48]:
# 02. How many 'Male' receives Medication of
# 'Aspirin' from Insurance Provider 'Cigna'

df.query("(Gender == 'Male') and (Medication == 'Aspirin') and (`Insurance Provider` == 'Cigna')").shape[0]

1144

In [49]:
# 03. How many 'Female' patients Age 50 to 55,
# admitted for "Emergency" received "Paracetamol" from the with blood type of A+

df.query("(Gender == 'Female') and (Age >= 50 and Age <= 55) and (`Admission Type` == 'Emergency') and (Medication == 'Paracetamol') and (`Blood Type`) == 'A+'").shape[0]

28

In [50]:
# 04. Group the patients by their hospital and blood type.
# For each group, find the average billing amount greater than or equal to 52000
# Arrange Billing Amount in descending order, show only Billing Amount column

x = df.groupby(['Hospital', 'Blood Type']).mean('Billing Amount').query("`Billing Amount` >= 52000")
x.sort_values('Billing Amount', ascending = False)['Billing Amount']

Unnamed: 0_level_0,Unnamed: 1_level_0,Billing Amount
Hospital,Blood Type,Unnamed: 2_level_1
Hernandez-Morton,AB+,52373.03237
Sons and Bailey,AB-,52271.66375
PLC Garner,B+,52181.83779
Walker-Garcia,A-,52170.03685
Ruiz-Anthony,AB+,52154.23772
George-Gonzalez,B+,52102.24089
Rocha-Carter,B-,52092.6699
"Briggs Walker Martinez, and",O-,52024.72644


In [51]:
# 05. Show me Doctors that has the name 'Smith', ignore cases, avoid duplicates
# arrange in Doctor Name in ascending order

x = df[df['Doctor'].str.contains('Smith', case = False)]
unique_smith_doctors = x['Doctor'].unique()

pd.DataFrame(unique_smith_doctors, columns = ['Doctor Name']).sort_values('Doctor Name')

Unnamed: 0,Doctor Name
272,Aaron Smith
203,Adam Smith
89,Adrian Smith
52,Albert Smith DVM
228,Alexander Smith
...,...
298,Whitney Smith
182,William Smith
67,Willie Smith
209,Yvette Smith


In [52]:
# 06. Show me Doctors that the last name of 'Smith', ignore cases, avoid duplicates
# arrange in Doctor Name in ascending order

# pd.set_option('display.max_rows', None) # Show all rows

x = df[df['Doctor'].str.lower().str.endswith('smith')]
unique_smith_doctors = x['Doctor'].unique()

pd.DataFrame(unique_smith_doctors, columns = ['Doctor Name']).sort_values('Doctor Name')

Unnamed: 0,Doctor Name
259,Aaron Smith
195,Adam Smith
86,Adrian Smith
219,Alexander Smith
239,Alexandra Smith
...,...
284,Whitney Smith
174,William Smith
66,Willie Smith
201,Yvette Smith


In [53]:
# 07. Can you tell me what is the name of the patient that
# spend the most of days in any given hospital?

x = df.query("`Medical Condition` == 'Cancer'")
x.sort_values('Days Spent', ascending = False)[['Name','Date of Admission',
                                                'Discharge Date', 'Days Spent', 'Medical Condition']].head(5)

Unnamed: 0,Name,Date of Admission,Discharge Date,Days Spent,Medical Condition
9,christopher berg,2021-04-23,2021-06-22,60,Cancer
39113,mr. antonio davis dvm,2021-07-12,2021-08-11,30,Cancer
19628,jerry tucker,2019-10-12,2019-11-11,30,Cancer
19886,james davis,2022-10-29,2022-11-28,30,Cancer
1614,james white,2020-06-02,2020-07-02,30,Cancer


In [69]:
# 08. What is the total billing amount of patients admitted in Emergency due to Obesity that are Male

x = df.groupby(['Admission Type', 'Medical Condition', 'Gender']).sum('Total Billing')
x.query("(`Admission Type` == 'Emergency') and (`Medical Condition` == 'Obesity')")['Billing Amount']

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Billing Amount
Admission Type,Medical Condition,Gender,Unnamed: 3_level_1
Emergency,Obesity,Female,41112420.0
Emergency,Obesity,Male,39584080.0


In [55]:
# 09. What is the total billing amount of patients admitted in Emergency due to Obesity that are Male, with Aetna as Insurance Provider

x = df.groupby(['Admission Type', 'Medical Condition', 'Gender', 'Insurance Provider']).sum('Total Billing')
y = x.query("(`Admission Type` == 'Emergency') and (`Medical Condition` == 'Obesity') and (`Insurance Provider` == 'Aetna')")['Billing Amount']
y.sort_values(ascending = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Billing Amount
Admission Type,Medical Condition,Gender,Insurance Provider,Unnamed: 4_level_1
Emergency,Obesity,Male,Aetna,8518124.0
Emergency,Obesity,Female,Aetna,8425255.0


In [56]:
# 10. How many test results are 'Inconclusive' that are 'Male', ages 45 to 50

x = df.query("(`Test Results` == 'Inconclusive') and (Gender == 'Male')")
y = x[x['Age'].between(45,50)].shape[0]
y

789

In [61]:
# 11. Identify patients over the age of 80 with a medical condition of 'Cancer' or 'Diabetes'
#     who were admitted urgently, have Medicare as their insurance provider, were discharged
#     less than 15 days from their admission date.

#     Additionally, calculate the average billing amount
#     for these patients and the doctor responsible for their care, filter more
#     with Billing Amoung >= 35000

a = df.query("(Age > 80) and (`Medical Condition` == 'Cancer' or `Medical Condition` == 'Diabetes')")
b = a.query("(`Admission Type` == 'Urgent') and (`Insurance Provider` == 'Medicare')")
c = b.query("`Days Spent` < 15")
d = c.groupby(['Name', 'Doctor']).mean('Billing Amount').loc[:,['Billing Amount','Days Spent']]
e = d[d['Billing Amount'] >= 40000]
e.sort_values(['Billing Amount', 'Days Spent'], ascending = [False, False])

Unnamed: 0_level_0,Unnamed: 1_level_0,Billing Amount,Days Spent
Name,Doctor,Unnamed: 2_level_1,Unnamed: 3_level_1
marie rodriguez,Andre Rogers,50529.41285,4.0
dr. luis arias dds,Calvin Holmes,48009.06895,11.0
natalie robbins md,Jason Hale,47868.21696,1.0
crystal rich,Kayla Jones,45292.62449,5.0
christopher hines,Thomas Baker,44646.64326,11.0
kimberly baker,Joseph Moran,43581.10615,11.0
cathy schmitt,Bryan Simmons,43315.68358,10.0
heather willis,Victor Johnson Jr.,42114.81549,7.0


In [66]:
# 12. Find patients under the age of 40 who were admitted for a medical condition of 'Obesity' or 'Cancer',
# were prescribed either 'Ibuprofen' or 'Aspirin', and
# had a discharge test result of 'Normal.' filter to include only those admitted before 2022

# For these patients,
# calculate the total billing amount and
# and sort the results by the highest total billing amount."

a = df[(df['Age'] < 40) & (df['Medical Condition'].isin(['Obesity', 'Cancer']))]
b = a[a['Medication'].isin(['Ibuprofen', 'Aspirin'])]
c = b[(b['Test Results'] == 'Normal') & (b['Date of Admission'] < '2022-01-01')]
d = c.groupby('Name').sum('Total Billing Amount')
d.sort_values('Billing Amount', ascending = False)['Billing Amount']

Unnamed: 0_level_0,Billing Amount
Name,Unnamed: 1_level_1
stephanie elliott,97261.262780
michelle santiago,91203.464520
rhonda church,90928.888100
jill martinez,90307.801400
angelica webb,89717.618200
...,...
jacob taylor,1251.296759
derek nguyen,1179.865905
kimberly alvarez,919.437512
diane pierce,396.992871
