# Data Extraction - Part 1

In [123]:
# Imports

import numpy as np
import pandas as pd
from datetime import datetime

In [124]:
# Load Dataset

df = pd.read_csv('https://raw.githubusercontent.com/renatomaaliw3/public_files/refs/heads/master/Data%20Sets/healthcare_dataset.csv',
                 parse_dates = ['Date of Admission', 'Discharge Date'])
df.head(3)

  df = pd.read_csv('https://raw.githubusercontent.com/renatomaaliw3/public_files/refs/heads/master/Data%20Sets/healthcare_dataset.csv',


Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.28131,328,Urgent,2 02 2024,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.32729,265,Emergency,26 08 2019,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.09608,205,Emergency,7 10 2022,Aspirin,Normal


In [125]:
# Let us fixed first inconsistencies in our data

df['Name'] = df['Name'].str.lower()
df.head(3)

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,bobby jackson,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.28131,328,Urgent,2 02 2024,Paracetamol,Normal
1,leslie terry,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.32729,265,Emergency,26 08 2019,Ibuprofen,Inconclusive
2,danny smith,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.09608,205,Emergency,7 10 2022,Aspirin,Normal


In [126]:
# Check info or check for missing data

missing = df.isnull().sum().rename('Missing')
missing
# missing.to_csv('missing values.csv', header = True)

Unnamed: 0,Missing
Name,0
Age,0
Gender,0
Blood Type,0
Medical Condition,0
Date of Admission,0
Doctor,0
Hospital,0
Insurance Provider,0
Billing Amount,0


# Extract the following information - Part 1

In [127]:
# 1. How many Insurance Provider are there in the dataset

df['Insurance Provider'].nunique()

5

In [128]:
# 2. Which Medical Condition is the most prevalent?

df['Medical Condition'].value_counts()

Unnamed: 0_level_0,count
Medical Condition,Unnamed: 1_level_1
Arthritis,9308
Diabetes,9304
Hypertension,9245
Obesity,9231
Cancer,9227
Asthma,9185


In [129]:
# 3. What is the median value of Billing Amount?

df['Billing Amount'].median()

25538.06938

In [130]:
# 4. What room number is used least for emergency?

x = df.groupby(['Admission Type', 'Room Number']).count()
y = x.query("`Admission Type` == 'Emergency'")
y['Name'].sort_values()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name
Admission Type,Room Number,Unnamed: 2_level_1
Emergency,189,24
Emergency,101,27
Emergency,398,29
Emergency,460,29
Emergency,259,29
Emergency,...,...
Emergency,393,63
Emergency,420,64
Emergency,287,64
Emergency,486,67


In [131]:
# 05. Give me the number of Male and Female Cancer Patients.

df.query("`Medical Condition` == 'Cancer'").value_counts('Gender')
# df[df['Medical Condition'] == 'Cancer'].value_counts('Gender')

Unnamed: 0_level_0,count
Gender,Unnamed: 1_level_1
Male,4625
Female,4602


In [132]:
# 05. Give me the number of Male Obesity Patients for each Insurance Provider

df.query("(Gender == 'Male') and (`Medical Condition` == 'Obesity')").value_counts('Insurance Provider')

Unnamed: 0_level_0,count
Insurance Provider,Unnamed: 1_level_1
Blue Cross,953
Medicare,946
Aetna,916
UnitedHealthcare,910
Cigna,884


In [133]:
# 06. Give me the average billing amount for Cancer Patients, Female, Blood Type of A+ with Inconclusive results

x = df.query("(`Medical Condition` == 'Cancer') and (Gender == 'Female')")
y = x.query("(`Blood Type` == 'A+') and (`Test Results` == 'Inconclusive')")
y['Billing Amount'].mean()

26013.69686546455

## Extract the following information - Part 2

In [134]:
# 01. What Hospital/s receives the most 'Urgent' Admission Type

# Long method

# urgent_admissions = df[df['Admission Type'] == 'Urgent']
# urgent_admissions_by_hospital = urgent_admissions['Hospital'].value_counts()
# urgent_admissions_by_hospital.head(2)

# Alternative

df[df['Admission Type'] == 'Urgent'].value_counts('Hospital').head(2)

Unnamed: 0_level_0,count
Hospital,Unnamed: 1_level_1
Inc Smith,16
Ltd Smith,16


In [135]:
# 02. How many 'Male' Gender receives Medication of 'Aspirin' from Insurance Provider 'Cigna'

df.query("(Gender == 'Male') and (Medication == 'Ibuprofen') and (`Insurance Provider` == 'Cigna')").shape[0]

1094

In [136]:
# 03. How many 'Female' patients Age 50 to 55, admitted for "Emergency" received "Paracetamol" from the with blood type of A+

df.query("(Gender == 'Female') and (Age >= 50 and Age <= 55) and (`Admission Type` == 'Emergency') and (Medication == 'Paracetamol') and (`Blood Type`) == 'A+'").shape[0]

28

In [137]:
# 04. Group the patients by their hospital and blood type.
# For each group, find the average billing amount greater than or equal to 52000
# Arrange Billing Amount in descending order, show only Billing Amount column

x = df.groupby(['Hospital', 'Blood Type']).mean('Billing Amount').query("`Billing Amount` >= 52000")
x.sort_values('Billing Amount', ascending = False)['Billing Amount']

Unnamed: 0_level_0,Unnamed: 1_level_0,Billing Amount
Hospital,Blood Type,Unnamed: 2_level_1
Hernandez-Morton,AB+,52373.03237
Sons and Bailey,AB-,52271.66375
PLC Garner,B+,52181.83779
Walker-Garcia,A-,52170.03685
Ruiz-Anthony,AB+,52154.23772
George-Gonzalez,B+,52102.24089
Rocha-Carter,B-,52092.6699
"Briggs Walker Martinez, and",O-,52024.72644


In [138]:
# 05. Show me Doctors that has the name 'Smith', ignore cases, avoid duplicates
# arrange in Doctor Name in ascending order

x = df[df['Doctor'].str.contains('Smith', case = False)]
unique_smith_doctors = x['Doctor'].unique()

pd.DataFrame(unique_smith_doctors, columns = ['Doctor Name']).sort_values('Doctor Name')

Unnamed: 0,Doctor Name
272,Aaron Smith
203,Adam Smith
89,Adrian Smith
52,Albert Smith DVM
228,Alexander Smith
...,...
298,Whitney Smith
182,William Smith
67,Willie Smith
209,Yvette Smith


In [139]:
# 06. Show me Doctors that the last name of 'Smith', ignore cases, avoid duplicates
# arrange in Doctor Name in ascending order

# pd.set_option('display.max_rows', None) # Show all rows

x = df[df['Doctor'].str.lower().str.endswith('smith')]
unique_smith_doctors = x['Doctor'].unique()

pd.DataFrame(unique_smith_doctors, columns = ['Doctor Name']).sort_values('Doctor Name')

Unnamed: 0,Doctor Name
259,Aaron Smith
195,Adam Smith
86,Adrian Smith
219,Alexander Smith
239,Alexandra Smith
...,...
284,Whitney Smith
174,William Smith
66,Willie Smith
201,Yvette Smith


In [140]:
# 07. Can you tell me what is the name of the patient that
# spend the most of days in any given hospital?

df['Date of Admission'] = pd.to_datetime(df['Date of Admission'], errors='coerce') # convert to proper format
df['Discharge Date'] = pd.to_datetime(df['Discharge Date'], errors='coerce') # convert to proper format

df['Days Spent'] = (df['Discharge Date']) - (df['Date of Admission'])

x = df.query("`Medical Condition` == 'Cancer'")
x.sort_values('Days Spent', ascending = False)[['Name','Date of Admission',
                                                'Discharge Date', 'Days Spent', 'Medical Condition']].head(5)

Unnamed: 0,Name,Date of Admission,Discharge Date,Days Spent,Medical Condition
18826,janet hanson,2023-12-17,2024-12-01,350 days,Cancer
38849,robert robbins,2021-12-21,2022-12-01,345 days,Cancer
13030,cheyenne mcbride,2020-12-25,2021-12-01,341 days,Cancer
22526,andrew lawson,2020-01-01,2020-12-01,335 days,Cancer
29135,pamela golden,2022-12-31,2023-12-01,335 days,Cancer


In [141]:
# 08. What is the total billing amount of patients admitted in Emergency due to Obesity that are Male

x = df.groupby(['Admission Type', 'Medical Condition', 'Gender']).sum('Total Billing')
x.query("(`Admission Type` == 'Emergency') and (`Medical Condition` == 'Obesity')")['Billing Amount']

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Billing Amount
Admission Type,Medical Condition,Gender,Unnamed: 3_level_1
Emergency,Obesity,Female,41112420.0
Emergency,Obesity,Male,39584080.0


In [142]:
# 09. What is the total billing amount of patients admitted in Emergency due to Obesity that are Male, with Aetna as Insurance Provider

x = df.groupby(['Admission Type', 'Medical Condition', 'Gender', 'Insurance Provider']).sum('Total Billing')
y = x.query("(`Admission Type` == 'Emergency') and (`Medical Condition` == 'Obesity') and (`Insurance Provider` == 'Aetna')")['Billing Amount']
y.sort_values(ascending = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Billing Amount
Admission Type,Medical Condition,Gender,Insurance Provider,Unnamed: 4_level_1
Emergency,Obesity,Male,Aetna,8518124.0
Emergency,Obesity,Female,Aetna,8425255.0


In [143]:
# 10. How many test results are 'Inconclusive' that are 'Male', ages 45 to 50

x = df.query("(`Test Results` == 'Inconclusive') and (Gender == 'Male')")
y = x[x['Age'].between(45,50)].shape[0]
y

789