# Analysis for Holmusk data set

## Data processing

In [41]:
# Import some libraries
import pandas as pd
import numpy as np
import os

In [42]:
cwd = os.getcwd()
print(cwd)

C:\Users\USER\Downloads\Holmusk_analysis


In [63]:
# Import our datasets
bill_id = pd.read_csv(r'C:/Users/USER/Downloads/Holmusk_analysis/bill_id.csv')
demographics = pd.read_csv(r'C:/Users/USER/Downloads/Holmusk_analysis/demographics.csv')
clinical_data = pd.read_csv(r'C:/Users/USER/Downloads/Holmusk_analysis/clinical_data.csv')

# Rename bill_id "patient_id" to avoid conflict
bill_id.rename(columns={"patient_id": "pt_id"}, inplace=True)

# Drop already existing columns from clinical_data
clinical_data = clinical_data.drop(columns=["date_of_admission"])

In [64]:
print("bill_id columns: " + str(bill_id.columns))
print("demographics columns: " + str(demographics.columns))
print("clinical_data columns: " + str(clinical_data.columns))
print(" ")
print("bill_id length: " + str(len(bill_id)))
print("demographics length: " + str(len(demographics)))
print("clinical_data length: " + str(len(clinical_data)))

bill_id columns: Index(['bill_id', 'pt_id', 'date_of_admission', 'amount'], dtype='object')
demographics columns: Index(['patient_id', 'gender', 'race', 'resident_status', 'date_of_birth'], dtype='object')
clinical_data columns: Index(['id', 'date_of_discharge', 'medical_history_1', 'medical_history_2',
       'medical_history_3', 'medical_history_4', 'medical_history_5',
       'medical_history_6', 'medical_history_7', 'preop_medication_1',
       'preop_medication_2', 'preop_medication_3', 'preop_medication_4',
       'preop_medication_5', 'preop_medication_6', 'symptom_1', 'symptom_2',
       'symptom_3', 'symptom_4', 'symptom_5', 'lab_result_1', 'lab_result_2',
       'lab_result_3', 'weight', 'height'],
      dtype='object')
 
bill_id length: 13600
demographics length: 3000
clinical_data length: 3400


In [66]:
# Find common patient_ids
patient_id_dup = [list(bill_id["pt_id"]), list(demographics["patient_id"]), list(clinical_data["id"])]
unique_id = set(patient_id_dup[0]).intersection(*patient_id_dup)
print(len(unique_id))

3000


Looks like the ids in demographics contain all the unique ids. Combine all the datasets into 1 big dataset

In [73]:
# bill_id is already in sorted order. Lets sort demographics and clinical_data
raw_df = demographics.sort_values("patient_id").copy(deep=True)
clinical_data = clinical_data.sort_values("id")

raw_df = raw_df.join(bill_id.loc[bill_id["pt_id"].isin(raw_df["patient_id"])])
raw_df = raw_df.join(clinical_data.loc[clinical_data["id"].isin(raw_df["patient_id"])])

In [74]:
print(raw_df.columns)

Index(['patient_id', 'gender', 'race', 'resident_status', 'date_of_birth',
       'bill_id', 'pt_id', 'date_of_admission', 'amount', 'id',
       'date_of_discharge', 'medical_history_1', 'medical_history_2',
       'medical_history_3', 'medical_history_4', 'medical_history_5',
       'medical_history_6', 'medical_history_7', 'preop_medication_1',
       'preop_medication_2', 'preop_medication_3', 'preop_medication_4',
       'preop_medication_5', 'preop_medication_6', 'symptom_1', 'symptom_2',
       'symptom_3', 'symptom_4', 'symptom_5', 'lab_result_1', 'lab_result_2',
       'lab_result_3', 'weight', 'height'],
      dtype='object')


In [75]:
raw_df["gender"]

2547    Female
2645    Female
1800      Male
1552    Female
2295      Male
         ...  
2960    Female
638       Male
1741      Male
1806      Male
2743    Female
Name: gender, Length: 3000, dtype: object

Now let's clean up the data and make it code-readable

In [107]:
# Rectify gender - no nulls
# Male = 1, female = 0
gender_df = raw_df.copy(deep=True)
# print(gender_df["gender"].isnull().sum())

gender_df["gender"] = np.where(gender_df["gender"].str.lower().str.startswith("m"), 1, 0)

print(len(gender_df.loc[gender_df["gender"] == 1]))

raw_df["gender"] = gender_df["gender"]

1503


In [110]:
# Add in age column
age_df = raw_df.copy(deep=True)
# print(age_df["date_of_birth"].isnull().sum()) # No nulls for both
# print(age_df["date_of_admission"].isnull().sum())

doa = age_df["date_of_admission"].str.split('/', expand=True).astype(int)
dob = age_df["date_of_birth"].str.split('/', expand=True).astype(int)

age_at_admission = [a - b for a, b in zip(doa[2], dob[2])]

print(len(age_at_admission))

raw_df["age_at_admission"] = age_at_admission

3000


In [112]:
# Check all newly-formatted columns
raw_df["age_at_admission"]

2547    31
2645    72
1800    39
1552    37
2295    70
        ..
2960    37
638     52
1741    76
1806    61
2743    57
Name: age_at_admission, Length: 3000, dtype: int64

Lets export this full dataset, and then shift the columns

In [34]:
# raw_df.to_csv("raw_df.csv")