### Import packages

In [1]:
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

### Loading Datasets

In [40]:
# Raw extracts are read from ../data/01_raw, each with the pandas reader that
# matches its on-disk format.

# CSV tables
medications = pd.read_csv('../data/01_raw/medications.csv')
symptoms = pd.read_csv('../data/01_raw/symptoms.csv')
patient_gender = pd.read_csv('../data/01_raw/patient_gender.csv')
patients = pd.read_csv('../data/01_raw/patients.csv')

# Parquet table
encounters = pd.read_parquet('../data/01_raw/encounters.parquet')

# Excel workbook
conditions = pd.read_excel('../data/01_raw/conditions.xlsx')

In [18]:
# Replace the GENDER column that ships in `patients` with the one from the
# `patient_gender` lookup, matching PATIENT_ID against the lookup's Id column.
# how="left" keeps every patient row even when the lookup has no match.
# validate="m:1" makes pandas raise MergeError if an Id appears more than once
# in the lookup, instead of silently duplicating patient rows (which would
# inflate every count derived from this frame).
patients_with_gender = patients.drop(columns="GENDER").merge(
    patient_gender,
    left_on="PATIENT_ID",
    right_on="Id",
    how="left",
    validate="m:1",
)
display(patients_with_gender.head())

Unnamed: 0,PATIENT_ID,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,LAST,SUFFIX,...,COUNTY,FIPS,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,INCOME,Id,GENDER
0,54f1059e-6250-3949-6dd0-1dda9b85d22a,2002-01-21,,999-28-3364,S99987398,X44428214X,Ms.,Fredricka415,Crist667,,...,Tarrant County,48439.0,76104,0.329154,-97.399553,9990.08,0.0,35460,54f1059e-6250-3949-6dd0-1dda9b85d22a,F
1,92675303-ca5b-136a-169b-e764c5753f06,1997-04-16,,999-62-9859,S99999770,X75016560X,Mr.,Lorenzo669,Urrutia540,,...,Tarrant County,48439.0,76006,0.328145,-97.068885,10936.8,0.0,88407,92675303-ca5b-136a-169b-e764c5753f06,M
2,a0b63e97-b6fd-5fe1-8f2d-2bec915efa97,1993-02-04,,999-53-6488,S99979170,X60079936X,Mr.,Luther918,MacGyver246,,...,Harris County,48339.0,77357,0.300834,-95.649706,10662.16,0.0,41800,a0b63e97-b6fd-5fe1-8f2d-2bec915efa97,M
3,abc59f62-dc5a-5095-1141-80b4ee8be73b,1995-05-23,,999-37-1058,S99981031,X14759314X,Mrs.,Jacque955,Jones311,,...,McLennan County,48309.0,76655,0.315173,-97.292558,11455.93,0.0,41915,abc59f62-dc5a-5095-1141-80b4ee8be73b,F
4,28d7b56c-6056-d0a2-2991-39d6e917216c,1993-11-13,,999-31-6091,S99958903,X63033472X,Mrs.,Angela104,Stanton715,,...,Hockley County,48219.0,79336,0.337037,-102.361829,12449.11,0.0,67198,28d7b56c-6056-d0a2-2991-39d6e917216c,F


#### 1. How many distinct patients are in the dataset?

In [19]:
# nunique() counts each non-null PATIENT_ID once, however many rows carry it.
patient_id_column = patients["PATIENT_ID"]
num_distinct_patients = patient_id_column.nunique()
print(f"Total distinct patients: {num_distinct_patients}")

Total distinct patients: 10000


#### 2. Plot the distinct medications over time using a Python plotting library.

In [45]:

# Only prescriptions whose START is on or after this date are charted.
MEDICATIONS_SINCE = "2021-01-01"
# Label used for each calendar month on the x-axis, e.g. "Jan-2021".
MONTH_LABEL_FORMAT = "%b-%Y"

# Parse START; values that cannot be parsed become NaT and are dropped below.
medications["START"] = pd.to_datetime(medications["START"], errors="coerce")

# Keep rows from the cutoff onwards, remove exact duplicate rows and tag each
# row with its month label. `medications` is rebound to this narrowed frame.
medications = (
    medications
    .dropna(subset=["START"])
    .loc[lambda frame: frame["START"] >= MEDICATIONS_SINCE]
    .drop_duplicates()
    .assign(MONTH_YEAR=lambda frame: frame["START"].dt.strftime(MONTH_LABEL_FORMAT))
)

# Number of rows per (month, medication); count() skips rows whose PATIENT is
# null. The month labels are strings, so they are sorted through a datetime key
# to get chronological rather than alphabetical order.
medication_counts = (
    medications
    .groupby(["MONTH_YEAR", "DESCRIPTION"], as_index=False)
    .agg(COUNT=("PATIENT", "count"))
    .sort_values(
        "MONTH_YEAR",
        key=lambda labels: pd.to_datetime(labels, format=MONTH_LABEL_FORMAT),
        kind="stable",
    )
)

# The x values are text labels, so the axis is categorical. Without an explicit
# order the categories follow first appearance across traces, and a month that
# is missing from the first medication's line can end up out of sequence.
month_order = medication_counts["MONTH_YEAR"].drop_duplicates().tolist()

fig = px.line(
    medication_counts,
    x="MONTH_YEAR",
    y="COUNT",
    color="DESCRIPTION",
    category_orders={"MONTH_YEAR": month_order},
    title="Medication Usage Over Time (Filtered & Aggregated by Month)",
    labels={"COUNT": "Number of Uses", "MONTH_YEAR": "Month-Year", "DESCRIPTION": "Medication Type"},
    markers=True,
)

# Rotate x-axis labels for better readability
fig.update_layout(xaxis=dict(tickangle=-45))

fig.show()

### Pie Chart of Patients Across Race & Gender

In [47]:
# Count patients in every (race, gender) combination.
race_gender_counts = (
    patients_with_gender
    .groupby(["RACE", "GENDER"])
    .size()
    .reset_index(name="count")
)

# One slice per combination: using RACE alone as the slice name would merge the
# gender groups back together and the chart would show race only, despite the
# title.
race_gender_counts["RACE_GENDER"] = (
    race_gender_counts["RACE"].astype(str)
    + " - "
    + race_gender_counts["GENDER"].astype(str)
)

# Donut-style pie chart; each slice is sized by its patient count.
fig = px.pie(
    race_gender_counts,
    values="count",
    names="RACE_GENDER",
    title="Patient Distribution by Race and Gender",
    labels={"RACE_GENDER": "Race - Gender"},
    hole=0.4,
)
fig.show()

### Percentage of patients with all four symptoms (Rash, Joint Pain, Fatigue, Fever) at a value of 30 or higher

In [60]:

def extract_symptom_values(symptoms_str):
    """Parse a ``"Name:value;Name:value"`` string into ``{name: int(value)}``.

    Parameters
    ----------
    symptoms_str : str
        Semicolon-separated ``name:value`` pairs. Anything that is not a
        string (``None``, NaN from a missing cell) is treated as "no symptoms".

    Returns
    -------
    dict
        Symptom name (surrounding whitespace stripped) mapped to its integer
        value. Empty for missing or blank input.

    Raises
    ------
    ValueError
        If a value cannot be converted to ``int``.
    """
    # Missing cells reach this function as float NaN / None when it is applied
    # to a column; they have no .split(), so bail out early.
    if not isinstance(symptoms_str, str):
        return {}

    parsed = {}
    for item in symptoms_str.split(";"):
        name, separator, raw_value = item.partition(":")
        name = name.strip()
        # Skip fragments produced by a trailing/doubled ';' or lacking a ':'.
        if not separator or not name:
            continue
        parsed[name] = int(raw_value)  # int() tolerates surrounding whitespace
    return parsed

# Symptoms that must all reach the threshold, and the threshold itself.
REQUIRED_SYMPTOMS = ["Rash", "Joint Pain", "Fatigue", "Fever"]
SYMPTOM_THRESHOLD = 30

# Expand the SYMPTOMS strings into one numeric column per symptom. Building the
# frame from the list of parsed dicts avoids constructing a Series per row,
# which is what `.apply(pd.Series)` does.
df_symptoms = pd.DataFrame(
    symptoms["SYMPTOMS"].map(extract_symptom_values).tolist(),
    index=symptoms.index,
)
df = pd.concat([symptoms, df_symptoms], axis=1)  # original columns + one per symptom

# Rows in which every required symptom is at or above the threshold. A missing
# value compares as False, so such rows never match. Selecting the columns by
# name raises KeyError if a required symptom is absent from the data.
meets_threshold = df[REQUIRED_SYMPTOMS].ge(SYMPTOM_THRESHOLD).all(axis=1)
filtered_patients = df[meets_threshold]

# Percentages are over distinct patients, not rows.
total_patients = df["PATIENT"].nunique()
matching_patients = filtered_patients["PATIENT"].nunique()

percentage = (matching_patients / total_patients) * 100 if total_patients > 0 else 0

# NOTE(review): the recorded run printed 0.0000% - confirm the threshold matches
# the scale of the symptom values in the data.
print(f"Percentage of patients with all {len(REQUIRED_SYMPTOMS)} symptoms ≥ {SYMPTOM_THRESHOLD}: {percentage:.4f}%")

Percentage of patients with all 4 symptoms ≥ 30: 0.0000%
