### COVID-19 Severity Classification
# 
##### This notebook demonstrates a machine learning workflow to classify COVID-19 patient severity levels based on clinical measurements. 
##### Two models are built: a "mild" classifier using vital signs and a "severe" classifier using all lab measurements. 
##### Finally, the models are combined to improve predictive performance.

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

##### Step 1: Load and Inspect Data
##### Load the COVID-19 dataset and convert date columns to datetime.

In [None]:
# Read the COVID-19 measurement table from the CSV file in the working directory.
covid_data = pd.read_csv("data_covid.csv")

# Parse measurement_date into pandas datetimes so that dates can be compared
# and grouped in the later steps.
covid_data = covid_data.assign(
    measurement_date=pd.to_datetime(covid_data["measurement_date"])
)

##### Step 2: Filter First Day Measurements
##### Keep only measurements from each patient's first day.

In [None]:
# Keep every measurement taken on the calendar day of each patient's earliest
# record. Timestamps are truncated to midnight before comparing so that readings
# taken later on that same first day are retained; an exact timestamp match
# would keep only the rows sharing the single earliest time.
measurement_day = covid_data['measurement_date'].dt.normalize()
first_measurement_day = measurement_day.groupby(covid_data['person_id']).transform('min')
first_day_data = covid_data[measurement_day == first_measurement_day]

# Pivot to one row per patient with one column per measurement name.
# pivot_table's default aggregation (mean) combines repeated readings of the
# same measurement for the same patient.
first_day_pivot = first_day_data.pivot_table(
    index=['person_id', 'current_age', 'category', 'race_name', 'gen_name'],
    columns='measurement_name',
    values='value_as_number'
).reset_index()

##### Step 3: Feature Selection
##### Define vital signs and all lab measurements.

In [None]:
# Measurement names used as features by the "mild" classifier (vital signs only).
vital_signs = [
    "Diastolic blood pressure",
    "Body temperature",
    "Systolic blood pressure",
    "Body weight",
    "Respiratory rate",
    "Oxygen saturation in Arterial blood",
]

# Full measurement list associated with the "severe" classifier: the vital
# signs above, heart rate, and the laboratory panel.
# NOTE(review): the visible model-building code does not reference this list;
# confirm whether it is meant to select the severe classifier's columns.
lab_measurements = [
    "Diastolic blood pressure",
    "Body temperature",
    "Respiratory rate",
    "Oxygen saturation in Arterial blood",
    "Systolic blood pressure",
    "Body weight",
    "Heart rate",
    "Hematocrit [Volume Fraction] of Blood by Automated count",
    "Erythrocytes [#/volume] in Blood by Automated count",
    "Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma",
    "Alanine aminotransferase [Enzymatic activity/volume] in Serum or Plasma",
    "Alkaline phosphatase [Enzymatic activity/volume] in Serum or Plasma",
    "Bilirubin.total [Mass/volume] in Serum or Plasma",
    "Albumin [Mass/volume] in Serum or Plasma",
    "MCHC [Mass/volume] by Automated count",
    "Hemoglobin [Mass/volume] in Blood",
    "Platelets [#/volume] in Blood by Automated count",
    "Glomerular filtration rate/1.73 sq M.predicted [Volume Rate/Area] in Serum, Plasma or Blood by Creatinine-based formula (MDRD)",
    "Protein [Mass/volume] in Serum or Plasma",
    "MCH [Entitic mass] by Automated count",
    "MCV [Entitic volume] by Automated count",
    "Leukocytes [#/volume] in Blood by Automated count",
]


##### Step 4: Prepare Data for Classification
##### Define features and target variables for mild and severe classifiers.

In [None]:
# Patient-level severity label shared by both targets.
severity_label = first_day_pivot['category']

# "Mild" classifier: vital-sign columns as features; the target is 1 for any
# patient whose category is not 'mild' and 0 otherwise.
X_binary = first_day_pivot.loc[:, vital_signs].values
y_binary = severity_label.ne('mild').astype(int).values

# "Severe" classifier: every pivoted measurement column (all columns except the
# patient identifier and demographic/label columns); the target is 1 only for
# patients whose category is 'severe'.
X_severe = first_day_pivot.drop(
    columns=['person_id', 'current_age', 'category', 'race_name', 'gen_name']
).values
y_severe = severity_label.eq('severe').astype(int).values

##### Step 5: Impute Missing Values and Split Data
##### Fill missing values using median and split into train/test sets.

In [None]:
# Split first, impute second: the imputation medians must be learned from the
# training rows only, otherwise test-set values leak into the training features.
#
# All four arrays go through a single train_test_split call so that the same
# rows land in the test set for both classifiers. The combined evaluation in
# Step 7 masks X_test_severe with predictions made on X_test_binary and
# therefore relies on the two test sets being row-aligned.
(
    X_train_binary, X_test_binary,
    X_train_severe, X_test_severe,
    y_train_binary, y_test_binary,
    y_train_severe, y_test_severe,
) = train_test_split(
    X_binary, X_severe, y_binary, y_severe, test_size=0.2, random_state=42
)

# Median imputation for the vital-sign features (fitted on training rows only).
binary_imputer = SimpleImputer(strategy='median')
X_train_binary = binary_imputer.fit_transform(X_train_binary)
X_test_binary = binary_imputer.transform(X_test_binary)

# Median imputation for the full measurement features (fitted on training rows only).
severe_imputer = SimpleImputer(strategy='median')
X_train_severe = severe_imputer.fit_transform(X_train_severe)
X_test_severe = severe_imputer.transform(X_test_severe)

# Keep the original `imputer` name bound for any code that still refers to it;
# as before, it ends up fitted on the severe-classifier features.
imputer = severe_imputer

##### Step 6: Train Classifiers
##### Train Logistic Regression models for both mild and severe predictions.

In [None]:

# Both stages use the same logistic-regression configuration.
logistic_settings = {'max_iter': 500}

# Stage 1 ("mild") model, fitted on the vital-sign training features.
binary_classifier = LogisticRegression(**logistic_settings).fit(
    X_train_binary, y_train_binary
)

# Stage 2 ("severe") model, fitted on the full measurement training features.
severe_classifier = LogisticRegression(**logistic_settings).fit(
    X_train_severe, y_train_severe
)

##### Step 7: Evaluate Classifiers
##### Evaluate each classifier individually, then evaluate the two-stage combination.

In [None]:
# Stage 1 ("mild") classifier on its own.
binary_predictions = binary_classifier.predict(X_test_binary)
print("Mild Classifier Accuracy:", accuracy_score(y_test_binary, binary_predictions))
print(classification_report(y_test_binary, binary_predictions))

# Stage 2 ("severe") classifier on its own, over the whole test set.
severe_predictions = severe_classifier.predict(X_test_severe)
print("Severe Classifier Accuracy:", accuracy_score(y_test_severe, severe_predictions))
print(classification_report(y_test_severe, severe_predictions))

# Combined two-stage prediction: a patient is predicted severe only when stage 1
# flags them as not mild AND stage 2 predicts severe. Patients that stage 1
# calls mild are predicted non-severe (0). Scoring this over the WHOLE test set
# means severe patients missed by stage 1 count as errors, instead of being
# silently dropped from the metric.
flagged_by_stage_one = binary_predictions == 1
combined_predictions = severe_predictions.copy()
combined_predictions[~flagged_by_stage_one] = 0

print("Final Combined Classifier Accuracy:", accuracy_score(y_test_severe, combined_predictions))
print(classification_report(y_test_severe, combined_predictions))

# Stage 2 performance restricted to the patients stage 1 flagged. The stage 2
# predictions are sliced rather than recomputed, which also avoids calling
# predict() on an empty array when stage 1 flags nobody.
X_test_severe_final = X_test_severe[flagged_by_stage_one]
y_test_severe_final = y_test_severe[flagged_by_stage_one]
severe_predictions_final = severe_predictions[flagged_by_stage_one]

if flagged_by_stage_one.any():
    print("Severe Classifier Accuracy on Stage-1 Flagged Patients:",
          accuracy_score(y_test_severe_final, severe_predictions_final))
    print(classification_report(y_test_severe_final, severe_predictions_final))
else:
    print("Stage 1 flagged no test patients; skipping flagged-subset report.")

##### Step 8: Demographic Tables
##### Summarize patient demographics by COVID-19 severity.

In [None]:
# Counts of patients per severity category, broken down by sex.
grouped_sex = first_day_pivot.groupby(['category','gen_name']).size().unstack(fill_value=0)
grouped_sex['Total'] = grouped_sex.sum(axis=1)

# Counts per severity category, broken down by age group.
# Bins are left-closed (right=False) so they match the labels:
#   [0, 18) -> '< 18', [18, 46) -> '18 - 45', [46, 66) -> '46 - 65', [66, 151) -> '> 65'.
# With right-closed bins, age 18 was labelled '< 18' and age 0 fell outside
# every bin. The upper edge keeps the original cap of 150 years.
# NOTE(review): edges assume current_age is in whole years - confirm.
age_bins = [0, 18, 46, 66, 151]
age_labels = ['< 18','18 - 45','46 - 65','> 65']
# NOTE: this adds an 'Age_Group' column to first_day_pivot in place.
first_day_pivot['Age_Group'] = pd.cut(
    first_day_pivot['current_age'], bins=age_bins, labels=age_labels, right=False
)
# observed=False is stated explicitly so every age group gets a column even when
# it has no patients, independent of the pandas default for categorical keys.
grouped_age = (
    first_day_pivot.groupby(['category','Age_Group'], observed=False)
    .size()
    .unstack(fill_value=0)
)
# Use plain string column labels so that adding 'Total' does not depend on how
# the installed pandas handles inserting a new label into a CategoricalIndex.
grouped_age.columns = grouped_age.columns.astype(str)
grouped_age['Total'] = grouped_age.sum(axis=1)

# Counts per severity category, broken down by race.
grouped_race = first_day_pivot.groupby(['category','race_name']).size().unstack(fill_value=0)
grouped_race['Total'] = grouped_race.sum(axis=1)

# Print tables
print("Table 1: Demographic Characteristics of Patient Population")
print("\nSex")
print(grouped_sex)
print("\nAge")
print(grouped_age)
print("\nRace")
print(grouped_race)

End of Notebook