In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load COVID-19 data
covid_data = pd.read_csv("data_covid.csv")

# Step 2: Preprocessing
# Convert measurement_date to datetime
covid_data['measurement_date'] = pd.to_datetime(covid_data['measurement_date'])

# Keep only the measurements from the first day
first_day_data = covid_data[covid_data['measurement_date'] == covid_data.groupby('person_id')['measurement_date'].transform('min')]

# Pivot the data so that each measurement is a separate column
first_day_pivot = first_day_data.pivot_table(index=['person_id', 'current_age', 'category', 'race_name', 'gen_name'], 
                                             columns='measurement_name', 
                                             values='value_as_number').reset_index()

# Identify vital signs and lab measurements
vital_signs = ['Diastolic blood pressure', 'Body temperature', 'Systolic blood pressure',
               'Body weight', 'Respiratory rate', 'Oxygen saturation in Arterial blood']

lab_measurements = ['Diastolic blood pressure', 'Body temperature', 'Respiratory rate',
                    'Oxygen saturation in Arterial blood', 'Systolic blood pressure',
                    'Body weight', 'Heart rate',
                    'Hematocrit [Volume Fraction] of Blood by Automated count',
                    'Erythrocytes [#/volume] in Blood by Automated count',
                    'Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma',
                    'Alanine aminotransferase [Enzymatic activity/volume] in Serum or Plasma',
                    'Alkaline phosphatase [Enzymatic activity/volume] in Serum or Plasma',
                    'Bilirubin.total [Mass/volume] in Serum or Plasma',
                    'Albumin [Mass/volume] in Serum or Plasma',
                    'MCHC [Mass/volume] by Automated count',
                    'Hemoglobin [Mass/volume] in Blood',
                    'Platelets [#/volume] in Blood by Automated count',
                    'Glomerular filtration rate/1.73 sq M.predicted [Volume Rate/Area] in Serum, Plasma or Blood by Creatinine-based formula (MDRD)',
                    'Protein [Mass/volume] in Serum or Plasma',
                    'MCH [Entitic mass] by Automated count',
                    'MCV [Entitic volume] by Automated count',
                    'Leukocytes [#/volume] in Blood by Automated count'
                   ]

# Step 3: Preparing data for classification
# Mild Classifier: Use only vital signs
X_binary = first_day_pivot[vital_signs].values
y_binary = (first_day_pivot['category'] != 'mild').astype(int).values

# Severe Classifier: Use all features
X_severe = first_day_pivot.drop(['person_id', 'current_age', 'category', 'race_name', 'gen_name'], axis=1).values
y_severe = first_day_pivot['category'].apply(lambda x: 1 if x == 'severe' else 0).values

# Step 4: Impute missing values with median
imputer = SimpleImputer(strategy='median')
X_binary = imputer.fit_transform(X_binary)
X_severe = imputer.fit_transform(X_severe)

# Step 5: Splitting data into train and test sets
X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(X_binary, y_binary, test_size=0.2, random_state=42)
X_train_severe, X_test_severe, y_train_severe, y_test_severe = train_test_split(X_severe, y_severe, test_size=0.2, random_state=42)

# Step 6: Training classifiers
# Binary Classifier: Decision Tree
binary_classifier = DecisionTreeClassifier()
binary_classifier.fit(X_train_binary, y_train_binary)

# Severe Classifier: Decision Tree
severe_classifier = DecisionTreeClassifier()
severe_classifier.fit(X_train_severe, y_train_severe)

from sklearn.metrics import classification_report, accuracy_score

# Step 7: Combine predictions from mild and severe classifiers
# Get predictions from mild classifier
binary_predictions = binary_classifier.predict(X_test_binary)
severe_predictions = severe_classifier.predict(X_test_severe)

# Step 8: Evaluate classifiers
# Mild Classifier evaluation
print("Mild Classifier Results:")
binary_accuracy = accuracy_score(y_test_binary, binary_predictions)
print("Accuracy:", binary_accuracy)
print(classification_report(y_test_binary, binary_predictions))

# Severe Classifier evaluation
print("\nSevere Classifier Results:")
severe_accuracy = accuracy_score(y_test_severe, severe_predictions)
print("Accuracy:", severe_accuracy)
print(classification_report(y_test_severe, severe_predictions))


# Step 7: Combine predictions from mild and severe classifiers
# Get predictions from mild classifier
binary_predictions = binary_classifier.predict(X_test_binary)


# Filter severe predictions based on mild predictions
X_test_severe_final = X_test_severe[binary_predictions == 1]
y_test_severe_final = y_test_severe[binary_predictions == 1]

# Get predictions from severe classifier
severe_predictions = severe_classifier.predict(X_test_severe_final)

# Step 8: Evaluate final classifier
final_accuracy = accuracy_score(y_test_severe_final, severe_predictions)
print("Final Classifier Accuracy:", final_accuracy)
print(classification_report(y_test_severe_final, severe_predictions))

Mild Classifier Results:
Accuracy: 0.6345029239766082
              precision    recall  f1-score   support

           0       0.64      0.66      0.65       177
           1       0.62      0.61      0.62       165

    accuracy                           0.63       342
   macro avg       0.63      0.63      0.63       342
weighted avg       0.63      0.63      0.63       342


Severe Classifier Results:
Accuracy: 0.7748538011695907
              precision    recall  f1-score   support

           0       0.91      0.83      0.87       300
           1       0.25      0.40      0.31        42

    accuracy                           0.77       342
   macro avg       0.58      0.62      0.59       342
weighted avg       0.83      0.77      0.80       342

Final Classifier Accuracy: 0.7125
              precision    recall  f1-score   support

           0       0.88      0.76      0.82       135
           1       0.26      0.44      0.32        25

    accuracy                         

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load COVID-19 data
covid_data = pd.read_csv("data_covid.csv")

# Step 2: Preprocessing
# Convert measurement_date to datetime
covid_data['measurement_date'] = pd.to_datetime(covid_data['measurement_date'])

# Keep only the measurements from the first day
first_day_data = covid_data[covid_data['measurement_date'] == covid_data.groupby('person_id')['measurement_date'].transform('min')]

# Pivot the data so that each measurement is a separate column
first_day_pivot = first_day_data.pivot_table(index=['person_id', 'current_age', 'category', 'race_name', 'gen_name'], 
                                             columns='measurement_name', 
                                             values='value_as_number').reset_index()

# Identify vital signs and lab measurements
vital_signs = ['Diastolic blood pressure', 'Body temperature', 'Systolic blood pressure',
               'Body weight', 'Respiratory rate', 'Oxygen saturation in Arterial blood']

lab_measurements = ['Diastolic blood pressure', 'Body temperature', 'Respiratory rate',
                    'Oxygen saturation in Arterial blood', 'Systolic blood pressure',
                    'Body weight', 'Heart rate',
                    'Hematocrit [Volume Fraction] of Blood by Automated count',
                    'Erythrocytes [#/volume] in Blood by Automated count',
                    'Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma',
                    'Alanine aminotransferase [Enzymatic activity/volume] in Serum or Plasma',
                    'Alkaline phosphatase [Enzymatic activity/volume] in Serum or Plasma',
                    'Bilirubin.total [Mass/volume] in Serum or Plasma',
                    'Albumin [Mass/volume] in Serum or Plasma',
                    'MCHC [Mass/volume] by Automated count',
                    'Hemoglobin [Mass/volume] in Blood',
                    'Platelets [#/volume] in Blood by Automated count',
                    'Glomerular filtration rate/1.73 sq M.predicted [Volume Rate/Area] in Serum, Plasma or Blood by Creatinine-based formula (MDRD)',
                    'Protein [Mass/volume] in Serum or Plasma',
                    'MCH [Entitic mass] by Automated count',
                    'MCV [Entitic volume] by Automated count',
                    'Leukocytes [#/volume] in Blood by Automated count'
                   ]

# Step 3: Preparing data for classification
# Mild Classifier: Use only vital signs
X_binary = first_day_pivot[vital_signs].values
y_binary = (first_day_pivot['category'] != 'mild').astype(int).values

# Severe Classifier: Use all features
X_severe = first_day_pivot.drop(['person_id', 'current_age', 'category', 'race_name', 'gen_name'], axis=1).values
y_severe = first_day_pivot['category'].apply(lambda x: 1 if x == 'severe' else 0).values

# Step 4: Impute missing values with median
imputer = SimpleImputer(strategy='median')
X_binary = imputer.fit_transform(X_binary)
X_severe = imputer.fit_transform(X_severe)

# Step 5: Splitting data into train and test sets
X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(X_binary, y_binary, test_size=0.2, random_state=42)
X_train_severe, X_test_severe, y_train_severe, y_test_severe = train_test_split(X_severe, y_severe, test_size=0.2, random_state=42)

# Step 6: Training classifiers
binary_classifier = LogisticRegression()
binary_classifier.fit(X_train_binary, y_train_binary)

# Severe Classifier: Logistic Regression
severe_classifier = LogisticRegression()
severe_classifier.fit(X_train_severe, y_train_severe)

# Step 7: Evaluation
# Mild Classifier evaluation
binary_predictions = binary_classifier.predict(X_test_binary)
binary_accuracy = accuracy_score(y_test_binary, binary_predictions)
print("Mild Classifier Accuracy:", binary_accuracy)
print(classification_report(y_test_binary, binary_predictions))

# Severe Classifier evaluation
severe_predictions = severe_classifier.predict(X_test_severe)
severe_accuracy = accuracy_score(y_test_severe, severe_predictions)

print("Severe Classifier Accuracy:", severe_accuracy)
print(classification_report(y_test_severe, severe_predictions))

# Step 8: Combine predictions from binary and severe classifiers
# Get predictions from mild classifier
binary_predictions = binary_classifier.predict(X_test_binary)

# Filter severe predictions based on mild predictions
X_test_severe_final = X_test_severe[binary_predictions == 1]
y_test_severe_final = y_test_severe[binary_predictions == 1]

# Get predictions from severe classifier
severe_predictions = severe_classifier.predict(X_test_severe_final)


# Step 7: Combine predictions from mild and severe classifiers
# Get predictions from mild classifier
binary_predictions = binary_classifier.predict(X_test_binary)

# Filter severe predictions based on mild predictions
X_test_severe_final = X_test_severe[binary_predictions == 1]
y_test_severe_final = y_test_severe[binary_predictions == 1]

# Get predictions from severe classifier
severe_predictions = severe_classifier.predict(X_test_severe_final)

# Step 8: Evaluate final classifier
final_accuracy = accuracy_score(y_test_severe_final, severe_predictions)
print("Final Classifier Accuracy:", final_accuracy)
print(classification_report(y_test_severe_final, severe_predictions))

Mild Classifier Accuracy: 0.5409356725146199
              precision    recall  f1-score   support

           0       0.54      0.78      0.64       177
           1       0.55      0.28      0.37       165

    accuracy                           0.54       342
   macro avg       0.54      0.53      0.51       342
weighted avg       0.54      0.54      0.51       342

Severe Classifier Accuracy: 0.8801169590643275
              precision    recall  f1-score   support

           0       0.88      1.00      0.94       300
           1       1.00      0.02      0.05        42

    accuracy                           0.88       342
   macro avg       0.94      0.51      0.49       342
weighted avg       0.89      0.88      0.83       342

Final Classifier Accuracy: 0.8372093023255814
              precision    recall  f1-score   support

           0       0.84      1.00      0.91        71
           1       1.00      0.07      0.12        15

    accuracy                           0.84 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [4]:
# Group by COVID Severity and Sex
grouped_sex = first_day_pivot.groupby(['category', 'gen_name']).size().unstack(fill_value=0)
grouped_sex['Total'] = grouped_sex.sum(axis=1)

# Group by COVID Severity and Age Group
age_bins = [0, 18, 45, 65, 150]  # Define age bins
age_labels = ['< 18', '18 - 45', '46 - 65', '> 65']  # Define age group labels
first_day_pivot['Age_Group'] = pd.cut(first_day_pivot['current_age'], bins=age_bins, labels=age_labels)
grouped_age = first_day_pivot.groupby(['category', 'Age_Group']).size().unstack(fill_value=0)
grouped_age['Total'] = grouped_age.sum(axis=1)

# Group by COVID Severity and Race
grouped_race = first_day_pivot.groupby(['category', 'race_name']).size().unstack(fill_value=0)
grouped_race['Total'] = grouped_race.sum(axis=1)

# Print table
print("Table 1: Demographic Characteristics of Patient Population")
print("\tMild COVID\t\tModerate COVID\t\tSevere COVID")
print("Sex")
print(grouped_sex)

print("\nAge")
print(grouped_age)

print("\nRace")
print(grouped_race)

Table 1: Demographic Characteristics of Patient Population
	Mild COVID		Moderate COVID		Severe COVID
Sex
gen_name  FEMALE  MALE  Total
category                     
mild         519   463    982
moderate     279   235    514
severe        91   123    214

Age
Age_Group  < 18  18 - 45  46 - 65  > 65  Total
category                                      
mild        220      361      235   166    982
moderate     24      140      187   163    514
severe        2       22       64   126    214

Race
race_name  Asian  Black or African American  No matching concept  White  Total
category                                                                      
mild          63                         69                    8    842    982
moderate      41                         43                    3    427    514
severe        13                         20                    0    181    214


  grouped_age = first_day_pivot.groupby(['category', 'Age_Group']).size().unstack(fill_value=0)


End of Notebook