In [1]:
import os
import sys

# Put the parent of the current working directory on the import path so that
# the project-local `src` package below can be imported.
# NOTE(review): this depends on where the kernel was started -- presumably the
# notebook lives one directory below the project root; confirm.
sys.path.append(os.path.dirname(os.getcwd()))

from src.load_covid19 import load_covid19
import pandas as pd



In [2]:
# Load the raw dataset through the project-local helper. The result is used as
# a pandas DataFrame by the cells below.
df = load_covid19()

Dataset already exists at /home/philipp/Dokumente/Master_Data_Science_Fernuni_Hagen/Projektpraktikum_Web_Science/covid-19-risiko-erkennung/src/../data/raw/covid19-dataset. Skipping download.


In [3]:
# Decode the raw, integer-coded columns into typed columns.
#
# Coding handled by the mappings below: 1 -> yes / first label, 2 -> no /
# second label. Every other code (including the 97 / 99 "missing" markers) is
# absent from the mapping dicts, so Series.map() turns it into a missing value
# by itself -- no extra DataFrame.replace() pass is needed afterwards.

# Guard against re-running this cell on an already cleaned frame: mapping the
# converted boolean columns a second time would silently turn every False into
# a missing value, so fail before anything is modified.
if 'DATE_DIED' not in df.columns:
    raise RuntimeError(
        "Column 'DATE_DIED' not found: the frame looks already cleaned. "
        "Reload the data before running this cell again."
    )

# Yes/no columns that become nullable booleans.
bool_columns = ['PNEUMONIA', 'PREGNANT', 'DIABETES', 'COPD', 'ASTHMA', 'INMSUPR',
                'HIPERTENSION', 'CARDIOVASCULAR', 'RENAL_CHRONIC', 'OTHER_DISEASE', 'OBESITY', 'TOBACCO',
                'INTUBED', 'ICU']

# Codes that mark missing data in the raw file. They are not referenced below
# (the mappings already drop them); the name is kept in case other cells use it.
missing_values = [97, 99]

# 1 -> True, 2 -> False, anything else -> <NA> (nullable 'boolean' dtype).
for col in bool_columns:
    df[col] = df[col].map({1: 1, 2: 0}).astype('boolean')

# Two-level categoricals; unmapped codes become missing categories.
df['SEX'] = df['SEX'].map({1: 'female', 2: 'male'}).astype('category')
df['PATIENT_TYPE'] = (
    df['PATIENT_TYPE']
    .map({1: 'returned home', 2: 'hospitalization'})
    .astype('category')
)

# DATE_DIED uses the sentinel '9999-99-99' for "no death recorded".
# DIED is True for every other non-missing value. The dates are deliberately
# NOT parsed: pd.to_datetime(..., errors='coerce') silently turns any date that
# does not match the inferred format into NaT, which would mislabel such
# patients as survivors.
date_died = df['DATE_DIED']
df['DIED'] = (date_died.notna() & date_died.ne('9999-99-99')).astype('boolean')

# DIED replaces the original DATE_DIED column.
df.drop(columns='DATE_DIED', inplace=True)


print("\nColumns and Data Types:\n", df.dtypes)


Columns and Data Types:
 USMER                      int64
MEDICAL_UNIT               int64
SEX                     category
PATIENT_TYPE            category
INTUBED                  boolean
PNEUMONIA                boolean
AGE                        int64
PREGNANT                 boolean
DIABETES                 boolean
COPD                     boolean
ASTHMA                   boolean
INMSUPR                  boolean
HIPERTENSION             boolean
OTHER_DISEASE            boolean
CARDIOVASCULAR           boolean
OBESITY                  boolean
RENAL_CHRONIC            boolean
TOBACCO                  boolean
CLASIFFICATION_FINAL       int64
ICU                      boolean
DIED                     boolean
dtype: object


In [4]:
# Show the cleaned frame; the rendered output is a truncated view of its rows
# and columns.
df

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,...,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU,DIED
0,2,1,female,returned home,,True,65,False,False,False,...,False,True,False,False,False,False,False,3,,True
1,2,1,male,returned home,,True,72,,False,False,...,False,True,False,False,True,True,False,5,,True
2,2,1,male,hospitalization,True,False,55,,True,False,...,False,False,False,False,False,False,False,3,False,True
3,2,1,female,returned home,,False,53,False,False,False,...,False,False,False,False,False,False,False,7,,True
4,2,1,male,returned home,,False,68,,True,False,...,False,True,False,False,False,False,False,3,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,2,13,male,returned home,,False,40,,False,False,...,False,False,False,False,False,False,False,7,,False
1048571,1,13,male,hospitalization,False,False,51,,False,False,...,False,True,False,False,False,False,False,7,False,False
1048572,2,13,male,returned home,,False,55,,False,False,...,False,False,False,False,False,False,False,7,,False
1048573,2,13,male,returned home,,False,28,,False,False,...,False,False,False,False,False,False,False,7,,False


In [5]:
# Preview the first five rows of the cleaned frame.
df.head()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,...,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU,DIED
0,2,1,female,returned home,,True,65,False,False,False,...,False,True,False,False,False,False,False,3,,True
1,2,1,male,returned home,,True,72,,False,False,...,False,True,False,False,True,True,False,5,,True
2,2,1,male,hospitalization,True,False,55,,True,False,...,False,False,False,False,False,False,False,3,False,True
3,2,1,female,returned home,,False,53,False,False,False,...,False,False,False,False,False,False,False,7,,True
4,2,1,male,returned home,,False,68,,True,False,...,False,True,False,False,False,False,False,3,,False
