In [50]:
import pandas as pd
import glob
import seaborn as sns
import numpy as np
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer

In [2]:
local_path = r'./covidclinicaldata-master/data/'

filenames = glob.glob(local_path + "/*.csv")

dfs = [pd.read_csv(filename) for filename in filenames]

In [3]:
df = pd.concat(dfs, ignore_index=True)

In [4]:
df.set_index(pd.DatetimeIndex(df['batch_date']), inplace = True)

df.drop(columns='batch_date', inplace=True)

# Confirm.
df.head()

Unnamed: 0_level_0,test_name,swab_type,covid19_test_results,age,high_risk_exposure_occupation,high_risk_interactions,diabetes,chd,htn,cancer,...,loss_of_smell,loss_of_taste,runny_nose,muscle_sore,sore_throat,cxr_findings,cxr_impression,cxr_label,cxr_link,er_referral
batch_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-16,"SARS-CoV-2, NAA",Nasal,Negative,15,False,False,False,False,False,False,...,False,False,False,False,False,,,,,False
2020-06-16,SARS COV2 NAAT,Nasopharyngeal,Negative,20,False,False,False,False,False,False,...,False,False,False,False,False,,,,,False
2020-06-16,"SARS-CoV-2, NAA",Nasal,Negative,14,False,,False,False,False,False,...,False,False,False,False,False,,,,,False
2020-06-16,"SARS-CoV-2, NAA",Nasopharyngeal,Negative,32,False,False,False,False,False,False,...,False,False,False,False,False,,,,,False
2020-06-16,"SARS-CoV-2, NAA",Nasal,Negative,29,False,True,False,False,False,False,...,False,False,False,False,False,,,,,False


In [5]:
df.sort_index(inplace=True)

In [6]:
df.columns

Index(['test_name', 'swab_type', 'covid19_test_results', 'age',
       'high_risk_exposure_occupation', 'high_risk_interactions', 'diabetes',
       'chd', 'htn', 'cancer', 'asthma', 'copd', 'autoimmune_dis', 'smoker',
       'temperature', 'pulse', 'sys', 'dia', 'rr', 'sats', 'rapid_flu_results',
       'rapid_strep_results', 'ctab', 'labored_respiration', 'rhonchi',
       'wheezes', 'days_since_symptom_onset', 'cough', 'cough_severity',
       'fever', 'sob', 'sob_severity', 'diarrhea', 'fatigue', 'headache',
       'loss_of_smell', 'loss_of_taste', 'runny_nose', 'muscle_sore',
       'sore_throat', 'cxr_findings', 'cxr_impression', 'cxr_label',
       'cxr_link', 'er_referral'],
      dtype='object')

In [7]:
df.shape

(93995, 45)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 93995 entries, 2020-04-07 to 2020-10-20
Data columns (total 45 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   test_name                      93995 non-null  object 
 1   swab_type                      93995 non-null  object 
 2   covid19_test_results           93995 non-null  object 
 3   age                            93995 non-null  int64  
 4   high_risk_exposure_occupation  93826 non-null  object 
 5   high_risk_interactions         69168 non-null  object 
 6   diabetes                       93995 non-null  bool   
 7   chd                            93995 non-null  bool   
 8   htn                            93995 non-null  bool   
 9   cancer                         93995 non-null  bool   
 10  asthma                         93995 non-null  bool   
 11  copd                           93995 non-null  bool   
 12  autoimmune_dis               

In [9]:
df['covid19_test_results'].value_counts()

Negative    92682
Positive     1313
Name: covid19_test_results, dtype: int64

In [10]:
# df =df[df['covid19_test_results'] == 'Positive']
# df.head()

In [11]:
df.isnull().sum()

test_name                            0
swab_type                            0
covid19_test_results                 0
age                                  0
high_risk_exposure_occupation      169
high_risk_interactions           24827
diabetes                             0
chd                                  0
htn                                  0
cancer                               0
asthma                               0
copd                                 0
autoimmune_dis                       0
smoker                               0
temperature                      46453
pulse                            45716
sys                              47472
dia                              47472
rr                               52547
sats                             46460
rapid_flu_results                93741
rapid_strep_results              93604
ctab                             58528
labored_respiration              45248
rhonchi                          70651
wheezes                  

In [12]:
df['temperature'].describe()

count    47542.000000
mean        36.794765
std          0.289229
min         33.500000
25%         36.650000
50%         36.800000
75%         36.950000
max         39.600000
Name: temperature, dtype: float64

In [13]:
df['temperature']= df['temperature'].fillna(36.900000)

In [14]:
df['pulse'].describe()

count    48279.000000
mean        76.933677
std         13.238270
min         35.000000
25%         68.000000
50%         76.000000
75%         85.000000
max        160.000000
Name: pulse, dtype: float64

In [15]:
df['pulse'] = df['pulse'].fillna(80.000000)

In [16]:
df['sys'].describe()

count    46523.000000
mean       123.149045
std         16.113033
min         50.000000
25%        112.000000
50%        121.000000
75%        132.000000
max        235.000000
Name: sys, dtype: float64

In [17]:
df['sys'] = df['sys'].fillna(123.000000)

In [18]:
df['dia'].describe()

count    46523.000000
mean        78.297079
std          9.464313
min         15.000000
25%         72.000000
50%         78.000000
75%         84.000000
max        150.000000
Name: dia, dtype: float64

In [19]:
df['dia'] = df['dia'].fillna(79.000000)

In [20]:
df['rr'].describe()

count    41448.000000
mean        14.710601
std          1.971628
min          0.000000
25%         13.000000
50%         15.000000
75%         16.000000
max         40.000000
Name: rr, dtype: float64

In [21]:
df['rr'] =df['rr'].fillna(16.000000)

In [22]:
df['sats'].describe()

count    47535.000000
mean        98.238224
std          1.423986
min         55.000000
25%         97.000000
50%         98.000000
75%         99.000000
max        100.000000
Name: sats, dtype: float64

In [23]:
df['sats'] = df['sats'].fillna(98.000000)

In [24]:
df['days_since_symptom_onset'].describe()

count    15865.000000
mean         7.063914
std         17.232417
min          1.000000
25%          2.000000
50%          3.000000
75%          7.000000
max        300.000000
Name: days_since_symptom_onset, dtype: float64

In [25]:
# def fill_int_na_with_median:
#     df.describe()
    

In [26]:
df['days_since_symptom_onset'] = df['days_since_symptom_onset'].fillna(3.000000)

In [27]:
df.drop(columns=['cxr_findings', 'cxr_impression', 'cxr_label', 'cxr_link', 'sob_severity', 'rapid_flu_results', 'rapid_strep_results', 'er_referral', 'cough_severity'], inplace=True)

In [28]:
df.isnull().sum()

test_name                            0
swab_type                            0
covid19_test_results                 0
age                                  0
high_risk_exposure_occupation      169
high_risk_interactions           24827
diabetes                             0
chd                                  0
htn                                  0
cancer                               0
asthma                               0
copd                                 0
autoimmune_dis                       0
smoker                               0
temperature                          0
pulse                                0
sys                                  0
dia                                  0
rr                                   0
sats                                 0
ctab                             58528
labored_respiration              45248
rhonchi                          70651
wheezes                          66507
days_since_symptom_onset             0
cough                    

In [29]:
df['test_name'].value_counts()

SARS-CoV-2, NAA                                           48932
Rapid COVID-19 PCR Test                                   26805
SARS COV2 NAAT                                             6365
COVID-19 PCR External Result                               5998
Rapid COVID-19 Test                                        3714
COVID-19 Vitagene At Home Test Kit                          839
SARS-CoV-2 (COVID-19) by NAAT/RT-PCR/TMA, Unknown Risk      590
SARS CoV w/CoV 2 RNA                                        396
SARS COV 2 RNA RTPCR                                        354
COVID-19 PCR Test (Curative)                                  1
SARS-CoV-2 (COVID-19) by NAAT/RT-PCR/TMA, High Risk           1
Name: test_name, dtype: int64

In [30]:
df.dropna(subset=['high_risk_exposure_occupation', 'sob', 'diarrhea', 'fatigue', 'headache', 'loss_of_smell', 'loss_of_taste', 'runny_nose', 'muscle_sore', 'sore_throat', 'cough'], inplace=True)

In [31]:
df.isnull().sum()

test_name                            0
swab_type                            0
covid19_test_results                 0
age                                  0
high_risk_exposure_occupation        0
high_risk_interactions           24656
diabetes                             0
chd                                  0
htn                                  0
cancer                               0
asthma                               0
copd                                 0
autoimmune_dis                       0
smoker                               0
temperature                          0
pulse                                0
sys                                  0
dia                                  0
rr                                   0
sats                                 0
ctab                             58336
labored_respiration              45121
rhonchi                          70453
wheezes                          66311
days_since_symptom_onset             0
cough                    

In [52]:
df['temperature'].value_counts()

36.90    50183
36.80     4739
36.85     4112
36.75     3973
37.00     3789
         ...  
39.40        1
38.75        1
34.30        1
39.50        1
34.65        1
Name: temperature, Length: 86, dtype: int64

In [32]:
df.isnull()

Unnamed: 0_level_0,test_name,swab_type,covid19_test_results,age,high_risk_exposure_occupation,high_risk_interactions,diabetes,chd,htn,cancer,...,fever,sob,diarrhea,fatigue,headache,loss_of_smell,loss_of_taste,runny_nose,muscle_sore,sore_throat
batch_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-07,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2020-04-07,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
2020-04-07,False,False,False,False,False,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
2020-04-07,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2020-04-07,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-10-20,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2020-10-20,False,False,False,False,False,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
2020-10-20,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2020-10-20,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [51]:
onehotencoder = OneHotEncoder() 
  
df = np.array(ColumnTransformer.fit_transform(df), dtype = np.boolean) 

TypeError: fit_transform() missing 1 required positional argument: 'X'

In [33]:
for i in range(len(df.index)) :
    print("Nan in row ", i , " : " ,  df.iloc[i].isnull().sum())
#https://thispointer.com/python-pandas-count-number-of-nan-or-missing-values-in-dataframe-also-row-column-wise/

Nan in row  0  :  0
Nan in row  1  :  1
Nan in row  2  :  2
Nan in row  3  :  1
Nan in row  4  :  1
Nan in row  5  :  1
Nan in row  6  :  1
Nan in row  7  :  5
Nan in row  8  :  0
Nan in row  9  :  2
Nan in row  10  :  0
Nan in row  11  :  2
Nan in row  12  :  2
Nan in row  13  :  5
Nan in row  14  :  0
Nan in row  15  :  2
Nan in row  16  :  0
Nan in row  17  :  2
Nan in row  18  :  1
Nan in row  19  :  0
Nan in row  20  :  2
Nan in row  21  :  2
Nan in row  22  :  5
Nan in row  23  :  2
Nan in row  24  :  2
Nan in row  25  :  4
Nan in row  26  :  2
Nan in row  27  :  2
Nan in row  28  :  1
Nan in row  29  :  1
Nan in row  30  :  0
Nan in row  31  :  6
Nan in row  32  :  0
Nan in row  33  :  0
Nan in row  34  :  2
Nan in row  35  :  2
Nan in row  36  :  2
Nan in row  37  :  2
Nan in row  38  :  0
Nan in row  39  :  1
Nan in row  40  :  1
Nan in row  41  :  1
Nan in row  42  :  1
Nan in row  43  :  2
Nan in row  44  :  1
Nan in row  45  :  5
Nan in row  46  :  2
Nan in row  47  :  2
Na

In [39]:
# def find_nulls(df):
#     for i in range(len(df.index)):
#         return df.iloc[i].isnull().sum()

# find_nulls(df)

0

In [40]:
for i in range(len(df.index)):
    nulls = df.iloc[i].isnull().sum()
  

df['total_nulls'] = nulls


In [42]:
nulls

6

In [41]:
df['total_nulls'].value_counts()

6    93785
Name: total_nulls, dtype: int64

In [49]:
df = df[df['total_nulls'] < 4]

In [50]:
df.shape

(0, 37)

In [56]:
def change_temp(df):
    df['fever'] = []
    for i in range(len(df['temperature'])):
        if i > 37:
            i.append(df['fever'] == True)
        else:
            i.append(df['fever'] == False)
    return df['fever']
change_temp(df)           

ValueError: Length of values (0) does not match length of index (93785)

In [38]:
df.isnull().sum()

test_name                            0
swab_type                            0
covid19_test_results                 0
age                                  0
high_risk_exposure_occupation        0
high_risk_interactions           24656
diabetes                             0
chd                                  0
htn                                  0
cancer                               0
asthma                               0
copd                                 0
autoimmune_dis                       0
smoker                               0
temperature                          0
pulse                                0
sys                                  0
dia                                  0
rr                                   0
sats                                 0
ctab                             58336
labored_respiration              45121
rhonchi                          70453
wheezes                          66311
days_since_symptom_onset             0
cough                    

In [39]:
# def count_nan(x):
#     count = []
#     nulls = df.isna().any(axis=1)
#     nulls.append(count)
#     return count
    
# count_nan(df)

In [40]:
df[df.isna().any(axis=1)]

Unnamed: 0_level_0,test_name,swab_type,covid19_test_results,age,high_risk_exposure_occupation,high_risk_interactions,diabetes,chd,htn,cancer,...,sob,diarrhea,fatigue,headache,loss_of_smell,loss_of_taste,runny_nose,muscle_sore,sore_throat,total_nulls
batch_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-07,SARS CoV w/CoV 2 RNA,Nasopharyngeal,Negative,62,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,6
2020-04-07,SARS COV 2 RNA RTPCR,Nasopharyngeal,Negative,35,False,,False,False,False,False,...,False,False,False,False,False,False,False,False,False,6
2020-04-07,SARS COV 2 RNA RTPCR,Nasopharyngeal,Negative,31,False,,False,False,False,False,...,True,False,False,False,False,False,False,False,False,6
2020-04-07,SARS COV 2 RNA RTPCR,Nasopharyngeal,Negative,75,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,6
2020-04-07,SARS COV 2 RNA RTPCR,Nasopharyngeal,Negative,53,False,False,False,False,True,False,...,True,False,False,True,False,False,False,True,True,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-10-20,"SARS-CoV-2, NAA",Nasal,Negative,41,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,6
2020-10-20,Rapid COVID-19 PCR Test,Nasal,Negative,58,False,,False,False,False,False,...,False,False,False,False,False,False,False,False,False,6
2020-10-20,"SARS-CoV-2, NAA",Nasal,Negative,37,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,6
2020-10-20,"SARS-CoV-2, NAA",Nasal,Negative,38,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,6


In [41]:
df['high_risk_interactions'].value_counts()

False    54928
True     14201
Name: high_risk_interactions, dtype: int64

In [42]:
df['labored_respiration'].value_counts()

False    48276
True       388
Name: labored_respiration, dtype: int64

In [43]:
df['rhonchi'].value_counts()

False    16248
True      7084
Name: rhonchi, dtype: int64

In [44]:
df['wheezes'].value_counts()

False    17942
True      9532
Name: wheezes, dtype: int64

In [45]:
# df['cough_severity'].value_counts()

In [46]:
df['fever'].value_counts()

False    69012
True      2023
Name: fever, dtype: int64

In [47]:
# df['er_referral'].value_counts()

In [48]:
#sns.heatmap(data=df, annot=True)