In [1]:
import numpy as np
import pandas as pd

In [2]:
data_set_url = 'https://github.com/shunjid/cn-covid-storybook/blob/master/data-re-translation/out/covid-en-0.9.csv?raw=true'
df = pd.read_csv(filepath_or_buffer=data_set_url)
df.head(3)

Unnamed: 0,Age,Gender,Case Announcement Province,Event,Description
0,58,female,Sichuan Province,visit patient,"On January 26, 10:00, visit the patient at Shi..."
1,58,female,Sichuan Province,Visit a critically ill father,"On January 27, 16:00, visit the critically ill..."
2,58,female,Sichuan Province,Funeral,"January 31-February 5, funeral in Baohua Villa..."


# Keyword matching algorithm

In this section, we will be working with our translated dataset. This algorithm will be matching each of expected keywords to be present in a list of decription or, not and will return a binary list that contains 0/1 as per order of the description.

**Reason:** To identify and extract possible symptomps of a COVID-19 patient in the list of description.

In [3]:
def match_keywords(description_list, keywords_list):
  matched_binary_list = []
  # to convert NaN into ''
  filtered_list = ['' if x is np.nan else x for x in description_list]

  # iterate and match
  for item in filtered_list:
    lower_text = item.lower()

    if any(x in lower_text for x in keywords_list):
      matched_binary_list.append(1)
    else:
      matched_binary_list.append(0)


  return matched_binary_list

# Prepare and test NaN for our dataset

In [4]:
# find columns NaN percentage
def check_nan(_df) : 
  percent_nan = _df.isnull().sum() * 100 / len(_df)
  missing_value_in_df = pd.DataFrame(
      {
          'percent_nan': round(percent_nan, 2)
      }
  )
  print(missing_value_in_df)

In [5]:
# We have some NaN in Event
# So replacing them with ''

df.fillna(value='', inplace=True)

In [6]:
# merge description and event
df_desc_event = (df['Description'] + ' ' + df['Event']).to_frame(name='description')
df_desc_event.head()

Unnamed: 0,description
0,"On January 26, 10:00, visit the patient at Shi..."
1,"On January 27, 16:00, visit the critically ill..."
2,"January 31-February 5, funeral in Baohua Villa..."
3,"On February 9, at 20:00, a close contact with ..."
4,"On January 19th, Lei Mouying returned from Wuh..."


In [7]:
# check NaN
check_nan(df_desc_event)

             percent_nan
description          0.0


## Match keywords through algorithm

In this section, we will be matcing all possible keywords to find out the symptomps of COVID-19 patients.

In [8]:
# match fever
matched_fever_list = match_keywords(df_desc_event['description'], ['fever', 'temperature'])
len(matched_fever_list)

9888

In [9]:
df['HasFever'] = matched_fever_list
df['HasFever'].value_counts()

0    8852
1    1036
Name: HasFever, dtype: int64

In [10]:
# match cough
matched_cough_list = match_keywords(df_desc_event['description'], ['cough', 'wheez'])
len(matched_fever_list)

9888

In [11]:
df['HasCough'] = matched_cough_list
df['HasCough'].value_counts()

0    9470
1     418
Name: HasCough, dtype: int64

In [12]:
# match pneumonia
matched_pneumonia_list = match_keywords(df_desc_event['description'], ['pneumonia', 'coronary'])
len(matched_pneumonia_list)

9888

In [13]:
df['HasPneumonia'] = matched_pneumonia_list
df['HasPneumonia'].value_counts()

0    9119
1     769
Name: HasPneumonia, dtype: int64

In [14]:
# match lung infection
matched_lunginf_list = match_keywords(df_desc_event['description'], ['lung', 'lungs', 'breath'])
df['HasLungProblem'] = matched_lunginf_list
df['HasLungProblem'].value_counts()

0    9673
1     215
Name: HasLungProblem, dtype: int64

In [15]:
# match runny nose
matched_runnynose_list = match_keywords(df_desc_event['description'], ['runny nose', 'runny', 'nose'])
df['HasRunnyNose'] = matched_runnynose_list
df['HasRunnyNose'].value_counts()

0    8998
1     890
Name: HasRunnyNose, dtype: int64

In [16]:
# match muscle/pain problems
matched_muscle_problem_list = match_keywords(df_desc_event['description'], ['muscle', 'muscle soreness', 'joint', 'pain', 'headache', 'weak'])
df['HasMusclePainProblem'] = matched_muscle_problem_list
df['HasMusclePainProblem'].value_counts()

0    9786
1     102
Name: HasMusclePainProblem, dtype: int64

In [17]:
# match diarrhea problems
matched_diarrhea_problem_list = match_keywords(df_desc_event['description'], ['diarrhea', 'Diarrhea'])
df['HasDiarrhea'] = matched_diarrhea_problem_list
df['HasDiarrhea'].value_counts()

0    9859
1      29
Name: HasDiarrhea, dtype: int64

In [18]:
# match travel history
matched_travel_history_list = match_keywords(df_desc_event['description'], ['travel', 'taxi', 'bus', 'car', 'cycle', 'train', 'vehicle', 'plane', 'tour', 'tourism', 'journey', 'trip', 'move', 'relative', 'visit', 'meet', 'go '])
df['HasTravelHistory'] = matched_travel_history_list
df['HasTravelHistory'].value_counts()

0    6827
1    3061
Name: HasTravelHistory, dtype: int64

In [19]:
# match isolation
matched_isolation_list = match_keywords(df_desc_event['description'], ['isolation', 'isolated', 'observation', 'quarantine', 'observe', 'monitor', 'treatment', 'inspection', 'observed', 'sent to', 'sent ', 'send', 'suspect', 'suspected', 'appear', 'at home', 'given', 'medicine', 'contact', 'admission', 'admitted'])
df['Isolation'] = matched_isolation_list
df['Isolation'].value_counts()

0    5578
1    4310
Name: Isolation, dtype: int64

In [20]:
# match confirmed cases
matched_confirm_list = match_keywords(df_desc_event['description'], ['confirm', 'confirmed', 'positive', 'virus', 'corona', 'coronavirus', 'infected'])
df['Confirmed'] = matched_confirm_list
df['Confirmed'].value_counts()

0    7907
1    1981
Name: Confirmed, dtype: int64

In [21]:
# match dignosed
matched_diagnosed_list = match_keywords(df_desc_event['description'], ['diagnosis', 'diagnosed'])
df['Diagnosed'] = matched_diagnosed_list
df['Diagnosed'].value_counts()

0    8178
1    1710
Name: Diagnosed, dtype: int64

# Drop unnecessary columns

In [22]:
df.drop(columns=['Description', 'Event'], axis=1, inplace=True)
df.head()

Unnamed: 0,Age,Gender,Case Announcement Province,HasFever,HasCough,HasPneumonia,HasLungProblem,HasRunnyNose,HasMusclePainProblem,HasDiarrhea,HasTravelHistory,Isolation,Confirmed,Diagnosed
0,58,female,Sichuan Province,0,0,0,0,0,0,0,1,0,0,0
1,58,female,Sichuan Province,0,0,0,0,0,0,0,1,0,0,0
2,58,female,Sichuan Province,0,0,0,0,0,0,0,0,0,0,0
3,58,female,Sichuan Province,0,0,0,0,0,0,0,0,1,1,0
4,51,female,Sichuan Province,0,0,0,0,0,0,0,0,0,0,0


In [24]:
df.dtypes

Age                            int64
Gender                        object
Case Announcement Province    object
HasFever                       int64
HasCough                       int64
HasPneumonia                   int64
HasLungProblem                 int64
HasRunnyNose                   int64
HasMusclePainProblem           int64
HasDiarrhea                    int64
HasTravelHistory               int64
Isolation                      int64
Confirmed                      int64
Diagnosed                      int64
dtype: object

In [23]:
from google.colab import files

df.to_csv('preprocessed.csv')
files.download('preprocessed.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>