# COVID-19 Documents Data Preprocessing

This Jupyter notebook demonstrates how to preprocess COVID-19 article data with Python. Preprocessing makes the data usable for analysis; it consists of removing duplicate articles, excluding non-English documents, cleaning the text, and reformatting the data table.

In [1]:
import pandas as pd
import re
import json
import nltk
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [2]:
# Directory holding the raw JSON exports. File names are appended directly,
# so the trailing slash is required.
INPUT_PATH = '../Data/covid-json/'
# Directory the resulting TSV files are written to. File names are appended
# directly, so the trailing slash is required.
OUTPUT_PATH = '../Data/preprocessed_data/'

## Load Data

In [3]:
# Read each JSON export into its own dataframe with pandas, then report the
# (rows, columns) shape of every frame.

df1, df2, df3 = (
    pd.read_json(INPUT_PATH + file_name)
    for file_name in ("covid_19.json", "covid19.json", "sars_cov_2.json")
)
print(df1.shape, df2.shape, df3.shape)

(464516, 73) (118093, 73) (98716, 71)


In [4]:
# Combine the dataframes
#
# ignore_index=True gives every row of the combined frame its own label.
# Each source frame carries its own index, so stacking them as-is can repeat
# labels, and any later label-based operation (e.g. DataFrame.drop) would then
# act on several unrelated rows at once.

frames = [df1, df2, df3]
covid_df = pd.concat(frames, ignore_index=True)

# Remove duplicated articles by '_id' (the first occurrence is kept)
covid_df.drop_duplicates(subset=['_id'], keep='first', inplace=True)
covid_df.shape

(547835, 73)

## Exclude non-English articles

In [8]:
def detect_language(text):
    """
        Detect the language of a piece of text.

        text: the string handed to langdetect's `detect`.

        Returns the language code reported by `detect` (e.g. 'en'), or the
        string 'Error' when detection fails for any reason (for instance,
        text with nothing to detect on).
    """
    try:
        language = detect(text)
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate and a long-running
        # .apply() over the corpus can be interrupted.
        language = 'Error'
    return language

In [9]:
# Build one text field per article from its title and both abstract columns.
# Missing values are replaced with '' first: astype(str) alone would turn them
# into the literal words "nan" / "None", which would end up in the text.
covid_df['text'] = (
    covid_df['title'].fillna('').astype(str) + ' '
    + covid_df['abstract'].fillna('').astype(str) + ' '
    + covid_df['pubmed-abstract'].fillna('').astype(str)
)
covid_df['language'] = covid_df['text'].apply(detect_language)

# Keep English articles only. .copy() makes the result an independent frame,
# so adding / dropping columns on it later does not operate on a slice of
# covid_df.
english_covid_df = covid_df.loc[covid_df['language'] == 'en'].copy()

In [14]:
# (rows, columns) of the frame that remains after the English-only filter
english_covid_df.shape

(470574, 75)

## Cleaning text

In [19]:
# Compiled once so they are not rebuilt for every document.
_HTML_TAG_RE = re.compile(r'<[^>]+>')
_MULTI_SPACE_RE = re.compile(r' {2,}')


def cleanhtml(raw_html):
    """
        Strip HTML tags from a string.

        raw_html: text that may contain tags of the form <...>.

        Every tag is replaced by a single space, then each run of two or more
        spaces is collapsed to one space. Returns the resulting string.
    """
    without_tags = _HTML_TAG_RE.sub(' ', raw_html)
    # ' {2,}' collapses a whole run in one pass; replacing '  ' with ' ' only
    # halves each run and leaves double spaces behind.
    return _MULTI_SPACE_RE.sub(' ', without_tags)


# Penn Treebank tag -> WordNet part of speech passed to the lemmatizer.
# Pronoun tags map to 'n' because that is the lemmatizer's default POS.
# Tags that are not listed leave the word un-lemmatized.
_PENN_TO_WORDNET_POS = {
    **dict.fromkeys(['NN', 'NNS', 'NNP', 'NNPS'], 'n'),                  # nouns
    **dict.fromkeys(['PRP', 'PRP$', 'WP', 'WP$'], 'n'),                  # pronouns, e.g. their, self, what
    **dict.fromkeys(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'], 'v'),     # verbs, e.g. experienced, evaluating
    **dict.fromkeys(['JJ', 'JJR', 'JJS'], 'a'),                          # adjectives, e.g. significant, clinical
    **dict.fromkeys(['RB', 'RBR', 'RBS', 'WRB'], 'r'),                   # adverbs, e.g. seriously, nationally
}

# Runs of word characters that contain no digit (letters and underscores).
_WORD_RE = re.compile(r"\b[^\d\W]+\b")

# Filled on first use so the lemmatizer and the stop-word set are built once
# instead of once per document.
_CLEANING_RESOURCES = {}


def _cleaning_resources():
    """Return the shared (lemmatizer, English stop-word set), creating them on first call."""
    if not _CLEANING_RESOURCES:
        _CLEANING_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _CLEANING_RESOURCES['stop_words'] = frozenset(nltk.corpus.stopwords.words('english'))
    return _CLEANING_RESOURCES['lemmatizer'], _CLEANING_RESOURCES['stop_words']


def cleaning_text(text):
    """
        Clean one document and return its tokens joined by single spaces.

        Remove html tags
        Remove stop-words (compared case-insensitively)
        No digits
        No word length less than or equal to 3
        Convert to lowercase (all-uppercase words are left as they are)
        Lemmatize each word according to its part-of-speech tag

        text: raw document string.
        Returns the cleaned text; an empty string when no word survives.
    """
    lemmatizer, stop_words = _cleaning_resources()

    # remove html tags; '-' becomes '_' so a hyphenated term such as
    # "long-term" is matched as the single token "long_term".
    # NOTE: a token with a digit directly attached (e.g. "covid_19") is not
    # matched by _WORD_RE at all and is therefore dropped.
    cleantext = cleanhtml(text).replace('-', '_')

    tokens = []
    for sentence in nltk.sent_tokenize(cleantext):
        words = _WORD_RE.findall(sentence)
        # POS-tag the words in their original casing, sentence by sentence.
        for word, tag in nltk.pos_tag(words):
            # The stop-word list is lowercase, so compare the lowercased word;
            # otherwise capitalised stop-words such as "This" slip through.
            if word.lower() in stop_words:
                continue
            if word.isdigit():
                continue
            if not word.isupper():
                word = word.lower()
            if len(word) <= 3:
                continue
            wordnet_pos = _PENN_TO_WORDNET_POS.get(tag)
            if wordnet_pos is not None:
                word = lemmatizer.lemmatize(word, wordnet_pos)
            tokens.append(word)

    return ' '.join(tokens)

In [20]:
def removing_stopwords(text):
    """
        Remove common-words

        Drops every space-separated word of `text` that is an English NLTK
        stop-word or one of the corpus-specific words listed below, and
        returns the remaining words joined by single spaces.
    """
    # Corpus-wide terms and section headings that carry no topical signal.
    # Each entry needs its own trailing comma: two adjacent string literals
    # are silently concatenated into one word.
    avoiding_words = ['covid_19', 'covid', 'COVID', 'covid_', 'COVID_', 'CORONAVIRUS',
                      'SARS_COV_2', 'coronavirus', 'coronaviruses',
                      'sars_cov_2', 'conclusion', 'CONCLUSION', 'objective', 'OBJECTIVE',
                      'ABSTRACT', 'BACKGROUND',
                      'abstract', 'background', 'AUTHOR', 'DISCLOSURE', 'author', 'disclosure',
                      'title', 'TITLE']

    # A set gives constant-time membership tests for every word of the text.
    excluded_words = set(nltk.corpus.stopwords.words('english'))
    excluded_words.update(avoiding_words)

    kept_words = [word for word in text.split(' ') if word not in excluded_words]
    return ' '.join(kept_words)

In [21]:
# text cleaning part1
# Run cleaning_text over every article and store the result as the 'text'
# column (placed last), replacing the raw text.

text_cleaning = cleaning_text
english_covid_df = english_covid_df.drop(columns=['text']).assign(
    text=english_covid_df['text'].apply(text_cleaning)
)

In [22]:
# text cleaning part2
# Run removing_stopwords over every article and store the result as the
# 'text' column (placed last), replacing the previous text.

text_cleaning = removing_stopwords
english_covid_df = english_covid_df.drop(columns=['text']).assign(
    text=english_covid_df['text'].apply(text_cleaning)
)

## Drop articles after text cleaning

During text cleaning we exclude any word that is a digit, a stop-word, or a special character. As a result, some articles are left with zero words after cleaning. We drop these zero-word articles from the corpus before applying topic modelling.

In [None]:
# drop zero-word articles
# Rows are selected with a boolean mask instead of being dropped by index
# label: a label-based drop removes every row that shares a label, which
# would discard valid articles whenever index labels repeat.
has_text = english_covid_df['text'].str.len() >= 3
# reset_index(drop=True) leaves the result with a clean 0..n-1 index.
preprocessed_df = english_covid_df.loc[has_text].reset_index(drop=True)

In [28]:
# save data preprocessing results
# Two tab-separated files are written: every column except 'text', and the
# ('_id', 'text') pairs.

column_list = [column for column in preprocessed_df.columns if column != 'text']
tsv_options = {'sep': '\t', 'encoding': 'utf-8', 'index': False}

preprocessed_df.loc[:, column_list].to_csv(OUTPUT_PATH + "merged_covid_articles.tsv", **tsv_options)
preprocessed_df.loc[:, ['_id', 'text']].to_csv(OUTPUT_PATH + "preprocessed_data.tsv", **tsv_options)