# COVID-19 Documents Data Preprocessing

This Jupyter notebook demonstrates how to preprocess COVID-19 article data with Python. Preprocessing makes the data usable for analysis; it consists of removing duplicate records, excluding non-English documents, cleaning the text, and reformatting the data table.

In [1]:
import pandas as pd
import re
import json
import nltk
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings
import ijson
# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

In [2]:
# Directory prefix that is prepended to the JSON file names when the search results are read.
INPUT_PATH = '../coronavirus_twenty_years_of_research/search_results/'
# Directory prefix under which the merged, preprocessed table is written.
OUTPUT_PATH = '../coronavirus_twenty_years_of_research/technical_validation/'

In [3]:
# File names (appended to INPUT_PATH) of the three JSON exports that are loaded and merged.
# NOTE(review): presumably one export per search term -- confirm against the data source.
json1 = 'covid_19.json'
json2 = 'covid19.json'
json3 = 'sars_cov_2.json'

## Exclude non-English articles

In [4]:
def language_detection(text):
    """Return the language detected for ``text``, or ``"error"`` if detection fails.

    Args:
        text: The string to classify; it is passed unchanged to ``detect``.

    Returns:
        The value returned by ``detect`` (callers compare it with ``'en'``),
        or the literal string ``"error"`` when ``detect`` raises.
    """
    try:
        return detect(text)
    except Exception:
        # Best effort: any detection failure marks the text as undetectable.
        # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate, so a long-running loop can still be stopped.
        return "error"

## Cleaning text

In [11]:
def remove_white_spaces(text):
    """Collapse every run of whitespace in ``text`` to one space and trim both ends.

    Newlines, carriage returns, tabs, repeated spaces and any other
    whitespace characters are all treated alike.

    Args:
        text: The string to normalise.

    Returns:
        The words of ``text`` separated by single spaces, with no leading or
        trailing whitespace (an empty string if ``text`` holds only whitespace).
    """
    # Splitting on whitespace discards every separator run (and the ends) in
    # one pass; joining puts exactly one space back between the pieces.
    pieces = text.split()
    return " ".join(pieces)

In [12]:
def clear_tags(dataObj, tags_only=False):
    """
    Cleaning - remove tags, URLs, special characters

    Args:
        dataObj: Text that may contain XML/JATS markup, URLs and HTML-entity
            residue.
        tags_only: When True, only markup and URLs are removed and the
            entity/symbol clean-up is skipped.

    Returns:
        The cleaned text, passed through ``remove_white_spaces``.
    """
    # Del Tag + Content (sub-titles):   <jats:title content-type="abstract-subheading">Purpose</jats:title>
    # The opening prefix deliberately omits the trailing ">" so that tags
    # carrying attributes are matched as well.
    title_elements = (('<jats:title', '</jats:title>'), ('<title', '</title>'))
    for open_prefix, close_tag in title_elements:
        start = dataObj.find(open_prefix)
        while start != -1:
            end = dataObj.find(close_tag, start)
            if end == -1:
                # No closing tag from here on, so no later opening tag can be
                # closed either; the generic tag removal below handles the rest.
                break
            # Cut through the end of *this* closing tag. Using its real length
            # keeps the text that follows a plain </title> intact.
            dataObj = dataObj[:start] + " " + dataObj[end + len(close_tag):]
            # Resume at the splice point so a directly adjacent element is found.
            start = dataObj.find(open_prefix, start)

    # Del ALL Tags <....>   (e.g. <p>, <jats:p>, <jats:sec>, <sec>, <jats:italic>, <jats:bold>)
    dataObj = re.sub(r'<[^<]+?>', ' ', dataObj)

    # Del URLs
    # NOTE(review): the first alternative matches only the scheme part
    # (e.g. "https://"), so the host/path of such URLs stays in the text --
    # confirm whether that is intended before tightening the pattern.
    re_url = (r'https?:(?:/{1,3}|[a-z0-9%])'
              r'|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|info)/[a-z0-9.\-]')
    dataObj = re.sub(re_url, '', dataObj)

    if not tags_only:
        ### Del Symbols
        dataObj = re.sub(r'[&;]\d+;*', ' ', dataObj)
        dataObj = re.sub(r'&[A-Z]{4}', ' ', dataObj)
        dataObj = re.sub(r'&\W{2,10};', ' ', dataObj)
        dataObj = re.sub(r'&#\d{2,4};', ' ', dataObj)
        # NOTE(review): these are plain substring replacements, so "amp" and
        # "div" are also removed from inside ordinary words. The three
        # single-space entries are no-ops as written; presumably they were
        # special whitespace characters originally -- confirm.
        redun = ["amp", ";lt", ";gt", "&lt", "&gt", ";p", "div", "&#x0D;", "ldquo", "rdquo", " ", " ", " ", "#160", "/p", ";"]
        for substr in redun:
            dataObj = dataObj.replace(substr, " ")

    return remove_white_spaces(dataObj)

In [4]:

# WordNet part-of-speech code handed to the lemmatizer for each tag that
# ``nltk.pos_tag`` may return; tags that are not listed are not lemmatized.
_WORDNET_POS_BY_TAG = {
    **dict.fromkeys(('NN', 'NNS', 'NNP', 'NNPS'), 'n'),
    # Pronouns (e.g. their, self, what) use the lemmatizer's default POS, 'n'.
    **dict.fromkeys(('PRP', 'PRP$', 'WP', 'WP$'), 'n'),
    **dict.fromkeys(('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'), 'v'),
    **dict.fromkeys(('JJ', 'JJR', 'JJS'), 'a'),
    **dict.fromkeys(('RB', 'RBR', 'RBS', 'WRB'), 'r'),
}

# Corpus-specific words that are dropped in addition to the English stop
# words. Lowercase only: the stop-word test is case-insensitive.
_AVOIDING_WORDS = frozenset({
    'covid_19', 'covid', 'covid_', 'coronavirus', 'coronaviruses',
    'sars_cov_2', 'conclusion', 'objective', 'abstract', 'background',
    'author', 'disclosure', 'title', 'study', 'case', 'analysis',
})

# Filled on first use so the stop-word corpus is read once, not once per text.
_stop_word_cache = None


def _stop_word_set():
    """Return the NLTK English stop words plus ``_AVOIDING_WORDS`` as one set."""
    global _stop_word_cache
    if _stop_word_cache is None:
        english = nltk.corpus.stopwords.words('english')
        _stop_word_cache = frozenset(english) | _AVOIDING_WORDS
    return _stop_word_cache


def cleaning_text(text):
    """
        Remove stop-words (matched case-insensitively)
        No digits
        No words of 3 characters or fewer
        Convert to lowercase (all-uppercase tokens keep their case)

    Args:
        text: Raw text; it is first passed through ``clear_tags``.

    Returns:
        The remaining lemmatized tokens joined by single spaces (an empty
        string when nothing is left).
    """
    # "_" counts as a word character for the token pattern below, so turning
    # "-" into "_" keeps a hyphenated term together as one token.
    cleantext = clear_tags(text).replace('-', '_')
    lemmatizer = WordNetLemmatizer()
    stop_words = _stop_word_set()
    # Runs of word characters with no digit; a token that contains a digit
    # anywhere (e.g. "covid_19") produces no match at all.
    word_pattern = re.compile(r"\b[^\d\W]+\b")

    tokens = []
    for sentence in nltk.sent_tokenize(cleantext):
        for word, tag in nltk.pos_tag(word_pattern.findall(sentence)):
            if not word.isupper():
                word = word.lower()
            wordnet_pos = _WORDNET_POS_BY_TAG.get(tag)
            if wordnet_pos is not None:
                word = lemmatizer.lemmatize(word, wordnet_pos)
            # Compare in lowercase so all-uppercase stop words are dropped too.
            if word.lower() in stop_words:
                continue
            if word.isdigit():
                continue
            if len(word) <= 3:
                continue
            tokens.append(word)

    return ' '.join(tokens)

In [None]:
def read_process_large_json(filepath):
    """Stream a JSON array of article records into a DataFrame of English items.

    Each element of the top-level array is read incrementally with ``ijson``.
    The element is reduced to the wanted columns, its title is converted to
    ``str`` and language-checked, and English titles are cleaned with
    ``cleaning_text`` into a new ``text`` column.

    Args:
        filepath: Path of the JSON file whose top level is an array of objects.

    Returns:
        A ``pandas.DataFrame`` with one row per kept record. A record is kept
        when its title is detected as ``'en'`` and the cleaned text has at
        least 3 characters. Records without a ``title`` key are skipped.
    """
    wanted_columns = frozenset((
        '_id',
        'abstract',
        'URL',
        'created',
        'ISSN',
        'container-title',
        'author',
        'DOI',
        'published',
        'subject',
        'title',
        'link',
        'source',
        'type',
        'publisher',
        'volume',
        'last-updated',
        'issue',
        'funder',
        'pubmed-abstract',
    ))
    records = []
    # ijson parses bytes: binary mode avoids a locale-dependent text decoding
    # step (and the text-mode warning ijson emits).
    with open(filepath, 'rb') as json_file:
        for count, record in enumerate(ijson.items(json_file, 'item'), start=1):
            if count % 20000 == 0:
                print('processed', count)

            item = {key: val for key, val in record.items() if key in wanted_columns}
            if 'title' not in item:
                # Nothing to language-check or clean; skip instead of raising KeyError.
                continue

            title = str(item['title'])
            if language_detection(title) != 'en':
                continue

            cleaned = cleaning_text(title)
            if len(cleaned) >= 3:
                item['title'] = title
                item['text'] = cleaned
                records.append(item)

    return pd.DataFrame(records)

In [None]:
# Load and clean each export in turn, reporting how many records were kept.
df1 = read_process_large_json(f'{INPUT_PATH}{json1}')
print(f'\n=== The number of records in "{json1}" : {len(df1)}')
df2 = read_process_large_json(f'{INPUT_PATH}{json2}')
print(f'\n=== The number of records in "{json2}": {len(df2)}')
df3 = read_process_large_json(f'{INPUT_PATH}{json3}')
print(f'\n=== The number of records in "{json3}": {len(df3)}')


# Stack the three tables and keep only the first row seen for each '_id'.
frames = [df1, df2, df3]
covid_df = pd.concat(frames).drop_duplicates(subset=['_id'], keep='first')

In [19]:
# Write the merged, de-duplicated table as a UTF-8 tab-separated file,
# leaving out the DataFrame index.
merged_tsv_path = OUTPUT_PATH + "merged_covid_articles.tsv"
covid_df.to_csv(merged_tsv_path, sep='\t', index=False, encoding='utf-8')