# Step1. Text Cleaning

This Jupyter notebook demonstrates how to clean COVID-19 article data using Python. Data cleaning aims to make the data useful for analysis.

In [1]:
import zipfile
import pandas as pd
import json
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Location of the raw search results that this notebook reads.
INPUT_PATH = "../data/search_results/"
JSON_FILE = "search_results.json"

# Location of the tab-separated file that this notebook writes.
OUTPUT_PATH = "../data/processed_data/"
OUTPUT_FILE = "processed_data.tsv"

## Load input data

In [3]:
# Read the search results. The context manager closes the file handle as soon
# as parsing is done, and the encoding is pinned so the result does not depend
# on the platform's default locale.
with open(INPUT_PATH + JSON_FILE, encoding='utf-8') as fh:
    json_f = json.load(fh)
# The file appears to hold a JSON document that was itself stored as a JSON
# string (hence the second decode). A file that already decodes to a
# list/dict is accepted as-is instead of raising TypeError.
tmp_dict = json.loads(json_f) if isinstance(json_f, str) else json_f
# Flatten the (possibly nested) records into a dataframe.
df = pd.json_normalize(tmp_dict)
# Keep only the columns used downstream; a missing column raises KeyError.
selected_columns = ['_id', 'URL', 'created', 'ISSN', 'container-title', 'author', 'DOI',
       'published', 'subject', 'title', 'link', 'source', 'type', 'publisher',
       'volume', 'last-updated', 'issue', 'funder']
df = df[selected_columns]
df.shape

(558964, 18)

In [None]:
# Fetch the NLTK data packages this notebook relies on, one after another.
for resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)

## Cleaning text

In [12]:
def remove_white_spaces(text):
    """
    Collapse every run of whitespace in ``text`` into a single space.

    Newlines, carriage returns, tabs and any other character matched by the
    regex class ``\\s`` are treated alike. Leading and trailing whitespace is
    removed from the result.
    """
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

In [13]:
def clear_tags(dataObj, tags_only=False):
    """
    Cleaning - remove tags, URLs, special characters.

    Steps, in order:
      1. delete sub-title elements (``<jats:title ...>...</jats:title>`` and
         ``<title ...>...</title>``) together with their content,
      2. replace every remaining ``<...>`` tag with a space,
      3. delete URL fragments,
      4. unless ``tags_only`` is true, blank out HTML-entity residue,
      5. normalise whitespace with ``remove_white_spaces``.

    Args:
        dataObj: Text (str) that may contain JATS/HTML markup.
        tags_only: When true, skip step 4 and keep entity-like text as is.

    Returns:
        The cleaned string with single spaces between words and no leading or
        trailing whitespace.
    """
    # Del Tag + Content (sub-titles), e.g.
    #   <jats:title content-type="abstract-subheading">Purpose</jats:title>
    # The back-reference ties each closing tag to its own opening tag name, so
    # exactly that closing tag is consumed whatever its length, and adjacent
    # sub-titles are all removed. DOTALL lets a sub-title span several lines.
    # An opening tag without a matching closing tag is left for the next step.
    dataObj = re.sub(r'<(jats:title|title)(?:\s[^>]*)?>.*?</\1\s*>', ' ', dataObj, flags=re.DOTALL)

    # Del ALL remaining tags <....>, e.g. <p>, <jats:p>, <jats:sec>, <jats:italic>
    dataObj = re.sub(r'<[^<]+?>', ' ', dataObj)

    # Del URLs
    # NOTE(review): for http(s) URLs the first alternative matches only the
    # scheme prefix ("http://" or "http:" plus one character), so host and path
    # stay in the text. The second alternative matches a lowercase host under
    # the listed TLDs plus "/" and one more character. Behaviour is kept as is;
    # confirm whether the whole URL was meant to be removed.
    re_url = r'https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|info)/[a-z0-9.\-]'
    dataObj = re.sub(re_url, '', dataObj)

    if not tags_only:
        ### Del Symbols
        # '&' or ';' followed by digits and any number of trailing ';'
        dataObj = re.sub(r'[&;]\d+;*', ' ', dataObj)
        # '&' followed by four uppercase letters
        dataObj = re.sub(r'&[A-Z]{4}', ' ', dataObj)
        # '&' + 2-10 non-word characters + ';'
        # NOTE(review): \W matches NON-word characters; possibly \w was meant.
        dataObj = re.sub(r'&\W{2,10};', ' ', dataObj)
        # numeric character references such as &#160;
        dataObj = re.sub(r'&#\d{2,4};', ' ', dataObj)
        # Literal leftovers of (double-)escaped markup. Order matters: ';' goes
        # last so the longer fragments that contain it are matched first.
        # NOTE(review): plain substring replacement also hits these fragments
        # inside ordinary words (e.g. "amp" in "sample", "div" in "individual").
        redun = ["amp", ";lt", ";gt", "&lt", "&gt", ";p", "div", "&#x0D;", "ldquo", "rdquo", " ", " ", " ", "#160", "/p", ";"]
        for substr in redun:
            dataObj = dataObj.replace(substr, " ")

    return remove_white_spaces(dataObj)

In [14]:

# Penn Treebank POS tag -> WordNet POS code handed to the lemmatizer.
# Pronoun tags use 'n', the default POS of WordNetLemmatizer.lemmatize.
# Tokens with any other tag are not lemmatized.
_WORDNET_POS_BY_TAG = {
    # nouns
    'NN': 'n', 'NNS': 'n', 'NNP': 'n', 'NNPS': 'n',
    # pronouns, e.g. their, self, what
    'PRP': 'n', 'PRP$': 'n', 'WP': 'n', 'WP$': 'n',
    # verbs, e.g. experienced, based, evaluating, trying
    'VB': 'v', 'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v',
    # adjectives, e.g. significant, pandemic, clinical, sensitive
    'JJ': 'a', 'JJR': 'a', 'JJS': 'a',
    # adverbs, e.g. seriously, alone, nationally
    'RB': 'r', 'RBR': 'r', 'RBS': 'r', 'WRB': 'r',
}

# Corpus-specific words dropped in addition to the NLTK English stop-words.
# Upper-case variants are listed because all-caps tokens are not lower-cased.
_DOMAIN_STOPWORDS = (
    'covid_19', 'COVID_19', 'covid', 'COVID', 'covid_', 'COVID_',
    'coronavirus', 'CORONAVIRUS', 'coronaviruses', 'CORONAVIRUSES',
    'SARS_COV_2', 'sars_cov_2', 'conclusion', 'CONCLUSION',
    'objective', 'OBJECTIVE', 'abstract', 'ABSTRACT',
    'background', 'BACKGROUND', 'author', 'AUTHOR',
    'disclosure', 'DISCLOSURE', 'title', 'TITLE',
)

# Runs of word characters that contain no digit (letters and underscores).
_WORD_RE = re.compile(r"\b[^\d\W]+\b")

# Built on first use so the stop-word corpus is read once, not once per row.
_STOPWORD_SET = None


def _get_stopword_set():
    """Return the combined NLTK + domain stop-word set, building it once."""
    global _STOPWORD_SET
    if _STOPWORD_SET is None:
        english = nltk.corpus.stopwords.words('english')
        _STOPWORD_SET = frozenset(english).union(_DOMAIN_STOPWORDS)
    return _STOPWORD_SET


def cleaning_text(text):
    """
    Turn raw article text into a space-separated string of lemmatized tokens.

    Processing steps:
      * strip markup and entity residue with ``clear_tags``;
      * replace '-' with '_' so hyphenated terms stay one token;
      * split into sentences, keep digit-free word tokens, POS-tag them;
      * lower-case every token that is not entirely upper-case;
      * lemmatize nouns, pronouns, verbs, adjectives and adverbs;
      * drop stop-words, digit-only tokens and tokens of 3 characters or fewer.

    Args:
        text: Raw text (str), possibly containing markup.

    Returns:
        The kept tokens joined by single spaces; '' when nothing is kept.
    """
    cleantext = clear_tags(text).replace('-', '_')
    lemmatizer = WordNetLemmatizer()
    stop_set = _get_stopword_set()

    tokens = []
    for sentence in nltk.sent_tokenize(cleantext):
        words = _WORD_RE.findall(sentence)
        for word, tag in nltk.pos_tag(words):
            # All-caps tokens keep their case; everything else is lower-cased.
            if not word.isupper():
                word = word.lower()

            wordnet_pos = _WORDNET_POS_BY_TAG.get(tag)
            if wordnet_pos is not None:
                word = lemmatizer.lemmatize(word, wordnet_pos)

            # Filters run after lemmatization, so they see the final token.
            if word in stop_set or word.isdigit() or len(word) <= 3:
                continue

            tokens.append(word)

    return ' '.join(tokens)

In [16]:
# Build the cleaned 'text' column from each article title.
# NOTE(review): assumes every 'title' value is a string - confirm against the data.
df['text'] = df['title'].apply(cleaning_text)

# drop zero-word articles
# cleaning_text only keeps tokens longer than 3 characters, so a cleaned text
# shorter than 3 characters contains no tokens at all.
is_empty = df['text'].str.len() < 3
drop_index = df.index[is_empty].tolist()
# A boolean mask selects rows by position, so only the empty rows are removed
# even if index labels repeat. copy() keeps cleaned_df independent of df.
cleaned_df = df.loc[~is_empty].copy()

In [10]:
# Write the cleaned articles as a tab-separated file, leaving out the
# dataframe index.
cleaned_df.to_csv(
    OUTPUT_PATH + OUTPUT_FILE,
    sep='\t',
    index=False,
)