## Install missing packages

In [None]:
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_sm-0.2.4.tar.gz

## Import packages

In [None]:
import pandas as pd
import unicodedata as ud
import fasttext
import re
import scispacy
import spacy
import requests

from typing import Dict
from itertools import chain
from tqdm.notebook import tqdm
from string import punctuation
from html import unescape
from ftfy import fix_text
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
from Levenshtein import distance

tqdm.pandas()

## Introduction
The purpose of this notebook is to clean the CORD-19 dataset so that it can be used by any English-based pre-trained model. The work here was done by the [Evidence Prime](https://evidenceprime.com/) team. We work on a project called [LASER](https://evidenceprime.com/laser/), which aims to track scientific literature and synthesize the knowledge about a given topic. We share our experience & knowledge in this notebook, hoping that it will inspire the community to fight COVID-19.

## Read data

In [None]:
# Load the CORD-19 metadata table (comma-separated, UTF-8 encoded).
df = pd.read_csv(
    '/kaggle/input/CORD-19-research-challenge/metadata.csv',
    sep=',',
    encoding='utf-8',
)

# Replace missing titles and abstracts with empty strings.
df = df.fillna({'title': '', 'abstract': ''})
print(f'Number of studies: {len(df)}')

## Clean data

### Remove empty records
Some records are missing both a title and an abstract, even though they have full texts. Such records would be discarded anyway in the first stage of screening, hence I remove them.

In [None]:
# A record counts as empty when both its title and its abstract are blank
# after trimming surrounding whitespace.
empty_mask = (df['title'].str.strip() == '') & (df['abstract'].str.strip() == '')
print(f'Number of empty studies: {empty_mask.sum()}')

# Take an explicit copy: the filtered frame is assigned to later on, and
# writing into a slice of the original frame triggers SettingWithCopyWarning.
df = df.loc[~empty_mask].copy()
print(f'Number of studies: {len(df)}')

### Unescape HTML chars
- I perform "double" unescaping to transform `&amp;lt;` into `<`
- I want to get rid of `&nbsp;` first to combine `text` column properly from `title` & `abstract`. Otherwise, I might not detect all empty strings. I need such detection in the `text` column concatenation algorithm.

In [None]:
# Unescape twice so that doubly-escaped entities such as `&amp;lt;` are fully
# decoded in both columns.
df['title'] = df['title'].progress_apply(lambda raw: unescape(unescape(raw)))

df['abstract'] = df['abstract'].progress_apply(lambda raw: unescape(unescape(raw)))

### Remove non-english studies
I decided to remove non-English studies because pre-trained models are mostly language-specific. Although the TF-IDF algorithm can be considered an exception, keep in mind that the tokenization algorithm still depends on the language. Empty text is mapped to the `en` language by the FastText model.

In [None]:
# Load the fastText language-identification model from the attached Kaggle input.
model = fasttext.load_model('/kaggle/input/language-identification/lid.176.bin')

In [None]:
# fastText's `predict` processes one line at a time and raises ValueError when
# the input contains a newline, so line breaks are replaced by spaces first.
# The top prediction looks like `__label__en`; only the language code is kept.
df['title_lang'] = df['title'].progress_apply(
    lambda title: model.predict(
        title.replace('\n', ' ').lower(), k=1
    )[0][0].split('__label__')[1]
)

df['title_lang'].value_counts().head(10)

In [None]:
# fastText's `predict` processes one line at a time and raises ValueError when
# the input contains a newline, so line breaks are replaced by spaces first.
# The top prediction looks like `__label__en`; only the language code is kept.
df['abstract_lang'] = df['abstract'].progress_apply(
    lambda abstract: model.predict(
        abstract.replace('\n', ' ').lower(), k=1
    )[0][0].split('__label__')[1]
)

df['abstract_lang'].value_counts().head(10)

Some titles and abstracts have different languages. I decided to keep a study if at least one of them contains English text.

In [None]:
# Blank out every title whose detected language is not English.
title_en_mask = df['title_lang'].eq('en')
df.loc[~title_en_mask, 'title'] = ''
print(f'Number of non-english titles: {(~title_en_mask).sum()}')

# Same treatment for the abstracts.
abstract_en_mask = df['abstract_lang'].eq('en')
df.loc[~abstract_en_mask, 'abstract'] = ''
print(f'Number of non-english abstracts: {(~abstract_en_mask).sum()}')

### Concat `title` and `abstract` columns into `text` column
Some papers (e.g. [SWIFT-Review: a text-mining workbench for systematic review](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4877757/)) treat the columns separately. I decided to further process a single `text` column for code clarity.

In [None]:
def append_dot(text: str) -> str:
    """Return *text* terminated by sentence punctuation.

    Empty strings and strings that already end with ``.``, ``?`` or ``!``
    are returned unchanged; any other string gets a ``.`` appended.
    """
    if text == '':
        return text

    if re.search(r'[.?!]$', text):
        return text

    return text + '.'

# Trim surrounding whitespace, then make sure each non-empty field ends with
# sentence punctuation.
df['title'] = df['title'].progress_apply(lambda value: append_dot(value.strip()))
df['abstract'] = df['abstract'].progress_apply(lambda value: append_dot(value.strip()))

Some abstracts start with `Abstract`. I decided to remove such a prefix.

In [None]:
# Match the heading `Abstract` only when it is not the beginning of a longer
# lowercase word: texts starting with e.g. `Abstraction` or `Abstracts` are
# left alone, while `Abstract ...`, `Abstract: ...` and `AbstractBackground`
# are still recognised. The separator after the heading (whitespace, colon,
# full stop) is part of the prefix so it does not survive as stray punctuation.
abstract_prefix = r'^Abstract(?![a-z])[\s:.]*'

abstract_mask = df['abstract'].str.contains(abstract_prefix)
print(f"There are {sum(abstract_mask)} abstracts that start with word `abstract`.")

# An abstract that consists of the heading only becomes an empty string.
df.loc[abstract_mask, 'abstract'] = (df
    .loc[abstract_mask, 'abstract']
    .str.replace(abstract_prefix, '', regex=True)
    .str.strip()
)

Finally, let's create the `text` column and drop non-English texts.

In [None]:
# Join title and abstract with a single space; the final strip removes the
# separator when one of the two sides is empty.
df['text'] = df['title'].str.cat(df['abstract'], sep=' ').str.strip()

In [None]:
# Studies whose combined text is empty have neither an English title nor an
# English abstract left.
empty_mask = df['text'] == ''
print(f'Number of empty studies: {empty_mask.sum()}')

# Take an explicit copy: the `text` column of the filtered frame is reassigned
# in later cells, and writing into a slice triggers SettingWithCopyWarning.
df = df.loc[~empty_mask].copy()
print(f'Number of studies: {len(df)}')

### Find HTML tags
The text contains HTML tags that are useful for people because they support text with formatting information. As far as I know, current language models don't utilize such knowledge, thus I just extract text from tags.

In [None]:
# A text is flagged when the HTML parser finds at least one tag in it
# (`find()` yields None otherwise).
html_mask = df['text'].progress_apply(
    lambda markup: BeautifulSoup(markup, "html.parser").find() is not None
)

print(f'Found {html_mask.sum()} studies with HTML tags.')

### Remove HTML tags

In [None]:
# Keep only the textual content of each study; the markup itself is dropped.
df['text'] = df['text'].progress_apply(
    lambda markup: BeautifulSoup(markup, features="html.parser").get_text()
)

### Find hyperlinks
The text contains hyperlinks that bring no information to pre-trained language models. In a worst-case scenario, such hyperlinks might introduce out-of-distribution errors to a pre-trained model, thus I decided to remove them from the text.

In [None]:
# Matches an http:// or https:// URL up to the next whitespace character.
# A raw string is required: `\S` inside a plain string literal is an invalid
# escape sequence and produces a warning at compile time.
hyperlink_pattern = re.compile(r'https?://\S+')

In [None]:
# Flag every study whose text contains at least one hyperlink.
hyperlink_mask = df['text'].progress_apply(
    lambda content: hyperlink_pattern.search(content) is not None
)

print(f'Found {hyperlink_mask.sum()} studies with hyperlinks.')

### Remove hyperlinks

In [None]:
# Delete every hyperlink match from the text (nothing is put in its place).
df['text'] = df['text'].progress_apply(
    lambda content: hyperlink_pattern.sub('', content)
)

### Normalize encoding
Studies are crawled / downloaded from multiple databases and are likely processed by many tools like EndNote or Covidence. As a result, the text contains a lot of [mojibake](https://en.wikipedia.org/wiki/Mojibake). I used the `ftfy` package to fix some of it.


In [None]:
# Run ftfy over every text, asking it to apply NFKC Unicode normalization.
df['text'] = df['text'].progress_apply(
    lambda raw: fix_text(raw, normalization='NFKC')
)

In my experience it doesn't fix all the cases, therefore I decided to fix some of them manually. I found decoding tables online to help with this task. I crawl & use only the first one.
* https://www.i18nqa.com/debug/utf8-debug.html
* http://string-functions.com/encodingindex.aspx

In [None]:
# A timeout keeps the notebook from hanging forever if the server never answers.
response = requests.get('https://www.i18nqa.com/debug/utf8-debug.html', timeout=30)
response.encoding = 'UTF-8'  # By default is ISO-8859-1 (aka. Latin1) -> See RFC2854
if response.status_code != 200:
    # The page body is parsed right afterwards; stop here with a clear message
    # instead of failing later on an error page that lacks the expected table.
    raise RuntimeError(
        f"Couldn't download a webpage source (HTTP status {response.status_code})."
    )

In [None]:
soup = BeautifulSoup(
    markup=response.text,
    features='html.parser'
)

table = soup.find("table", attrs={"id": "dbg"})
if table is None:
    # Fail with a clear message rather than an AttributeError on None below.
    raise ValueError('Table with id "dbg" was not found in the downloaded page.')

# Build (key, value) pairs where the key is the garbled sequence to search for
# and the value is its replacement. Within a row the `ch` cells come in
# (value, key) pairs; only the first two pairs are read, and rows with fewer
# cells contribute only the complete pairs they have.
mapping = []
for row in table.find_all("tr"):
    cells = row.find_all("td", attrs={"class": "ch"})
    for value_cell, key_cell in zip(cells[0:4:2], cells[1:4:2]):
        if key_cell.text != '' and value_cell.text != '':
            mapping.append((key_cell.text, value_cell.text))

df_mapping = pd.DataFrame(mapping, columns=['key', 'value'])
df_mapping.head()

I also don't fully trust the table. Sometimes I found the [Angstrom unit](https://en.wikipedia.org/wiki/Angstrom) in the text, which according to the table should be mapped into `Š`. Therefore I drop the keys that consist of a single char after stripping the text.

In [None]:
# Keep only keys longer than one character once surrounding whitespace is
# stripped. The explicit copy makes the result an independent frame, so the
# column assignments performed on it later do not raise SettingWithCopyWarning.
df_mapping = df_mapping[df_mapping['key'].str.strip().str.len() > 1].copy()

Is it a valid mapping?

In [None]:
# Gather all replacement values per key; any key shown below maps to more
# than one value and would make the mapping ambiguous.
df_mapping_agg = df_mapping.groupby('key')['value'].apply(list)
df_mapping_agg[df_mapping_agg.map(len) > 1]

Yes! Let's make an encoding mapper then!

In [None]:
# Bring both columns into NFKC form before building the lookup dictionary.
df_mapping['key'] = df_mapping['key'].map(lambda garbled: ud.normalize('NFKC', garbled))
df_mapping['value'] = df_mapping['value'].map(lambda fixed: ud.normalize('NFKC', fixed))

encoding_mapping = {
    garbled: fixed
    for garbled, fixed in zip(df_mapping['key'], df_mapping['value'])
}

### Find encoding table issues

In [None]:
# The keys are literal character sequences, but `str.contains` treats its
# argument as a regular expression. NFKC normalization can introduce regex
# metacharacters into a key (e.g. the ellipsis character becomes `...`), so
# every key is escaped before being joined into one alternation.
encoding_pattern = '|'.join(re.escape(key) for key in encoding_mapping)

# An empty pattern would match every text; report no hits when there are no keys.
encoding_mask = (
    df['text'].str.contains(encoding_pattern)
    if encoding_pattern
    else pd.Series(False, index=df.index)
)

print(f'Found {sum(encoding_mask)} studies with encoding issues.')

### Fix encoding table issues

In [None]:
def replace_multiple(text: str, mapping: Dict[str, str]) -> str:
    """Apply every ``key -> value`` replacement from *mapping* to *text*.

    Replacements run one after another in the mapping's iteration order, so a
    later key can match text produced by an earlier replacement.

    Empty keys are skipped: ``str.replace('', value)`` would insert *value*
    between every character of the text.

    Args:
        text: The string to process.
        mapping: Substrings to search for, mapped to their replacements.

    Returns:
        The text with all replacements applied.
    """
    for key, value in mapping.items():
        if key:
            text = text.replace(key, value)
    return text

In [None]:
# Substitute every garbled sequence listed in the encoding table.
df['text'] = df['text'].progress_apply(
    lambda content: replace_multiple(content, encoding_mapping)
)

### Find non-unicode chars
I couldn't identify all broken chars. Let's investigate non-unicode chars and fix them.

In [None]:
def is_nonunicode_char(char):
    """Tell whether *char* has no name in the Unicode character database.

    ``unicodedata.name`` knows no name for such characters, e.g. control
    characters like a line feed or private-use code points.
    """
    return ud.name(char, None) is None

In [None]:
# Per study: the list of characters for which no Unicode name is known.
nonunicode_chars = df['text'].progress_apply(
    lambda content: list(filter(is_nonunicode_char, content))
)

# Flatten the per-study lists and count how often each character occurs.
nonunicode_chars_counts = pd.Series(
    chain.from_iterable(nonunicode_chars)
).value_counts()

In [None]:
# Display the per-character occurrence counts computed in the previous cell.
nonunicode_chars_counts

### Investigate non-unicode chars

In [None]:
with pd.option_context('display.max_colwidth', -1):    
    char_mask = df['text'].str.contains('')
    display(df.loc[char_mask, 'text'])

### Replace non-unicode chars

In [None]:
unicode_mapping = {
    '': '-',
    '': '',
    '': '=',
    '': '=',
    '': ''
}

In [None]:
# Apply the hand-written replacements for the characters investigated above.
df['text'] = df['text'].progress_apply(
    lambda content: replace_multiple(content, unicode_mapping)
)

### Find non-ascii chars
The Unicode standard is too rich for people to use consistently. As a result, people use different chars for the same thing (e.g. `α`, `ɑ`). I believe that deep learning models can learn such meaning, but char simplification makes it easier for sure. This way we could also identify some encoding issues that couldn't be fixed previously.

In [None]:
def is_nonascii_char(char: str) -> bool:
    """Tell whether *char* contains a code point outside the ASCII range.

    ASCII covers code points 0 through 127; an empty string is reported as
    ASCII (``False``).
    """
    return any(ord(symbol) > 0x7F for symbol in char)

In [None]:
def get_nonascii_df(df: pd.DataFrame) -> pd.Series:
    """Collect the distinct non-ASCII characters of ``df['text']`` per Unicode category.

    Args:
        df: Frame with a string ``text`` column.

    Returns:
        A Series indexed by Unicode general category (the value of
        ``unicodedata.category``) whose values are the sets of non-ASCII
        characters seen in that category. Despite the function's name the
        result is a Series, not a DataFrame, because a single column is
        selected before aggregating.
    """
    # One list of (char, category, code point) tuples per study.
    nonascii_chars = df['text'].progress_apply(
        lambda text: [
            (char, ud.category(char), ord(char))
            for char in text
            if is_nonascii_char(char)
        ]
    )

    df_nonascii_chars = pd.DataFrame(
        # from_iterable flattens the per-study lists without unpacking the
        # whole Series into one positional argument per study.
        data=chain.from_iterable(nonascii_chars),
        columns=['char', 'category', 'ord'],
    )

    return df_nonascii_chars.groupby('category')['char'].apply(set)

I decided to first fix some studies with Chinese chars from `Lo` Unicode category.

In [None]:
df.loc[10993, 'text'] = """Traditional usages, botany, phytochemistry, pharmacology and toxicology of Polygonum multiflorum Thunb.: A review. Ethnopharmacological relevance Polygonum multiflorum Thunb., which is known as Heshouwu in China. It is traditionally valued and reported for hair-blacking, liver and kidney-tonifying and anti-aging effects as well as low toxicity. The aim of this review is to provide comprehensive information on the botany, traditional uses, phytochemistry, pharmacological research and toxicology of Polygonum multiflorum, based on the scientific literature. Moreover, trends and perspectives for future investigation of this plant are discussed. It will build up a new foundation for further study on Polygonum multiflorum. Materials and methods A systematic review of the literature on Polygonum multiflorum was performed using several resources, including classic books on Chinese herbal medicine and various scientific databases, such as PubMed, SciFinder, the Web of Science, Science Direct, China Knowledge Resource Integrated (CNKI). Results Polygonum multiflorum is widely distributed throughout the world and has been used as a traditional medicine for centuries in China. The ethnomedical uses of Polygonum multiflorum have been recorded in many provinces of China and Japan for nine species of adulterants in six families. More than 100 chemical compounds have been isolated from this plant, and the major components have been determined to be stilbenes, quinones, flavonoids and others. Crude extracts and pure compounds of this plant are used as effective agents in pre-clinical and clinical practice due to their anti-aging, anti-hyperlipidaemia, anti-cancer and anti-inflammatory effects and to promote immunomodulation, neuroprotection, and the curing of other diseases. However, these extracts can also lead to hepatotoxicity, nephrotoxicity and embryonic toxicity. 
Pharmacokinetic studies have demonstrated that the main components of Polygonum multiflorum, such as 2,3,5,4′-tetrahydroxystilbene-2-O-β-d-glucopyranoside and emodin are distributed among many organs and tissues. Conclusion Therapeutic potential of Polygonum multiflorum has been demonstrated in the conditions like Alzheimer׳s disease, Parkinson׳s disease, hyperlipidaemia, inflammation and cancer, which is attributed to the presence of various stilbenes, quinones, flavonoids, phospholipids and other compounds in the drug. On the other hand, the adverse effects (hepatotoxicity, nephrotoxicity, and embryonic toxicity) of this plant were caused by the quinones, such as emodin and rhein. Thus more pharmacological and toxicological mechanisms on main active compounds are necessary to be explored, especially the combined anthraquinones (Emodin-8-O-β-d-glucopyranoside, Physcion-8-O-β-d-glucopyranoside, etc.) and the variety of stilbenes."""
df.loc[14871, 'text'] = """YouTube as source of information on 2019 novel coronavirus outbreak: A cross sectional study of English and Mandarin content. Background The current 2019 novel coronavirus outbreak is rapidly evolving. YouTube has been recognized as a popular source of information in previous disease outbreaks. We analyzed the content on YouTube about n-CoV in English and Mandarin languages. Methods YouTube was searched using the terms '2019 novel coronavirus', 'Wuhan virus' on 1st and 2nd February 2020. First 50 videos in each group were analyzed. Videos in other languages, duplicate videos, those without an audio and duration >15 min were excluded .72 videos in English and 42 in Mandarin were reviewed. 2 reviewers classified the videos as useful, misleading or news based on pre specified criterion. Inter-observer agreement was evaluated with kappa coefficient. Modified DISCERN index for reliability and medical information and content index (MICI) score were used for content analysis. Results These videos attracted cumulative 21,288,856 views. 67% of English and 50% Mandarin videos had useful information. The viewership of misleading Mandarin videos was higher than the useful ones. WHO accounted for only 4% of useful videos. Mean DISCERN score for reliability was 3.12/5 and 3.25/5 for English and Mandarin videos respectively. Mean cumulative MICI score of useful videos was low (6.71/25 for English and 6.28/25 for Mandarin). Conclusions YouTube viewership during 2019 n-CoV outbreak is higher than previous outbreaks. The medical content of videos is suboptimal International health agencies are underrepresented. Given its popularity, YouTube should be considered as important platform for information dissemination."""
df.loc[15931, 'text'] = """Traditional Chinese medicine herbal extracts of Cibotium barometz, Gentiana scabra, Dioscorea batatas, Cassia tora, and Taxillus chinensis inhibit SARS-CoV replication. Development of anti-severe acute respiratory syndrome associated coronavirus (SARS-CoV) agents is pivotal to prevent the reemergence of the life-threatening disease, SARS. In this study, more than 200 extracts from Chinese medicinal herbs were evaluated for anti-SARS-CoV activities using a cell-based assay that measured SARS-CoV-induced cytopathogenic effect (CPE) in vitro on Vero E6 cells. Six herbal extracts, one each from Gentianae Radix (lóng dǎn; the dried rhizome of Gentiana scabra), Dioscoreae Rhizoma (shān yào; the tuber of Dioscorea batatas), Cassiae Semen (jué míng zǐ; the dried seed of Cassia tora) and Loranthi Ramus (sāng jì shēng; the dried stem, with leaf of Taxillus chinensis) (designated as GSH, DBM, CTH and TCH, respectively), and two from Rhizoma Cibotii (gǒu jǐ; the dried rhizome of Cibotium barometz) (designated as CBE and CBM), were found to be potent inhibitors of SARS-CoV at concentrations between 25 and 200μg/ml. The concentrations of the six extracts needed to inhibit 50% of Vero E6 cell proliferation (CC50) and 50% of viral replication (EC50) were determined. The resulting selective index values (SI=CC50/EC50) of the most effective extracts CBE, GSH, DBM, CTH and TCH were>59.4,> 57.5,> 62.1,> 59.4, and>92.9, respectively. Among these extracts, CBM and DBM also showed significant inhibition of SARS-CoV 3CL protease activity with IC50 values of 39μg/ml and 44μg/ml, respectively. Our findings suggest that these six herbal extracts may have potential as candidates for future development of anti-SARS therapeutics."""
df.loc[15932, 'text'] = """Antiviral Decoction of Isatidis Radix (bǎn lán gēn) Inhibited Influenza Virus Adsorption on MDCK Cells by Cytoprotective Activity. The aim of this study is to elucidate how the Isatidis Radix (bǎn lán gēn) tonic, as an aqueous mixture of hundreds of compositions, interrupts the infection of influenza viruses to their host cells. The efficacy of the tonic was evaluated and expressed as cell proliferation rate and plaque reduction rate in Madin-Darby Canine Kidney (MDCK) cells, against 3 strains of influenza A and B viruses. This boiling water (at 100°C) extract of Isatidis Radix (RIE) showed antiviral activity against influenza virus A and B. The concentration for 50% inhibition of influenza virus A replication (IC50) in MDCK cell was 12.6mg/mL with a therapeutic index >8. When cells were incubated with RIE prior to virus adsorption, the numbers of viable cell were at least doubled compared to the numbers of virus control, RIE incubation after virus adsorption and RIE incubation with virus prior to adsorption, in both influenza virus A and B. Moreover, much less virus particles were spotted by scanning electron microscope (SEM) in the RIE pre-treated cells than the cells without RIE treatment. These results indicate the antiviral activity of RIE is mainly attributed to its host cell protection effect but not actions on virus or post-virus-adsorption interruption. Cell, but not virus, is more likely to be the action target of RIE."""
df.loc[20310, 'text'] = """Antiviral Decoction of Isatidis Radix (bǎn lán gēn) Inhibited Influenza Virus Adsorption on MDCK Cells by Cytoprotective Activity. The aim of this study is to elucidate how the Isatidis Radix (bǎn lán gēn) tonic, as an aqueous mixture of hundreds of compositions, interrupts the infection of influenza viruses to their host cells. The efficacy of the tonic was evaluated and expressed as cell proliferation rate and plaque reduction rate in Madin-Darby Canine Kidney (MDCK) cells, against 3 strains of influenza A and B viruses. This boiling water (at 100°C) extract of Isatidis Radix (RIE) showed antiviral activity against influenza virus A and B. The concentration for 50% inhibition of influenza virus A replication (IC(50)) in MDCK cell was 12.6 mg/mL with a therapeutic index >8. When cells were incubated with RIE prior to virus adsorption, the numbers of viable cell were at least doubled compared to the numbers of virus control, RIE incubation after virus adsorption and RIE incubation with virus prior to adsorption, in both influenza virus A and B. Moreover, much less virus particles were spotted by scanning electron microscope (SEM) in the RIE pre-treated cells than the cells without RIE treatment. These results indicate the antiviral activity of RIE is mainly attributed to its host cell protection effect but not actions on virus or post-virus-adsorption interruption. Cell, but not virus, is more likely to be the action target of RIE."""
df.loc[20309, 'text'] = """Traditional Chinese medicine herbal extracts of Cibotium barometz, Gentiana scabra, Dioscorea batatas, Cassia tora, and Taxillus chinensis inhibit SARS-CoV replication. Development of anti-severe acute respiratory syndrome associated coronavirus (SARS-CoV) agents is pivotal to prevent the reemergence of the life-threatening disease, SARS. In this study, more than 200 extracts from Chinese medicinal herbs were evaluated for anti-SARS-CoV activities using a cell-based assay that measured SARS-CoV-induced cytopathogenic effect (CPE) in vitro on Vero E6 cells. Six herbal extracts, one each from Gentianae Radix (lóng dǎn; the dried rhizome of Gentiana scabra), Dioscoreae Rhizoma (shān yào; the tuber of Dioscorea batatas), Cassiae Semen (jué míng zǐ; the dried seed of Cassia tora) and Loranthi Ramus (sāng jì shēng; the dried stem, with leaf of Taxillus chinensis) (designated as GSH, DBM, CTH and TCH, respectively), and two from Rhizoma Cibotii (gǒu jǐ; the dried rhizome of Cibotium barometz) (designated as CBE and CBM), were found to be potent inhibitors of SARS-CoV at concentrations between 25 and 200 μg/ml. The concentrations of the six extracts needed to inhibit 50% of Vero E6 cell proliferation (CC(50)) and 50% of viral replication (EC(50)) were determined. The resulting selective index values (SI = CC(50)/EC(50)) of the most effective extracts CBE, GSH, DBM, CTH and TCH were > 59.4, > 57.5, > 62.1, > 59.4, and > 92.9, respectively. Among these extracts, CBM and DBM also showed significant inhibition of SARS-CoV 3CL protease activity with IC(50) values of 39 μg/ml and 44 μg/ml, respectively. Our findings suggest that these six herbal extracts may have potential as candidates for future development of anti-SARS therapeutics. Abbreviations SARS, severe acute respiratory syndrome CoV, coronavirus CPE, cytopathogenic effect TCM, traditional Chinese medicine."""
df.loc[25814, 'text'] = """Suffering a Loss Is Good Fortune: Myth or Reality? We sometimes decide to take an offered option that results in apparent loss (e.g., unpaid overtime). Mainstream decision theory does not predict or explain this as a choice we want to make, whereas such a choice has long been described and highly regarded by the traditional Chinese dogma - suffering a loss is good fortune. To explore what makes the dogma work, we developed a celebrity anecdote‐based scale to measure "Chikui" (suffering a loss) likelihood and found that:(i) people with higher scores on the Chikui Likelihood Scale (CLS) were more likely to report higher scores on subjective well‐being and the Socioeconomic Index for the present and (ii) the current Socioeconomic Index could be positively predicted not only by current CLS scores but also by retrospective CLS scores recalled for the past, and the predictive effect was enhanced with increasing time intervals. Our findings suggest that "suffering a loss is good fortune" is not a myth but a certain reality. © 2017 The Authors Journal of Behavioral Decision Making Published by John Wiley & Sons Ltd."""
df.loc[43373, 'text'] = """Performance of radiologists in differentiating COVID-19 from viral pneumonia on chest CT. Background Despite its high sensitivity in diagnosing COVID-19 in a screening population, chest CT appearances of COVID 19 pneumonia are thought to be non-specific. Purpose To assess the performance of United States (U.S.) and Chinese radiologists in differentiating COVID-19 from viral pneumonia on chest CT. Methods A total of 219 patients with both positive COVID-19 by RT-PCR and abnormal chest CT findings were retrospectively identified from 7 Chinese hospitals in Hunan Providence, China from January 6 to February 20, 2020. A total of 205 patients with positive Respiratory Pathogen Panel for viral pneumonia and CT findings consistent with or highly suspicious for pneumonia by original radiology interpretation within 7 days of each other were identified from Rhode Island Hospital in Providence, RI. Three Chinese radiologists blindly reviewed all chest CTs (n=424) to differentiate COVID-19 from viral pneumonia. A sample of 58 age-matched cases was randomly selected and evaluated by 4 U.S. radiologists in a similar fashion. Different CT features were recorded and compared between the two groups. Results For all chest CTs, three Chinese radiologists correctly differentiated COVID-19 from non-COVID-19 pneumonia 83% (350/424), 80% (338/424), and 60% (255/424) of the time, respectively. The seven radiologists had sensitivities of 80%, 67%, 97%, 93%, 83%, 73% and 70% and specificities of 100%, 93%, 7%, 100%, 93%, 93%, 100%. Compared to non-COVID-19 pneumonia, COVID-19 pneumonia was more likely to have a peripheral distribution (80% vs. 57%, p<0.001), ground-glass opacity (91% vs. 68%, p<0.001), fine reticular opacity (56% vs. 22%, p<0.001), and vascular thickening (59% vs. 22%, p<0.001), but less likely to have a central+peripheral distribution (14.% vs. 35%, p<0.001), pleural effusion (4.1 vs. 39%, p<0.001) and lymphadenopathy (2.7% vs. 10.2%, p<0.001). 
Conclusion Radiologists in China and the United States distinguished COVID-19 from viral pneumonia on chest CT with high specificity but moderate sensitivity. A translation of this abstract in Farsi is available in the supplement."""
df.loc[43519, 'text'] = """Expert consensus on Pulmonary Function Testing during the epidemic of Corona Virus Disease 2019. Corona virus disease 2019 (COVID-19) is mainly transmitted by respiratory droplets and close contact. Pulmonary function testing procedures have been associated with an increasing risk of COVID-19 transmission among patients/subjects and medical staffs. Effective prevention and control strategies must be compulsorily implemented to prevent nosocomial infection. This recommendation is intended to be followed by healthcare workers (HCWs) of pulmonary function testing laboratory when COVID-19 is in epidemic. Based on the features of pulmonary function testing, precaution principles and strategies are developed in three aspects of management for HCWs, operating procedure, environment and equipment. Indications of pulmonary function testing should be followed strictly. It is strongly recommended to suspend the test for the confirmed or suspected cases of COVID-19 during the contagious stage, and to postpone the test for other patients if it is not imperative. Medical personnel should mandatorily adhere to the standard stratification of precaution measures. Patients/Subjects should be isolated in a separate area for testing. Disposable in-line filters must be used during pulmonary function testing. Cleaning and disinfection procedures for environment and equipment in pulmonary function testing laboratory should be paid more attention."""
df.loc[43523, 'text'] = """Cause analysis and treatment strategies of "recurrence" with novel coronavirus pneumonia (covid-19) patients after discharge from hospital. With a large number of COVID-19 patients discharging from hospital, some had showed re-fever and positive nucleic acid test after discharge from hospital. This might be due to the biological characteristics of 2019-nCoV, and might also be related to the basic disease, clinical status, glucocorticoid using, sample sampling, processing and detecting of patients, and some even related to the re-infection or secondary bacterial virus infection. Therefore, we suggest that in view of this phenomenon, further stratified management of discharge from hospital should be carried out on the basis of guidelines, especially for patients with advanced age, underlying diseases or severe or critical pneumonia. For those patients who can\'t completely deoxygenate for a long time after hospitalization, individualized treatment methods and different discharge evaluation criteria should be adopted to ensure the complete cure of patients and prevent recurrencing after discharge from hospital."""
df.loc[43798, 'text'] = """Analysis of clinical features of 153 patients with novel coronavirus pneumonia in Chongqing. Objective To analyze the clinical data of 153 patients with novel coronavirus pneumonia (COVID-19) in chongqing ,and provide reference and thinking for the diagnosis and treatment. Methods Analyze the clinical data, laboratory examination and chest imaging characteristics of 153 COVID-19 patients in Chongqing Public Health Medical Center from January 26 to February 5, 2020. According to the relevant diagnostic criteria ,patients were divided into non-severe group(n=132) and severe group(n=21),and analyze the correlation between serum index changes and disease severity. Results Combined with diabetes and chronic respiratory diseases, the severity of the disease was statistically significant ( χ 2 =11.04 and 6.94, P <0.05). No symptoms were found in patients with mild illness ( χ 2 =4.09, P <0.05) .The proportion of fever and muscle soreness in the severe group was higher than that in the non-severe group ( χ 2 =4.40 and 22.67, P <0.05).Among the concomitant symptoms, the proportion of cough and shortness of breath in the severe group was higher than that in the non-severe group ( χ 2 =8.46 and 4.80, P <0.05).C-reactive protein and d-dimer were higher in the severe group than in the non-severe group ( t =43.44 and 37.13, P <0.05), and the number of CD 3 + T lymphocyte cells, CD 4 + T lymphocyte cells and CD 8 + T lymphocyte cells in the severe group was lower than that in the non-severe group (Z=27.25, 20.60 and 17.36, P <0.05).Compared with the non-severe group, both lungs and the right lung lower lobe were more susceptible to involved ( χ 2 =6.95 and 20.39, P <0.05) . Conclusion Severity of COVID-19 was associated with underlying disease, symptoms, site of involvement, C-reactive protein, d-dimer, CD 3 + T lymphocyte count, CD 4 + T lymphocyte count, and CD 8 + T lymphocyte count."""
df.loc[43896, 'text'] = """First case of severe childhood novel coronavirus pneumonia in China. Summary One patient with a complaint of "intermittent diarrhea, vomiting for 6 days, fever with shortness of breath for half a day" was referred to the Department of Critical Medicine, Wuhan Children\'s Hospital, and was diagnosed with neonatal severe coronavirus pneumonia (NCP). Relevant databases such as China Knowledge Net, Weipu, Wanfang and other related databases were searched with the keywords of "new coronavirus pneumonia", "children", and "critical severity" as of February 8, 2020. This case is the first child with severe NCP in China. It started with gastrointestinal symptoms, early respiratory symptoms were not obvious, and rapidly progressed to acute respiratory distress syndrome, septic shock, and acute renal failure. The patient was negative for nucleic acid test of 2019 new-type coronavirus (2019-nCoV) for 2 consecutive consecutive throat swabs. For severe suspected cases, it is recommended to take samples of the lower respiratory tract or repeat samples of the upper respiratory tract for testing. Continuous blood purification technology can be applied to the treatment of children with severe NCP as early as possible."""
df.loc[46814, 'text'] = """Some Chinese folk prescriptions for wind-cold type common cold. Although self-limiting, the common cold (gǎn mào) is highly prevalent. There are no effective antivirals to cure the common cold and few effective measures to prevent it, However, for thousands years, Chinese people have treated the common cold with natural herbs, According to the traditional Chinese medicine (TCM) theory (zhōng yī lǐ lùn), the common cold is considered as an exterior syndrome, which can be further divided into the wind-cold type (fēng hán xíng), the wind-heat type (fēng rè xíng), and the summer heat dampness type (shǔ rè xíng). Since the most common type of common cold caught in winter and spring is the wind-cold type, the article introduced some Chinese folk prescriptions for the wind-cold type common cold with normal and weak physique, respectively. For thousands of years, Chinese folk prescriptions for the common cold, as complementary and alternative medicine (CAM; bǔ chōng yǔ tì dài yī xué), have been proven to be effective, convenient, cheap, and most importantly, safe. The Chinese folk prescriptions (zhōng guó mín jiān chǔ fāng) for the wind-cold type common cold are quite suitable for general practitioners or patients with the wind-cold type common cold, to treat the disease. Of course, their pharmacological features and mechanisms of action need to be further studied."""
df.loc[45438, 'text'] = """Anti-inflammatory and Antimicrobial Effects of Heat-clearing Chinese Herbs: A Current Review. ABSTRACT Inflammation is a normal immune response; but if the body's regulation of inflammation is dysfunctional, then it will have an adverse effect on the body. Although use of modern drugs for inflammation has a relieving effect, it is still unsatisfactory. Moreover, the emergence of drug-resistant strains and even new kinds of microorganisms is causing significant morbidity and mortality. Recently, more attention has been focused on herbal medicine to treat various diseases because of the ability of the herbs to affect multiple target signaling pathways and their multiple mechanisms of action. Thus, a large number of studies have reported on the anti-inflammatory and antimicrobial effects of the traditional Chinese herbs. Literature survey was performed by conducting systematic electronic search in PubMed, Science Direct, Google Scholar, and in books. This review has listed 11 heat-clearing Chinese herbs (HCCHs) including Scutellaria baicalensis (Huáng Qín), Coptis chinensis (Huáng Lián), Flos Lonicerae (Jīn Yín Hūa), Forsythia suspensa (Lián Qiào), Isatidis Folium (Dà Qīn Yè), Radix Isatidis (Bǎn Lán Gēn), Viola yedoensis (Zǐ Huā Dì Dīn), Pulsatilla Radix (Bái Tóu Wēn), Andrographis paniculata (Chuān Xīn Lián), Houttuynia cordata (Yú Xīng Cǎo), and Patrinia Herba (Bài Jiàn Cǎo), which have anti-inflammatory and antimicrobial effects, and has described their effects through different mechanisms of action and multiple targets. Their ability to affect multiple target signaling pathways and their potential mechanisms of action contributing to their anti-inflammatory and antimicrobial activity may be related to their action of removing heat and counteracting toxicity. Further studies are needed on the collection of HCCHs to know the detailed mechanism of action of herbs in this group for the assessment of effective drug."""

In [None]:
# Table of the distinct non-ASCII characters per Unicode category.
df_nonascii_chars = get_nonascii_df(df)

# `None` lifts the column-width limit so every character set is shown in
# full; the `-1` sentinel used before is deprecated since pandas 1.0.
with pd.option_context('display.max_colwidth', None):
    display(df_nonascii_chars)

### Investigate non-ascii chars

In [None]:
# Show the full text of every study that contains the character under
# investigation. `None` disables column truncation; the `-1` sentinel used
# before is deprecated since pandas 1.0.
with pd.option_context('display.max_colwidth', None):
    char_mask = df['text'].str.contains('≓')
    display(df.loc[char_mask, 'text'])

### Replace non-ascii chars

In [None]:
ascii_mapping = {
    # Cf
    '\xad': '',
    '\u200b': '',   
        
    # Ll    
    'ĺ': 'l', 
    'ç': 'c',
    'χ': 'x', 
    'ǧ': 'g', 
    'е': 'e', 
    'ń': 'n', 
    'è': 'e',
    'ē': 'e', 
    'ӧ': 'o', 
    'ò': 'o', 
    'ü': 'u', 
    'ș': 's', 
    'ț': 't', 
    'ñ': 'n', 
    'ū': 'u', 
    'ú': 'u', 
    'ã': 'a', 
    'â': 'a', 
    'ê': 'e', 
    'í': 'i', 
    'ć': 'c', 
    'ο': 'o', 
    'ő': 'o', 
    'ō': 'o', 
    'ς': 'c', 
    'ī': 'i', 
    'ë': 'e', 
    'ǐ': 'i', 
    'ϊ': 'i', 
    'ā': 'a', 
    'ù': 'u', 
    'ǔ': 'u', 
    'é': 'e', 
    'ý': 'y', 
    'ï': 'i', 
    'ǒ': 'o', 
    'ó': 'o', 
    'ǎ': 'a', 
    'î': 'i', 
    'œ': 'oe', 
    'ł': 'l',
    'ă': 'a', 
    'ţ': 't', 
    'ı': 'i', 
    'ś': 's', 
    'ö': 'o', 
    'à': 'a', 
    'á': 'a', 
    'ì': 'i', 
    'ş': 's', 
    'ä': 'a', 
    'æ': 'ae', 
    'ô': 'o',
    # Exceptions
    'ĸ': 'κ',
    'к': 'κ',
    'ɛ': 'ε',
    'ɑ': 'α',
    
    # Lm
    'ʹ': "'", 
    'ˈ': "'",
    
    # Lu
    'С': 'C', 
    'Á': 'A', 
    'Η': 'H', 
    'Ó': 'O', 
    'Æ': 'AE', 
    'Χ': 'X', 
    'Ö': 'O', 
    'Ş': 'S', 
    'Ț': 'T', 
    'Ι': 'I', 
    'Α': 'A', 
    'À': 'A', 
    'Ç': 'C', 
    'Ä': 'A', 
    'Μ': 'M', 
    'Ü': 'U', 
    'Λ': '^', 
    'Ý': 'Y', 
    'Î': 'I', 
    'É': 'E', 
    'Ï': 'I', 
    'Œ': 'OE', 
    'È': 'E', 
    'Ν': 'N', 
    'İ': 'I', 
    'Ș': 'S',
    # Exceptions
    'Σ': '∑',
    'Ф': 'Φ',
    
    # Mn
    '́': '"', 
    '̄': '', 
    '̇': '', 
    '̈': '', 
    '̊': '', 
    '̱': '', 
    '̶': '-',
    
    # Pd
    '―': '-',
    '–': '-', 
    '‒': '-', 
    '‐': '-', 
    '—': '-', 
    
    # Pf
    '»': '',
    
    # Pf
    '«': '',
     
    # Po
    '′': "'",
    '׳': "'",
    '·': '.',
    '§': '',
    '†': '',
    '⁎': '*',
    '‡': '',
    '¶': '',
    '•': '-',
    
    # Sk    
    '˂': '<',
                                                                                                                              
    # Sm
    '⪢': '>=',
    '∶': ':',
    '⧹': '\\', 
    '⋍': '≈', 
    '±': '+/-', 
    '≥': '>=', 
    '⋯': '-', 
    '∕': '/', 
    '≦': '<=', 
    '∝': '∞', 
    '∣': '|', 
    '⩾': '>=', 
    '≠': '!=', 
    '∙': '*', 
    '≤': '<=', 
    '−': '-', 
    '×': '*', 
    '≪': '<<', 
    '⁄': '/', 
    '⩽': '<=', 
    '≫': '>>', 
    '∗': '*', 
    '⋅': '', 
    '∖': '\\', 
    '≧': '>=',
    # Exceptions
    '∆': 'Δ',

    # So
    '▾': ' ',
    '□': '',
    '●': '-',
    '␣': ' ',
    '▀': '-',
    '☐': 'r',
    '▸': '-',
    '☆': '',
    '®': '',   
    '©': '',
    '≓': '"',
    # Exceptions:
    '⇌': '↔',
    '△': 'Δ',
    '⍺': 'α',
}

In [None]:
# Run every ascii_mapping substitution over each study's text (with a progress bar).
df['text'] = df['text'].progress_apply(lambda text: replace_multiple(text, ascii_mapping))

Let's take a look again for non-ascii characters.

In [None]:
# Recompute the per-category table of non-ASCII characters on the current text.
df_nonascii_chars = get_nonascii_df(df)

# `None` lifts the column-width limit so every character set is shown in
# full; the `-1` sentinel used before is deprecated since pandas 1.0.
with pd.option_context('display.max_colwidth', None):
    display(df_nonascii_chars)

## Simplify text
Simple models like TF-IDF work best if they work on lemmatized, stemmed text with no stop words (see [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law)). Such a simplified text form will also be useful for duplicate detection.

In [None]:
# spaCy pipeline loaded from the en_core_sci_sm model.
nlp = spacy.load('en_core_sci_sm')
# Porter stemmer instance.
stemmer = PorterStemmer()

# Every character of string.punctuation mapped to the empty string.
punctation_mapping = dict.fromkeys(punctuation, '')

In [None]:
def simplify_text(text: str) -> str:
    """Reduce ``text`` to space-separated, lower-cased, stemmed lemmas.

    Stop words and punctuation tokens are dropped; each remaining token is
    lemmatized, lower-cased and stemmed; punctuation characters still left
    inside the joined text are deleted.

    Args:
        text: Raw study text.

    Returns:
        The simplified text with runs of spaces collapsed and surrounding
        whitespace stripped.
    """
    stems = []
    for token in nlp(text):
        # Skip stop words and punctuation tokens.
        if token.is_stop or token.is_punct:
            continue
        stems.append(stemmer.stem(token.lemma_.lower()))

    # Delete the punctuation characters that survived inside tokens.
    simplified = replace_multiple(' '.join(stems), punctation_mapping)

    # Collapse runs of spaces, then trim both ends.
    return re.sub(' +', ' ', simplified).strip()

In [None]:
# Store the simplified form of each study's text in a new column (with a progress bar).
df['text_simplified'] = df['text'].progress_apply(simplify_text)

## Remove duplicates
Studies are downloaded from multiple databases. As a result, a great deal of "duplicates" exist. Duplicates are not easy to detect, because often there is no 1-1 relation between them. The text might contain typos or additional information. We would need fuzzy matching to detect duplicates then!

In the literature, there is a concept of studification and duplicate detection. What is the difference between those? Every study has its own life i.e. it evolves over time. Some journals have length requirements, thus they have to be truncated (e.g. some endpoints need to be removed). In addition, population size might get bigger and bigger or intermediate results are published. No one would wait 50 years to publish just one study. Furthermore, some authors might be appended or dropped. This is what we call studification. Taking all this into consideration we can define that two studies are duplicates if two studies are the same and were published the same year. What does the word 'same' mean? Well, we don't know until we go deeper into the article. We would need to extract information from the publication. What should we do then?

In the metadata we don't have correct publication years, thus we just need to rely on plain text. I decided on "safe" fuzzy matching with Levenshtein distance. I sorted the list to avoid $n^2$ complexity by looking only at adjacent texts. I need to be careful not to drop a very important study, thus I aim for a low threshold.

In [None]:
# Order the studies by simplified text and replace the index with a fresh 0..n-1 range.
df_sorted = df.sort_values(by='text_simplified').reset_index(drop=True)

### Calculate Levenshtein distances

In [None]:
# Edit distance of each study's simplified text to the one right before it
# in the sorted order.
distances = []
prev_text = ''

# Series.items() replaces iteritems(), which was deprecated in pandas 1.5
# and removed in pandas 2.0.
for _, text in tqdm(df_sorted['text_simplified'].items(), total=len(df_sorted)):
    if prev_text == '':
        # No usable predecessor (first row, or the previous text is empty):
        # store the fixed placeholder 999 instead of a real distance.
        dist = 999
    else:
        dist = distance(prev_text, text)

    distances.append(dist)
    prev_text = text

# Assign the whole column at once instead of growing it cell by cell;
# float64 keeps the dtype that the former cell-wise assignment produced.
df_sorted['distance'] = pd.Series(distances, index=df_sorted.index, dtype='float64')

### Find distance threshold

In [None]:
# Histogram (100 bins) of the distances that are at most 2000.
df_sorted.loc[df_sorted['distance'] <= 2000, 'distance'].hist(bins=100)

In [None]:
# Histogram (100 bins) of the distances that are at most 100.
df_sorted.loc[df_sorted['distance'] <= 100, 'distance'].hist(bins=100)

In [None]:
# Histogram (20 bins) of the distances that are at most 20.
df_sorted.loc[df_sorted['distance'] <= 20, 'distance'].hist(bins=20)

In [None]:
# Row count before the duplicate filter is applied.
print(f'Number of studies: {len(df_sorted)}')

In [None]:
# Keep only the rows whose distance to the preceding text is greater than 3;
# rows at distance 3 or less are treated as duplicates and dropped.
df_sorted = df_sorted.loc[df_sorted['distance'] > 3]
print(f'Number of studies after duplicates removal: {len(df_sorted)}')

## Save results

In [None]:
# Write the filtered frame to CSV without the index column.
df_sorted.to_csv('/kaggle/working/cord_metadata_cleaned.csv', index=False)