**<center style="color:#FBB03B; font-size: 24pt;">Work in progress...</center>**

**Build corpus**

<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

In [None]:
import datetime
import functools
import itertools
import json
import re
from collections import defaultdict

import pandas as pd
from gensim.corpora import Dictionary
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
from unidecode import unidecode

In [None]:
!pip install langdetect

In [None]:
class config:
    """Namespace holding the file-system locations used by this notebook."""

    # Root directory prepended to each paper's relative JSON path.
    INPUT_DIR = "/kaggle/input/CORD-19-research-challenge/"

    # Pickled metadata DataFrame read at the start of the "Meta" section.
    META_FN = '/kaggle/input/cord-19-step1-meta/meta_df.pkl'
    # Destination of the pickled corpus written at the end of the notebook.
    CORPUS_FN = '/kaggle/working/corpus.pkl'

In [None]:
"""
Helper functions
"""

import pickle

def save(obj, fname):
    """Serialize ``obj`` to the file ``fname`` using the highest pickle protocol."""
    with open(fname, 'wb') as sink:
        pickler = pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(obj)

def load(fname):
    """Read back and return the object pickled in the file ``fname``.

    NOTE: unpickling executes arbitrary code; only load trusted files.
    """
    with open(fname, 'rb') as source:
        unpickler = pickle.Unpickler(source)
        return unpickler.load()

#https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def chunks(lst, n):
    """Generate consecutive slices of ``lst`` holding at most ``n`` items each."""
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)


In [None]:
# Update: we only use the SARS-CoV-2 / COVID-19 keywords.
# Tokens are lower-case because they are compared against tokenized text.
cov_tokens = [
    'sars-cov-2',
    'hcov-19',
    'covid-19',
    '2019-ncov',
]

# Classes

## Cleaner

"NLP is 80% preprocessing." -Lev Konstantinovskiy ([Citation of citation](https://markroxor.github.io/gensim/static/notebooks/gensim_news_classification.html#topic=0&lambda=1&term=))

Clean the papers' text, e.g. by removing URLs and license boilerplate.

In [None]:
class Cleaner():
    """Regex-based cleaner for paper text.

    ``clean`` transliterates the text to ASCII, optionally normalises
    virus/disease names, replaces URLs with the placeholder ``URL``, drops
    license boilerplate, very long "words", bracketed/parenthesised content,
    ``{...}`` groups and LaTeX commands, and pads ``/`` and ``=`` with spaces.

    Parameters
        is_recent : bool, when True the virus and disease names found in the
            text are rewritten to their official spelling (except for
            bibliography titles, see ``clean``).
    """

    def __init__(self, is_recent=False):
        self.is_recent = is_recent

    re_flags = re.ASCII

    # https://github.com/jfilter/clean-text/blob/master/cleantext/constants.py
    # NOTE: the host/domain/TLD character classes use single-backslash
    # ``\u00a1-\uffff`` escapes. The doubled backslashes of the copied pattern
    # were parsed as a literal backslash plus a ``1-\`` range, which wrongly
    # admitted characters such as ``:;<=>?@[`` and digits into the TLD.
    URL_REGEX = re.compile(
        r"(?:^|(?<![\w\/\.]))"
        # protocol identifier
        # r"(?:(?:https?|ftp)://)"  <-- alt?
        r"(?:(?:https?:\/\/|ftp:\/\/|www\d{0,3}\.))"
        # user:pass authentication
        r"(?:\S+(?::\S*)?@)?" r"(?:"
        # IP address exclusion
        # private & local networks
        r"(?!(?:10|127)(?:\.\d{1,3}){3})"
        r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
        r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
        # IP address dotted notation octets
        # excludes loopback network 0.0.0.0
        # excludes reserved space >= 224.0.0.0
        # excludes network & broadcast addresses
        # (first & last IP address of each class)
        r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
        r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
        r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
        r"|"
        # host name
        r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
        # domain name
        r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*"
        # TLD identifier
        r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))" r")"
        # port number
        r"(?::\d{2,5})?"
        # resource path
        r"(?:\/[^\)\]\}\s]*)?",
        # r"(?:$|(?![\w?!+&\/\)]))",
        # @jfilter: I removed the line above from the regex because I don't understand what it is used for, maybe it was useful?
        # But I made sure that it does not include ), ] and } in the URL.
        flags=re_flags | re.IGNORECASE,
    )

    # Drop license information.
    # The optional parts are non-capturing groups "(?:...)"; they used to be
    # written "(:?...)", i.e. a capturing group starting with an optional colon.
    LICENSE1_REGEX = re.compile(r'\(\W*which\W*was\W*(?:[^)])+reviewed\)\W+', flags=re_flags)
    LICENSE2_REGEX = re.compile(r'CC-(?:CC0-?|BY-?|SA-?|NC-?|ND-?)+\s+\d?\s?\.?\d?\s+', flags=re_flags)
    LICENSE3_REGEX = re.compile(r'International license (?:It )is made available under a\s+', flags=re_flags)
    LICENSE4_REGEX = re.compile(r'author/funder\s*|(?:URL )?doi: \w+ preprint', flags=re_flags)
    LICENSE5_REGEX = re.compile(r'The copyright holder for this preprint(?: is the)?\s*\.?\s*(?:URL\s*)?', flags=re_flags)
    LICENSE6_REGEX = re.compile(r'who has granted \w+ a license to display the preprint in perpetuity\.?', flags=re_flags)
    LICENSE7_REGEX = re.compile(r'BY-(?:SA-?|NC-?|ND-?)+\s+\d?\s?\.?\d?\s+', flags=re_flags)
    LICENSE8_REGEX = re.compile(r'No reuse allowed without permission\.?', flags=re_flags)
    LICENSE9_REGEX = re.compile(r'All rights reserved\.?', flags=re_flags)

    # Virus, disease names
    # https://en.wikipedia.org/wiki/Novel_coronavirus
    vn1 = r'sars\W{,2}cov\W{,2}2' # SARS-CoV-2
    vn2 = r'hcov\W{,2}(?:20)?19'  # HCoV-2019
    vn3 = r'(?:20)?19\W{,2}ncov'  # 2019-nCoV
    # vn4= r'Novel coronavirus' # a bit risky
    VIRUS_REGEX = re.compile(fr'{vn1}|{vn2}|{vn3}', flags=re.IGNORECASE|re_flags)
    DISEASE_REGEX = re.compile(r'covid\W{,2}(?:20)?19', flags=re.IGNORECASE|re_flags)

    # Warning: the patterns below drop useful information.
    # TODO: Explain why
    LONG_REGEX = re.compile(r'[^\s]{64,}', flags=re_flags)
    # Should become recursive: only innermost [...] and (...) are matched.
    IN_BRACKETS_REGEX = re.compile(r'\[[^\[\]]+\]', flags=re_flags)
    IN_PARENTHESES_REGEX = re.compile(r'\([^()]+\)', flags=re_flags)

    LATEX_BEGIN = re.compile(r'\\begin', flags=re_flags | re.IGNORECASE)
    LATEX_END = re.compile(r'\\end', flags=re_flags | re.IGNORECASE)

    MULTI_SPACE = re.compile(r' +', flags=re_flags)

    # (pattern, replacement) pairs applied once, in this order.
    regex_pre = [
        (URL_REGEX, ' URL '),

        (LICENSE1_REGEX, ' '),
        (LICENSE2_REGEX, ' '),
        (LICENSE3_REGEX, ' '),
        (LICENSE4_REGEX, ' '),
        (LICENSE5_REGEX, ' '),
        (LICENSE6_REGEX, ' '),
        (LICENSE7_REGEX, ' '),
        (LICENSE8_REGEX, ' '),
        (LICENSE9_REGEX, ' '),

        # \begin ... \end become a {...} pair, removed by regex_rec below.
        (LATEX_BEGIN, ' {'),
        (LATEX_END, '} '),

        (LONG_REGEX, ' '),
        (IN_BRACKETS_REGEX, ' '),
        (IN_PARENTHESES_REGEX, ' '),

        (MULTI_SPACE, ' ')
    ]

    # Applied repeatedly to drop everything inside {}, innermost group first.
    regex_rec = re.compile(r'\{[^\{\}]*\}', flags=re_flags)

    # Post processing
    # Drop Latex commands like: \begin, \end...
    # Add space around / and =
    #     This breaks some chemical names
    #     TODO: Show examples and/or improve
    LATEX_CMD = re.compile(r'\\[^\s]+', flags=re_flags)
    AND_REGEX = re.compile(r'/', flags=re_flags)
    EQUAL_REGEX = re.compile(r'=', flags=re_flags)
    regex_post = [
        (LATEX_CMD, ' '),
        (AND_REGEX, ' / '),
        (EQUAL_REGEX, ' = ')
    ]

    def clean(self, txt, is_bib=False):
        """
        Clean a block of text.

        Parameters
            txt : str, raw text
            is_bib : bool, True when ``txt`` is a bibliography title; virus and
                disease names are then left untouched
        returns
            str, the cleaned text with leading/trailing whitespace stripped
        """

        # ASCII transliterations of Unicode text
        txt = unidecode(txt)

        # TODO: make apply regex function

        # Change virus, disease name to official names only if recent publication and not bibliography title
        if self.is_recent and not is_bib:
            txt = self.VIRUS_REGEX.sub(' SARS-CoV-2 ', txt)
            txt = self.DISEASE_REGEX.sub(' COVID-19 ', txt)

        for regex, replace_with in self.regex_pre:
            txt = regex.sub(replace_with, txt)

        # Peel nested {...} groups one level per pass; stop as soon as a pass
        # changes nothing, and after 20 passes at most.
        for _ in range(20):
            stripped = self.regex_rec.sub(' ', txt)
            if stripped == txt:
                break
            txt = stripped

        for regex, replace_with in self.regex_post:
            txt = regex.sub(replace_with, txt)

        return txt.strip()

## Tokenizer

Split text into sentences, then into words...

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

# One shared lemmatizer instead of building a new one for every token.
_LEMMATIZER = WordNetLemmatizer()


@functools.lru_cache(maxsize=1 << 18)
def lemmatize(word):
    """
    WordNet lemmatizer: try verb, noun and adjective and return the shortest
    lemma. The verb lemma comes first, so it wins ties.

    Results are memoised (bounded LRU cache) because the same tokens recur
    throughout the corpus and each call performs three lemmatizer look-ups.
    ``word`` must therefore be hashable (tokens are str).
    """
    candidates = [_LEMMATIZER.lemmatize(word, pos=pos) for pos in ('v', 'n', 'a')]

    # min() returns the first of equally short candidates.
    return min(candidates, key=len)

"""
Common useless words
"""
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
# Union of the NLTK and scikit-learn English stop word lists.
stop_words = set(stopwords.words('english')).union(ENGLISH_STOP_WORDS)

# what is: ed
# Extra tokens: contraction leftovers, abbreviations, citation words,
# figure/table captions and URL residue.
stop_words |= {
    "'s", "n't",
    "e.g.", "e.g", "i.e.", "i.e", "etc.", "etc",
    "co.", "ltd.",
    "et", "al", "al.", "dr.",
    "table", "figure", "fig", "fig.",
    "url",
    "http", "https",
}


# BAD code rewrite
# Loose "number" matcher: digits mixed with punctuation and the letters e/x
# (e.g. "1e5", "10-20", "3x"), optionally followed by a plural "s" ("1990s").
number = r'[\d\Wex]+' # fake number, but do the job
# The look-ahead requires at least one digit; without it digit-free words
# such as "ex", "exes" or "es" were classified as numbers and dropped.
number_re = re.compile(fr'^(?=\D*\d){number}s?$')
# Tokens without any ASCII letter or digit (pure punctuation).
not_letters_digits_re = re.compile(r'^[^a-zA-Z0-9]+$')
# A single letter, optionally followed by a dot ("a", "b.").
single_re = re.compile(r'^[a-zA-Z]\.?$')

def valid_token(token):
    """
    Tell whether ``token`` should be kept in a tokenized sentence.

    A token is rejected when it is a stop word, looks like a number, holds no
    letter or digit, is a single letter, or has a length outside 2..69.

    returns
        bool
    """
    if not 1 < len(token) < 70:
        return False
    if token in stop_words:
        return False

    rejected = (number_re.match(token) or
                not_letters_digits_re.match(token) or
                single_re.match(token))
    return not rejected


# Rewrite properly, at the right place (tokenizer class)
# Leading non-word characters followed by the rest of the token.
a_reg = re.compile(r"^[\W]+(.+)$")
# Leading numeric prefix followed by a remainder that does not end in "ncov".
reg = r'^(\d+[.e\d\W]+)((?!.*ncov$).+)'
b_reg = re.compile(reg)
# Trailing run of '-' / '*'. The prefix group is lazy so that the WHOLE run is
# removed; a greedy prefix left all but the last character ("word**" -> "word*").
c_reg = re.compile(r'^(.*?)[-*]+$')

def post(token):
    """
    Strip noise from the edges of a token.

    Parameters
        token : str
    returns
        str, the token without leading non-word characters, leading numeric
        prefix and trailing '-' / '*' characters
    """
    # Drop non-word characters at the beginning: "-+,=sometext" -> "sometext"
    token = a_reg.sub(r'\1',token)
    # "10km" -> "km", "10-years-old" -> "years-old" but keep "2019-ncov"
    token = b_reg.sub(r'\2',token)
    # "sometext-" -> "sometext", "word**" -> "word"
    token = c_reg.sub(r'\1',token)

    return token


## Sentence

Sentence class, words container...

In [None]:
class Sentence():
    """
    A sentence: its whitespace-normalised original text plus its tokens.

    Two sentences are equal (and hash alike) when their normalised original
    texts are equal. The bag-of-words views are computed lazily from the
    module-level ``dictionary`` and cached.
    """
    ws_reg = re.compile(r'\s+')

    def __init__(self, text):
        # ~ Original text, runs of whitespace collapsed to one space
        self.original_text = self.ws_reg.sub(' ', text)

        # List of str (tokens), filled by tokenize()
        self._text = []

        # Lazy init
        self._bow = None
        # Sentence text tokens id
        self._tokensid = None
        # Unique tokens id
        self._tokensid_set = None

    def _reset_cache(self):
        # The cached views derive from the tokens; drop them when tokens change.
        self._bow = None
        self._tokensid = None
        self._tokensid_set = None

    def tokenize(self):
        """Tokenize the original text in place: lower-case, post(), lemmatize(), filter."""
        # Rewrite properly, at the right place (tokenizer class)
        raw_tokens = word_tokenize(self.original_text, preserve_line=True)
        normalized = (lemmatize(post(token.lower())) for token in raw_tokens)
        # Slice assignment keeps the same list object, as before.
        self.text[:] = [token for token in normalized if valid_token(token)]
        self._reset_cache()

    @property
    def tokensid(self):
        """Token ids of the sentence, taken from its bag of words."""
        if self._tokensid is None:
            self._tokensid = [tokenid for tokenid, _ in self.bow]
        return self._tokensid

    @property
    def tokensid_set(self):
        """Set of the token ids of the sentence."""
        if self._tokensid_set is None:
            self._tokensid_set = set(self.tokensid)
        return self._tokensid_set

    @property
    def bow(self):
        """Bag of words built with the module-level ``dictionary`` (cached)."""
        if self._bow is None:
            self._bow = dictionary.doc2bow(self.text)
        return self._bow

    @property
    def text(self):
        """List of tokens."""
        return self._text

    @text.setter
    def text(self, text):
        self._text = text
        # A new token list makes any cached bag of words stale.
        self._reset_cache()

    def __hash__(self):
        return hash(self.original_text)

    def __eq__(self, other):
        # Let Python fall back to its default handling for foreign types
        # instead of failing on a missing attribute.
        if not isinstance(other, Sentence):
            return NotImplemented
        return self.original_text == other.original_text

    def __repr__(self):
        return self.original_text


## Document

Document class, sentences container...

In [None]:
class Document():
    """
    A document: a list of unique Sentence objects built from text blocks.

    Parameters
        txt_blocks : str or list of str, each block is split into sentences
            with ``sent_tokenize``
    """

    def __init__(self, txt_blocks):
        if not isinstance(txt_blocks, list):
            txt_blocks = [txt_blocks]

        # Lazy init
        # Document BoW
        self._bow = None
        # Document text tokens id
        self._tokensid = None
        # Unique tokens id
        self._tokensid_set = None

        sentences = (Sentence(sentence)
                     for txt in txt_blocks
                     for sentence in sent_tokenize(txt))
        # Deduplicate while keeping first-seen order: going through a set made
        # the sentence order depend on string hash randomisation.
        self.sentences = list(dict.fromkeys(sentences))

    def tokenize(self):
        """Tokenize every sentence in place."""
        for sent in self.sentences:
            sent.tokenize()

    # Common with Sentence class, merge them.
    @property
    def bow(self):
        """Bag of words built with the module-level ``dictionary`` (cached)."""
        if self._bow is None:
            self._bow = dictionary.doc2bow(self.text)
        return self._bow

    @property
    def tokensid(self):
        """Token ids of the document, taken from its bag of words."""
        if self._tokensid is None:
            self._tokensid = [tokenid for tokenid, _ in self.bow]
        return self._tokensid

    @property
    def tokensid_set(self):
        """Set of the token ids of the document."""
        if self._tokensid_set is None:
            self._tokensid_set = set(self.tokensid)
        return self._tokensid_set
    # End common

    @property
    def text(self):
        """Tokens of all sentences, concatenated into a new list on each access."""
        return list(itertools.chain.from_iterable(sentence.text for sentence in self))

    def __iter__(self):
        return iter(self.sentences)

    def __len__(self):
        return len(self.sentences)


## Paper

Paper class, Document subclass plus some paper information like (title, doi, abstract)

In [None]:
# Paper ex FileReader class, they are so different now but it helps!
# https://www.kaggle.com/maksimeren/covid-19-literature-clustering
class Paper(Document):
    """
    A Document plus paper metadata (title, doi, abstract, publication time,
    bibliography titles). Use ``Paper().from_json(row)`` to fill it from a
    metadata row and its JSON file.
    """

    def __init__(self, txt_blocks=None):
        # Initialise the Document part and the metadata so that a paper that
        # has not been filled yet is still a valid (empty) object.
        super().__init__([] if txt_blocks is None else txt_blocks)

        self.file_path = None
        self.doi = None
        self.publish_time = None
        self.title = None
        self.abstract = None
        self.bib_entries = []

    def get_content(self):
        """Load and return the parsed JSON file of the paper."""
        with open(f'{config.INPUT_DIR}/{self.file_path}', encoding='utf-8') as file:
            return json.load(file)

    def add_entry(self, container, text, cleaner=None, is_bib=False, min_len=24, max_len=2e4):
        """
        Add ``text`` to ``container`` when its stripped length is strictly
        between ``min_len`` and ``max_len``.

        Parameters
            container : set-like object with an ``add`` method
            text : str; any other value (None, NaN...) is ignored
            cleaner : Cleaner or None, applied to the text before adding it
            is_bib : bool, forwarded to ``cleaner.clean``
        returns
            the added (cleaned) text, or None when nothing was added
        """
        # Missing metadata shows up as None/NaN rather than str.
        if not isinstance(text, str):
            return None

        text = text.strip()

        if not min_len < len(text) < max_len:
            return None

        if cleaner is not None:
            text = cleaner.clean(text, is_bib)
        container.add(text)

        return text

    def from_json(self, row):
        """
        Parse JSON paper
        Better to put this function away from this class

        Parameters
            row : Pandas Series (title, doi, abstract, publish_time, pmc_json_files)
        returns
            Paper
        """

        # Paper text blocks
        txt_blocks = set()

        self.file_path = row.pmc_json_files
        self.doi = row.doi

        # Epoch time
        self.publish_time = (row.publish_time - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

        # Old code, now all papers are recent
        # The epoch above is computed against 1970-01-01 without a timezone,
        # so convert it back in UTC rather than in the machine's local zone.
        pub_date = datetime.datetime.fromtimestamp(self.publish_time, tz=datetime.timezone.utc).date()
        is_recent = pub_date > datetime.date(2019, 12, 1)
        cleaner = Cleaner(is_recent)

        self.title = self.add_entry(txt_blocks, row.title, cleaner)
        self.abstract = self.add_entry(txt_blocks, row.abstract, cleaner)

        content = self.get_content()

        for entry in content['body_text']:
            self.add_entry(txt_blocks, entry['section'], cleaner) #sub title
            self.add_entry(txt_blocks, entry['text'], cleaner)

        for ref in content['ref_entries'].values():
            self.add_entry(txt_blocks, ref['text'], cleaner)

        # Not used now
        # We keep bib entries to build a graph, and set PageRank
        #     We need prior paper relevance, can be combination of authority and newest
        # Make it outside this class
        # Remember to not change papers title use 'cleaner.clean(text, is_bib=True)'
        bib_entries = set()
        for bib in content['bib_entries'].values():
            self.add_entry(bib_entries, bib['title'], cleaner, is_bib=True)
        self.bib_entries = list(bib_entries)

        # When debugging keep paper text...
        #self.txt_blocks = list(txt_blocks)

        # Build the sentences from the collected blocks, then tokenize them.
        super().__init__(list(txt_blocks))
        self.tokenize()

        return self

    def __repr__(self):
        # The abstract is None when it was missing or rejected by add_entry.
        abstract = self.abstract or ''
        return f'{self.title}:\n{abstract[:200]}...\n'


## Corpus

Corpus class, documents container.

In [None]:
"""
Corpus class
"""
class Corpus:
    """
    Container of documents supporting iteration, ``len`` and indexing by
    integer (returns a document), slice or list of integers (returns a Corpus).
    """

    def __init__(self, docs):
        # List of Document
        self.docs = docs

        # Lazy init
        # Corpus Term frequency & Term relative frequency
        # Shape [n, m]
        #   n = #documents
        #   m = #terms
        self._TF = None
        self._TRF = None

    @property
    def TF(self):
        """Dense [n documents, m terms] term-frequency array (cached)."""
        if self._TF is None:
            # Imported here: the notebook's import cell provides neither name.
            import numpy as np
            from gensim import matutils

            # Like in gensim:docsim:SparseMatrixSimilarity
            # But transform to dense array... explain!
            # corpus2csc is transposed to get one row per document.
            sparse_tf = matutils.corpus2csc([doc.bow for doc in self], num_terms=len(dictionary)).T
            self._TF = np.asarray(sparse_tf.todense())

        return self._TF

    @property
    def TRF(self):
        """Term frequencies divided by the total of their document row (cached)."""
        if self._TRF is None:
            import numpy as np

            row_totals = self.TF.sum(axis=1).reshape(-1, 1)
            # A document without any known term has a zero total: divide by 1
            # so that its row stays all zeros instead of becoming NaN.
            safe_totals = np.where(row_totals == 0, 1, row_totals)
            self._TRF = self.TF / safe_totals

        return self._TRF

    @property
    def bow(self):
        """List of the documents' bags of words."""
        return [doc.bow for doc in self.docs]

    @property
    def text(self):
        """List of the documents' token lists."""
        return [doc.text for doc in self.docs]

    # https://gaopinghuang0.github.io/2018/11/17/python-slicing
    def __getitem__(self, key):
        """
        Parameters
            key : slice or list of integers -> Corpus of the selected documents
                  integer (anything implementing __index__) -> one document
        raises
            TypeError for any other key, or a list holding non-integers
        """
        if isinstance(key, slice):
            start, stop, step = key.indices(len(self))
            return Corpus([self.docs[i] for i in range(start, stop, step)])

        if isinstance(key, list):
            # An empty list selects nothing and gives an empty Corpus.
            if not all(hasattr(i, '__index__') for i in key):
                raise TypeError('Only list of integers supported')
            return Corpus([self.docs[i] for i in key])

        # Plain ints as well as NumPy integers implement __index__.
        if hasattr(key, '__index__'):
            return self.docs[key]

        raise TypeError(f'Invalid argument type: {type(key)}')

    def __iter__(self):
        return iter(self.docs)

    def __len__(self):
        return len(self.docs)

# Meta

In [None]:
meta_df  = pd.read_pickle(config.META_FN)

In [None]:
meta_df.head()

In [None]:
len(meta_df)

## Keep recent publications

In [None]:
meta_df = meta_df.query("publish_time > '2019/12/01'")

len(meta_df)

## Keep English papers

Language detection is only probabilistic, so we will drop some English papers and keep some non-English ones!


In [None]:
%%time

from langdetect import detect
from langdetect import detect_langs
from langdetect import DetectorFactory
DetectorFactory.seed = 0

def get_langid(txt):
    """
    Return the language code detected for ``txt``, or 'unknown' when no
    language can be detected (missing value, blank text, or text in which
    langdetect finds no usable feature).
    """
    # Missing metadata (None/NaN) and blank strings cannot be detected.
    if not isinstance(txt, str) or not txt.strip():
        return 'unknown'

    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(txt)
    except LangDetectException:
        # One undetectable text must not abort the whole DataFrame.apply().
        return 'unknown'

# meta_df comes out of DataFrame.query(); adding the columns with assign()
# builds a new frame instead of writing into that filtered result, which
# triggers pandas' SettingWithCopyWarning.
meta_df = meta_df.assign(
    abstract_langid=meta_df['abstract'].apply(get_langid),
    title_langid=meta_df['title'].apply(get_langid),
)

meta_df['abstract_langid'].value_counts()

In [None]:
meta_df = meta_df.query("(title_langid=='en') and (abstract_langid=='en')")

len(meta_df)

# Parse papers

Get the papers. Please note the multiprocessing backend: it is one of the ways to serialize the output...

In [None]:
def get_paper(row):
    """Build a Paper from one metadata row by parsing its JSON file."""
    return Paper().from_json(row)

In [None]:
def get_papers():
    """Parse every row of ``meta_df`` into a Paper, using all cores."""
    with Parallel(n_jobs=-1, backend='multiprocessing') as pool:
        rows = tqdm(meta_df.iterrows(), total=len(meta_df))
        tasks = (delayed(get_paper)(row) for _, row in rows)
        return pool(tasks)

In [None]:
%%time

papers = get_papers()

In [None]:
len(papers)

# Clean

Drop papers with low content

In [None]:
# Keep, in place, the papers that have a title, an abstract, more than two
# bibliography entries and more than two sentences.
papers[:] = [
    paper for paper in papers
    if paper.title is not None
    and paper.abstract is not None
    and len(paper.bib_entries) > 2
    and len(paper.sentences) > 2
]

In [None]:
len(papers)

# COVID-19 papers

In [None]:
def get_covid_papers(covid):
    """
    Yield the papers of ``papers`` according to their COVID status.

    A paper is a COVID paper when at least one of ``cov_tokens`` appears among
    its tokens.

    Parameters
        covid : truthy to yield the COVID papers, falsy to yield the others
    """
    wanted = set(cov_tokens)
    want_covid = bool(covid)

    for paper in tqdm(papers):
        # paper.text is rebuilt on every access: read it once and test all
        # keywords in a single pass, stopping at the first hit.
        is_covid = not wanted.isdisjoint(paper.text)
        if is_covid == want_covid:
            yield paper

In [None]:
# Materialize the COVID papers selected by the generator.
covid_papers = list(get_covid_papers(True))

In [None]:
len(covid_papers)

# Merge known words

Use WordNet to merge consecutive words (a word plus up to 4 following words) into one token.  
Ex:  
- world health organization -> world_health_organization  
- severe acute respiratory syndrome -> severe_acute_respiratory_syndrome  
- health care -> health_care  
- but not this one -> but not this one  

In [None]:
from nltk.corpus import wordnet as wn

In [None]:
"""
Clean clean then clean
"""
def merge_words(paper, look_ahead=4):
    """
    Merge, in place, runs of consecutive tokens known to WordNet.

    For each position the longest run is tried first: the token plus up to
    ``look_ahead`` following tokens, joined with '_'. The first joined form
    for which WordNet returns at least one synset replaces the run; when none
    matches, the single token is kept.

    Parameters
        paper : iterable of sentences exposing a writable ``text`` token list
        look_ahead : int, maximum number of following tokens to merge with
    """
    for sentence in paper:
        tokens = sentence.text
        n = len(tokens)

        merged = []
        i = 0
        while i < n:
            # Never look past the end of the sentence.
            longest = min(look_ahead, n - i - 1)
            for extra in range(longest, 0, -1):
                ngram = '_'.join(tokens[i:i + extra + 1])
                if wn.synsets(ngram):
                    # Found a match: consume the whole run
                    merged.append(ngram)
                    i += extra + 1
                    break
            else:
                # No known multi-word expression starts here
                merged.append(tokens[i])
                i += 1

        sentence.text = merged

In [None]:
%%time
for paper in covid_papers:
    merge_words(paper)

# Save

- Build dictionary
- Filter dictionary
- Build corpus

In [None]:
dictionary = Dictionary([paper.text for paper in covid_papers])

len(dictionary)

In [None]:
dictionary.filter_extremes(no_below=20, no_above=0.8)

len(dictionary)

In [None]:
corpus = Corpus(covid_papers)
corpus.dictionary = dictionary

In [None]:
save(corpus, config.CORPUS_FN)

In [None]:
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')