In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import json
import os
from tqdm import tqdm,tqdm_notebook

In [None]:
# Load up to 5000 articles from the pdf_json folder.
# Each article is stored as [paper_id, title, abstract, full_text] in `docs`.

path_data = '../input/CORD-19-research-challenge/document_parses/pdf_json'
count = 0
docs = []
# Sorted so that the same subset of files is selected on every run
# (os.listdir returns entries in arbitrary order).
for file in tqdm(sorted(os.listdir(path_data))):
    file_path = f"{path_data}/{file}"
    # Context manager so the file handle is closed as soon as it is parsed
    with open(file_path, "rb") as json_file:
        j = json.load(json_file)
    paper_id = j['paper_id']
    # Keep only the last 7 characters of the id
    # NOTE(review): truncated ids are not guaranteed to be unique
    paper_id = paper_id[-7:]
    title = j['metadata']['title']

    # The abstract is optional: fall back to an empty string when the key
    # is missing or the list of abstract paragraphs is empty
    try:
        abstract = j['abstract'][0]['text']
    except (KeyError, IndexError, TypeError):
        abstract = ""

    # Concatenate every body paragraph (no separator, as before)
    full_text = "".join(txt['text'] for txt in j['body_text'])

    docs.append([paper_id, title, abstract, full_text])

    count += 1

    if count >= 5000:
        break


In [None]:
# Load up to 5000 more articles from the pmc_json folder.
# They are appended to the `docs` list created by the previous cell.


path_data = '../input/CORD-19-research-challenge/document_parses/pmc_json'
count = 0
# Sorted so that the same subset of files is selected on every run
# (os.listdir returns entries in arbitrary order).
for file in tqdm(sorted(os.listdir(path_data))):
    file_path = f"{path_data}/{file}"
    # Context manager so the file handle is closed as soon as it is parsed
    with open(file_path, "rb") as json_file:
        j = json.load(json_file)
    paper_id = j['paper_id']
    # Keep only the last 7 characters of the id
    # NOTE(review): truncated ids are not guaranteed to be unique
    paper_id = paper_id[-7:]
    title = j['metadata']['title']

    # The abstract is optional: fall back to an empty string when the key
    # is missing or the list of abstract paragraphs is empty
    try:
        abstract = j['abstract'][0]['text']
    except (KeyError, IndexError, TypeError):
        abstract = ""

    # Concatenate every body paragraph (no separator, as before)
    full_text = "".join(txt['text'] for txt in j['body_text'])

    docs.append([paper_id, title, abstract, full_text])

    count += 1

    if count >= 5000:
        break

In [None]:
# Gather everything collected so far into a single DataFrame so the data
# is easier to manipulate: one row per article, one column per field.
doc_columns = ['paper_id', 'title', 'abstract', 'body']
my_data = pd.DataFrame(data=docs, columns=doc_columns)

# Preview the first rows
my_data.head()

In [None]:
# Number of rows currently held in my_data
len(my_data)

In [None]:
# Keywords used to decide whether an article is about COVID-19.
# The last entry was misspelled ('resperatory syndrom') and could never match.
topic = ['covid','covid19','corona','coronavirus','corona-virus','SARS','SARSCOV2','severe acute respiratory syndrome']

# The bodies still have their original casing at this point, so the match is
# done case-insensitively; otherwise "COVID-19" or "Coronavirus" would not
# match 'covid' / 'coronavirus'.
topic_lower = [keyword.lower() for keyword in topic]

labels = []

for body in tqdm(my_data["body"]):
    body_lower = body.lower()
    # 1 when at least one keyword occurs in the body, 0 otherwise
    if any(keyword in body_lower for keyword in topic_lower):
        labels.append(1)
    else:
        labels.append(0)

my_data['labels'] = labels

# Discard the articles that mention none of the keywords
my_data.drop(my_data.index[my_data['labels']==0], inplace = True)


len(my_data)

Dans ce qui suit, on va éliminer les articles qui ne sont pas en anglais, car ces derniers peuvent affecter notre modèle.

In [None]:
# getting rid of non english articles 
!pip install langdetect
from langdetect import detect
from langdetect import DetectorFactory
DetectorFactory.seed = 0

for body in tqdm(my_data['body']):
    try:
        if detect(body) != "en":
            my_data.drop(my_data.index[my_data['body']==body], inplace = True)
    except:
        my_data.drop(my_data.index[my_data['body']==body], inplace = True)


len(my_data)


Calcul du nombre de mots des articles, afin de choisir les articles qui sont riches en mots.

In [None]:
# Count the whitespace-separated words of every article body.
# (The abstract and unique-word variants below are kept disabled.)

def _word_count(text):
    """Return the number of whitespace-separated words in ``text``."""
    return len(text.strip().split())

#my_data["nb_mot_abstract"] = my_data["abstract"].apply(lambda phrase: len(phrase.strip().split()))
my_data["nb_mot_body"] = my_data["body"].map(_word_count)
#my_data["nb_mot_body_unique"] = my_data["body"].apply(lambda phrase: len(set(phrase.strip().split())))

my_data.head()

Dans notre étude, on va se limiter au texte de l'article, référencé dans le DataFrame par "body". On garde seulement les articles dont le nombre de mots dépasse 200 ; sinon, on considère l'article comme incomplet.

In [None]:
# Remove the articles whose body contains 200 words or fewer
too_short = my_data['nb_mot_body'] <= 200
my_data.drop(index=my_data.index[too_short], inplace=True)
len(my_data)

# DATA CLEANING & PREPARATION

On va tout d'abord transformer les textes en minuscules pour rechercher sans perte de données, et aussi pour éviter la sensibilité des modèles à la casse : par exemple, pour un modèle d'apprentissage automatique, **Youssef != youssef**.

In [None]:
# Lowercase every article body so that later processing is case-insensitive
my_data["body"] = my_data["body"].str.lower()

# Preview the result
my_data.head()

Maintenant, il est temps de réduire la dimensionnalité de nos données : avec ce nombre immense de mots, il est indispensable de se concentrer sur les mots importants.

In [None]:
# Reduce dimensionality.
# After tokenization the corpus contains a very large number of words, so the
# unimportant ones (stopwords such as 'the', 'is', ...) are removed to speed
# up training.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Extra words that may be missing from the standard English list but occur
# frequently in scientific articles.
extra_stopwords = [
    'common','review','describes','abstract','retrospective','chart','patients','study','may', 'g', 'show',
    'associated','results','including','high','found','one','well','among','abstract','provide', 'e', 'shown',
    'objective','background','range','features','participates','doi', 'preprint', 'copyright', 'many',
    'org', 'https', 'et','al', 'author', 'figure', 'table', 'rights', 'reserved', 'figures', 'reported',
    'permission', 'use', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.', 'thu',
    'elsevier', 'pmc', 'czi', 'editor', 'brazil', 'article', 'figures', 'tables', "the", 'a', 'all', 'thus',
    'pubmed', 'editors', 'authors', 'methods', 'method', 'result', 'paper', 'introduction', 'editor',
    'although', 'letter', 'reviews', 'papers', 'tables', 'addition', 'example', 'even', 'within', 'report',
]

# Standard English stopwords followed by the custom additions
stopwords_custom = stopwords.words('english') + extra_stopwords



In [None]:
# Remove punctuation ( ? ; , "" ...) by keeping only runs of word characters
from nltk.tokenize import RegexpTokenizer

new_data = pd.DataFrame()
# Raw string: '\w' inside a regular string literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
tokenizer_pattern = RegexpTokenizer(r'\w+')
# Tokenize each lowercased body and join the tokens back with single spaces
new_data['text'] = my_data['body'].apply(lambda x: " ".join(tokenizer_pattern.tokenize(x.lower())))
new_data.head()

In [None]:
# Remove the stopwords from the texts.
# The list is turned into a set once so that each membership test is a hash
# lookup instead of a scan of the whole stopword list for every word.
stopword_set = set(stopwords_custom)

new_data['text'] = new_data['text'].apply(lambda x: " ".join([word for word in x.split() if word not in stopword_set]))
new_data.head()

Une petite visualisation des mots les plus fréquents dans notre texte

In [None]:
from wordcloud import WordCloud

# Concatenate all the cleaned article texts into one comma-separated string
long_string = ','.join(new_data['text'].tolist())

# Configure the word cloud
wordcloud = WordCloud(
    background_color="white",
    max_words=500,
    contour_width=3,
    contour_color='steelblue',
)

# Build the cloud from the concatenated text
wordcloud.generate(long_string)

# Render it as an image (displayed as the cell output)
wordcloud.to_image()

À ce stade, il ne nous reste qu'à tokeniser les textes comme dernière étape, puis on pourra construire notre modèle.

In [None]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [None]:
# Finally, tokenize the texts

data = new_data['text'].tolist()


def tokenize_and_clean(data):
    """Lazily yield one token list per entry of ``data``.

    Each entry is converted to ``str`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True``.
    """
    for entry in data:
        tokens = gensim.utils.simple_preprocess(str(entry), deacc=True)
        yield tokens

# Materialize the generator: one list of tokens per document
words = [tokens for tokens in tokenize_and_clean(data)]

words[:1]

Création des modèles de bi-grammes et de tri-grammes

In [None]:
# Train the phrase models on the tokenized documents: `bigram` is trained on
# the raw token lists, `trigram` on the output of the bigram model.
bigram = gensim.models.Phrases(words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
# (bigram_mod is used later by make_bigrams / make_trigrams)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example: first document after applying both phrase models
print(trigram_mod[bigram_mod[words[0]]])

Lemmatisation des mots pour que le modèle comprenne les différentes variations des mots.

In [None]:
# Helper functions

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to each document of ``texts``.

    Returns a list with one transformed document per input document.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply ``bigram_mod`` then ``trigram_mod`` to each document of ``texts``.

    Both models are module-level names; returns a list with one transformed
    document per input document.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document (a sequence of tokens) is joined with spaces and passed to
    the module-level ``nlp`` pipeline, which must be defined before this
    function is called. For every resulting token whose ``pos_`` is in
    ``allowed_postags`` the ``lemma_`` is kept.

    Tag reference: https://spacy.io/api/annotation

    Returns a list with one list of lemmas per input document.
    """
    def _kept_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_kept_lemmas(sent) for sent in texts]

In [None]:
import spacy

# Build the bigrams

bigrams = make_bigrams(words)

# Load the English pipeline by package name: the 'en' shortcut link no longer
# exists in spaCy 3. Fall back to the shortcut for older installations where
# only the link is available.
try:
    nlp = spacy.load('en_core_web_sm', disable=['parser','ner'])
except OSError:
    nlp = spacy.load('en', disable=['parser','ner'])
# Raise the maximum text length accepted by the pipeline (full article bodies)
nlp.max_length = 10000000

# Lemmatization, keeping only nouns, adjectives, verbs and adverbs

lemmatized = lemmatization(bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

lemmatized[:1]

# BUILDING THE MODEL

In [None]:
from gensim.corpora import Dictionary

# Dictionary built from the lemmatized documents
id2word = Dictionary(lemmatized)

texts = lemmatized

# Bag-of-words representation of every document
corpus = list(map(id2word.doc2bow, texts))

corpus[:1]

In [None]:
# Train the LDA topic model on the bag-of-words corpus built above.
# 20 topics are requested and random_state is fixed for repeatable runs.
# NOTE(review): the remaining arguments are gensim training settings; see the
# gensim LdaModel documentation for their exact meaning before tuning them.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
from pprint import pprint

# Print the keywords of the topics learned by the model
# (the model was built with num_topics=20)
pprint(lda_model.print_topics())
# Apply the trained model to the corpus; the result is kept in doc_lda
doc_lda = lda_model[corpus]

In [None]:
# Compute perplexity.
# gensim's log_perplexity returns the per-word likelihood bound (log scale),
# not the perplexity itself; the perplexity is 2 ** (-bound).
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)  # higher (closer to 0) is better
print('Perplexity: ', np.exp2(-per_word_bound))  # lower is better

# Compute the coherence score (c_v measure) on the lemmatized documents
coherence_model_lda = CoherenceModel(model=lda_model, texts=lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# NOTE(review): recent pyLDAvis releases are believed to expose this module as
# `pyLDAvis.gensim_models` — confirm against the installed version.
import pyLDAvis
import pyLDAvis.gensim  
# NOTE(review): plt is not used in the visible part of this cell
import matplotlib.pyplot as plt
%matplotlib inline

# Visualize the topics
pyLDAvis.enable_notebook()
# Prepare the visualization from the trained model, the corpus and the dictionary
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# Display it as the cell output
vis