# Covid : LDA model and a GraphDB

In this notebook I describe the approach I used to build an LDA model for topic detection and then ingest the dataset from metadata.csv into a GraphDB. For this purpose I used Neo4j, which can easily be installed on any computer. I prefer this solution to the common Python libraries because it is closer to my real job.

I'll try to keep the results of this analysis up to date. At the end, you will find the next steps and the things I still want to explore.

There are already many notebooks on LDA and on the analysis of the metadata file. I want to list my main sources of inspiration. These are great notebooks; I suggest you read and learn from (and upvote) them.

References:
* [Topic modeling finding related articles by Daniel Wolffram](https://www.kaggle.com/danielwolffram/topic-modeling-finding-related-articles)
* [Tools and Knowledge Graphs by Shahules786](https://www.kaggle.com/shahules/cord-tools-and-knowledge-graphs)
* [Topic Modeling with Gensim (web - Tutorial)](https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/)

N.B: this is my first notebook published on Kaggle. Any suggestion is welcome :)

### Import Packages

In [1]:
import pandas as pd
from tqdm import tqdm
from functools import reduce
import numpy as np
import time
import re

#NLP stuff
import unicodedata
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import ToktokTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from gensim import corpora
import gensim

# Visualization
import pyLDAvis
import matplotlib.pyplot as plt
%matplotlib inline

#Data Ingestion to Neo4j
from py2neo import Graph
from py2neo import Node,Relationship
from py2neo import NodeMatcher


### Functions

In this section I define the functions to clean the text in the following sections. 
In particular: stemmer, stop words, lemmatizer and tokenizer. I plan to test different tokenizers in the future.

In [None]:
# Shared NLP helpers used by the text-cleaning and corpus-building functions below.
# English Snowball stemmer.
stemmer = SnowballStemmer("english", ignore_stopwords = True)
# English stop words, with the negations 'no' and 'not' taken out of the list
# so that they are NOT filtered from the text.
stopWordList=stopwords.words('english')
stopWordList.remove('no')
stopWordList.remove('not')
# WordNet lemmatizer and Toktok tokenizer instances reused by every function.
lemma=WordNetLemmatizer()
token=ToktokTokenizer()

In [None]:
def stopWordsRemove(text):
    """Drop stop words from ``text``.

    The text is tokenized with the shared ``token`` tokenizer; every token is
    lower-cased and stripped, and tokens found in ``stopWordList`` are removed.

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept = []
    for raw in token.tokenize(text):
        word = raw.lower().strip()
        if word not in stopWordList:
            kept.append(word)
    return ' '.join(kept)
def stemWords(text):
    """Stem every token of ``text``.

    Tokens come from the shared ``token`` tokenizer, are lower-cased and
    stripped, and are then passed through the shared ``stemmer``.

    Returns:
        The stems joined by single spaces.
    """
    return ' '.join(
        stemmer.stem(piece.lower().strip()) for piece in token.tokenize(text)
    )
def removeAscendingChar(data):
    """Reduce ``data`` to plain ASCII.

    The string is NFKD-normalized, so accented letters are split into a base
    letter plus combining marks; everything that cannot be encoded as ASCII is
    then discarded.

    Returns:
        The ASCII-only string.
    """
    decomposed = unicodedata.normalize('NFKD', data)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')
# Characters that removeCharDigit replaces with a blank: punctuation, a
# backslash, two typographic quote characters and the decimal digits.  The
# backslash is written as an explicit '\\' escape (a bare backslash in front of
# a non-escape character is an invalid escape sequence).
_CHARS_TO_BLANK = '`~@#$%&*()[!{”;:\\’><.,/?”}]0123456789'
# Translation table built once so each call is a single pass over the text.
_BLANKING_TABLE = str.maketrans({ch: ' ' for ch in _CHARS_TO_BLANK})


def removeCharDigit(text):
    """Replace punctuation and digits in ``text`` with spaces.

    Every character listed in ``_CHARS_TO_BLANK`` becomes a single space, so
    the result always has the same length as the input.  All other characters
    (including '-', the ASCII quotes and whitespace) are left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    return text.translate(_BLANKING_TABLE)

### Retrieve Data

In [None]:
# Locations used by the rest of the notebook: the directory the input is read
# from, the directory results are written to, and the input file name.
source_path = '../Data/'
end_path = '../Data/'
filename = 'metadata.csv'

In [None]:
# Load the metadata file and keep the original row number in a 'key' column.
df = pd.read_csv(source_path+filename)
df['key'] = df.index

### Data Processing

I had to define a key column to join later the results with the original dataset. 

In [None]:
# Work on an explicit copy: assigning a column on a slice of `df` is ambiguous
# (view vs. copy) and triggers pandas' SettingWithCopyWarning.
papers = df[['title','abstract','key']].copy()
# Remove every literal occurrence of 'Abstract ' (plain-text match, not a regex).
papers['abstract'] = papers['abstract'].str.replace('Abstract ','', regex=False)
# Papers without a title are dropped; remaining missing values (abstracts)
# become a single blank so the string functions below always receive text.
papers = papers.dropna(how='all',subset=['title'])
papers = papers.fillna(value=' ')
# Keep one row per (title, abstract) pair.
papers = papers.drop_duplicates(['title','abstract'])

In [None]:
columns = ['title','abstract']

# Clean each text column in three passes: reduce to ASCII, blank out
# punctuation/digits, then remove stop words and stem what is left.
for column in tqdm(columns):
    ascii_only = papers[column].apply(removeAscendingChar)
    without_symbols = ascii_only.apply(lambda cell: removeCharDigit(str(cell)))
    papers[column] = without_symbols.apply(
        lambda cell: stemWords(stopWordsRemove(str(cell)))
    )
# The text used for modelling is the cleaned title followed by the cleaned abstract.
papers['text'] = papers['title'] + ' ' + papers['abstract']

### Build Corpus

In [None]:
def build_corpus(df,column):
    """Turn the non-null values of ``df[column]`` into lists of lemmas.

    Each value is tokenized with the shared ``token`` tokenizer; tokens found
    in ``stopWordList`` and tokens of two characters or fewer are discarded,
    and the rest are lemmatized with the shared ``lemma`` lemmatizer.

    Returns:
        A list with one list of lemmas per non-null value, in row order.
    """
    documents = []
    for data in tqdm(df[column].dropna()):
        candidates = (tok for tok in token.tokenize(data) if tok not in stopWordList)
        documents.append([lemma.lemmatize(tok) for tok in candidates if len(tok) > 2])
    return documents
def prepare_text(text):
    """Convert a single text into the list of lemmas used by the model.

    ``text`` is converted to ``str``, reduced to ASCII with
    ``removeAscendingChar`` and tokenized; stop words and tokens of two
    characters or fewer are skipped and the remaining tokens are lemmatized.

    Returns:
        The list of lemmas.
    """
    ascii_text = removeAscendingChar(str(text))
    lemmas = []
    for tok in token.tokenize(ascii_text):
        if tok in stopWordList or len(tok) <= 2:
            continue
        lemmas.append(lemma.lemmatize(tok))
    return lemmas

In [None]:
# One list of lemmas per row of papers['text'], in row order.
text_corpus = build_corpus(papers,'text')

In [None]:
# Build the gensim token dictionary from the tokenised documents.
dictionary = corpora.Dictionary(text_corpus)
# Prune the vocabulary by document frequency.
# NOTE(review): presumably keeps tokens present in at least 3 documents and in
# at most 99% of them - confirm against the gensim documentation.
dictionary.filter_extremes(no_below = 3, no_above = 0.99)
# Persist the dictionary next to the model files.
dictionary.save('../Model/dictionary')

Save Dictionary for later use

In [None]:
# Bag-of-words representation of every tokenised document.
corpus = [dictionary.doc2bow(text) for text in text_corpus]
# Attach the token lists to `papers`.  The Series is given papers' own index:
# rows were dropped from `papers` earlier, so its index has gaps, and a Series
# with a default 0..n-1 index would be aligned by label, pairing token lists
# with the wrong papers and adding rows that are NaN in every other column.
papers = pd.concat([papers,pd.Series(text_corpus,index = papers.index,name = 'text_corpus')], axis = 1)

### Find Optimal Number of Topics

In [None]:
from gensim.models import CoherenceModel,LdaModel,HdpModel
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """Train one LDA model per candidate topic count and score each of them.

    Candidate topic counts are ``range(start, limit, step)``.  For every
    candidate an ``LdaModel`` is trained on ``corpus`` and a ``CoherenceModel``
    with ``coherence='c_v'`` is evaluated on ``texts``.

    Args:
        dictionary: Token dictionary passed as ``id2word`` / ``dictionary``.
        corpus: Bag-of-words corpus the models are trained on.
        texts: Tokenised documents used for the coherence computation.
        limit: Exclusive upper bound of the topic counts.
        start: First topic count.
        step: Increment between topic counts.

    Returns:
        ``(model_list, coherence_values)``: the trained models and their
        coherence scores, both in candidate order.
    """
    model_list = []
    coherence_values = []
    for topic_count in tqdm(range(start, limit, step)):
        candidate = LdaModel(corpus=corpus, num_topics=topic_count, id2word=dictionary)
        scorer = CoherenceModel(model=candidate, texts=texts, dictionary=dictionary, coherence='c_v')
        model_list.append(candidate)
        coherence_values.append(scorer.get_coherence())

    return model_list, coherence_values

In [None]:
def get_best_model(models, score_list, numbers):
    """Pick the model with the highest score.

    When several entries share the highest score, the first one wins.  The
    winning position is printed.

    Args:
        models: Candidate models, parallel to ``score_list``.
        score_list: Score of each candidate.
        numbers: Topic count of each candidate, parallel to ``score_list``.

    Returns:
        ``(numbers[k], models[k])`` for the winning position ``k``.
    """
    top_score = max(score_list)
    k = min(pos for pos, score in enumerate(score_list) if score == top_score)
    print("The best model is number:", k)
    return numbers[k], models[k]

In [None]:
# Candidate topic counts are range(START_NUM, LIMIT_NUM, STEP_NUM).
LIMIT_NUM = 20
STEP_NUM = 3
START_NUM = 7
model_list, coherence_values = compute_coherence_values(dictionary=dictionary,
                                                        corpus=corpus,
                                                        texts=papers['text_corpus'].fillna(''),
                                                        start=START_NUM,
                                                        limit=LIMIT_NUM,
                                                        step=STEP_NUM)
# Show graph
limit = LIMIT_NUM
start = START_NUM
step = STEP_NUM
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The labels must be a sequence of strings: a bare string in parentheses is
# still a string and would be consumed character by character.
plt.legend(["coherence_values"], loc='best')
plt.show()
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In my case, it seems that 19 is the optimal number of topics. I could increase LIMIT_NUM in the future. 

In [None]:
# Keep the candidate with the highest coherence score (`x` holds the topic
# counts that were tried) and persist it.
num_topics, lda_model = get_best_model(model_list, coherence_values,x)
lda_model.save('../Model/lda.model')

### LDA model Visualization

In [None]:
# Imported under an alias so the top-level `gensim` package imported at the
# start of the notebook is not shadowed by pyLDAvis' helper module.
from pyLDAvis import gensim as gensimvis
# Visualise the model selected by get_best_model (`lda_model`).
lda_display = gensimvis.prepare(lda_model, corpus, dictionary, sort_topics=False)
pyLDAvis.show(lda_display)

### Generate Topic Features

In [None]:
def topics_sentences(model, corpus, dataset):
    """Build a table with the dominant topic of every document.

    For each document of ``corpus`` the topic with the highest probability is
    kept (the first one on ties) together with its probability rounded to 4
    decimals and the comma-separated keywords of that topic.  A document for
    which the model reports no topic gets a row of missing values, so rows
    always stay in corpus order.

    The table is built in one go from a list of rows; appending to a DataFrame
    row by row copies the frame on every step and ``DataFrame.append`` no
    longer exists in pandas 2.x.

    Args:
        model: Topic model supporting ``model[corpus]`` and
            ``model.show_topic(topic_id)``.
        corpus: Bag-of-words corpus to classify.
        dataset: DataFrame providing the 'title', 'text_corpus' and 'key'
            columns.
            NOTE(review): the topic rows are numbered 0..n-1 and are joined to
            ``dataset`` on index labels, so ``dataset`` is expected to carry a
            matching 0..n-1 index in corpus order - confirm against the caller.

    Returns:
        DataFrame with the columns 'Document_Number', 'Dominant_Topic',
        'Perc_Contribution', 'Topic_Keywords', 'title', 'text' and 'key'.
        Its summary is also printed via ``DataFrame.info()``.
    """
    contents = dataset[['title','text_corpus','key']]
    keywords_by_topic = {}  # topic id -> keyword string, computed once per topic
    rows = []
    for doc_topics in tqdm(model[corpus]):
        if not doc_topics:
            # Keep a placeholder so later rows do not shift up by one.
            rows.append((np.nan, np.nan, np.nan))
            continue
        topic_num, prob_topic = max(doc_topics, key = lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            wp = model.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
        rows.append((topic_num, round(prob_topic,4), keywords_by_topic[topic_num]))
    df = pd.DataFrame(rows, columns = [0, 1, 2])
    df = pd.concat([df, contents], axis = 1).reset_index()
    df.columns = ['Document_Number','Dominant_Topic','Perc_Contribution', 'Topic_Keywords','title', 'text','key']
    df.info()
    return df

In [None]:
# Dominant topic of every document, printed as a summary and saved to CSV.
topic_per_document = topics_sentences(lda_model, corpus, papers)
topic_per_document.info()
topic_per_document.to_csv(end_path+'topic_per_document.csv',header = True)

In [None]:
def get_topic_distribution(model, corpus):
    """Return the full topic distribution of every document.

    The number of columns is taken from ``model.num_topics`` instead of a
    notebook-level variable, and each probability is stored under the id of
    its topic rather than by its position in the returned list.  Topics the
    model does not report for a document keep the value 0.0.

    The frame is built once from a list of rows; appending to a DataFrame row
    by row copies the frame on every step and ``DataFrame.append`` no longer
    exists in pandas 2.x.

    Args:
        model: Trained LDA model (provides ``num_topics`` and
            ``get_document_topics``).
        corpus: Bag-of-words corpus.

    Returns:
        DataFrame of float64 with one row per document (numbered 0..n-1 in
        corpus order) and one column 'Topic_no_<id>' per topic.
    """
    topic_count = model.num_topics
    columns = ['Topic_no_'+str(x) for x in range(0,topic_count)]
    rows = []
    for document in tqdm(corpus):
        weights = [0.0] * topic_count
        for topic_id, probability in model.get_document_topics(document, minimum_probability=0):
            weights[topic_id] = probability
        rows.append(weights)
    topic_distr_docs = pd.DataFrame(rows, columns = columns, dtype = 'float64')
    return topic_distr_docs

In [None]:
# Per-document topic probabilities for the selected model, with a printed summary.
topic_distr_docs = get_topic_distribution(lda_model, corpus)
topic_distr_docs.info()

In [None]:
# The rows of `papers`, `topic_per_document` and `topic_distr_docs` belong
# together by corpus position, not by index label: `papers` still carries the
# (gapped) row labels of the original file, which equal its 'key' column, while
# both topic tables are numbered 0..n-1.  Concatenating them as they are would
# attach topics to the wrong papers, so the topic tables are first re-labelled
# with the index of the paper they describe.
papers_valid = papers.dropna(subset=['title','abstract'])
if len(topic_distr_docs) != len(papers_valid):
    raise ValueError(
        "topic_distr_docs has %d rows but papers has %d usable rows; "
        "they must describe the same documents"
        % (len(topic_distr_docs), len(papers_valid))
    )
topic_info = topic_per_document.dropna(subset=['Dominant_Topic']).drop(['title'],axis = 1)
# For rows that carry a topic, 'Document_Number' is the 0-based position of the
# document in the corpus (it is the row number assigned in topics_sentences).
topic_info.index = papers_valid.index[topic_info['Document_Number'].astype(int).values]
topic_distribution = topic_distr_docs.copy()
topic_distribution.index = papers_valid.index
papers_fe = pd.concat([papers_valid,topic_info,topic_distribution],axis = 1).dropna(subset=['text','Dominant_Topic'],how = 'any',axis = 0)
# Several inputs share column names ('text', 'key'); keep the first occurrence.
papers_fe = papers_fe.loc[:,~papers_fe.columns.duplicated()]
papers_fe.to_csv(end_path + 'papers_text.csv',header = True)

### Store Topic Infos

In [None]:
# Number of papers per dominant topic and the share of each topic.
Topics_count = papers_fe['Dominant_Topic'].value_counts()
Topic_perc = Topics_count/Topics_count.sum()
# One keyword string per topic, indexed by topic id.
Topic_Keywords = papers_fe[['Dominant_Topic','Topic_Keywords']].drop_duplicates().set_index('Dominant_Topic')
#pd.concat([Topic_Keywords,Topic_perc,Topics_count],axis = 1)
# Keywords, count and share side by side; saved with the topic id as index.
Topics = Topic_Keywords.join(pd.concat([Topics_count,Topic_perc],axis=1))
Topics.columns=['Topic_Keywords','Topic_Count','Topic_perc']
Topics.to_csv('../Data/topics.csv')

### GraphDB: Ingestion to Neo4j

In [None]:
# Re-read the original metadata so this part of the notebook can be run
# without the modelling cells; 'key' is again the original row number.
source_path = '../Data/'
end_path = '../Data/'
filename = 'metadata.csv'
df = pd.read_csv(source_path+filename)
df['key'] = df.index

In the next lines I add the information previously elaborated. 
I think there are (at least) two ways:
* join the data with the original dataset and store it in the DB (this is the method I've chosen at the moment but I still need to study the execution time).
* load Dictionary and the LDA model to find the features for each row in the original dataset.


In [None]:
# Load the per-paper topic features saved earlier; the first CSV column
# becomes the index.
documents = pd.read_csv('../Data/papers_text.csv',index_col = 0)
documents.info()

In [None]:
def _split_authors(raw):
    """Split a ';'-separated author string into a list of clean names.

    A missing value yields an empty list instead of an author called 'nan';
    names are reduced to ASCII, stripped of surrounding whitespace, and empty
    fragments are dropped.
    """
    if pd.isna(raw):
        return []
    names = (name.strip() for name in removeAscendingChar(str(raw)).split(';'))
    return [name for name in names if name]

# Match every metadata row with its topic features: df['key'] is looked up in
# the index of `documents`; overlapping columns coming from `df` get the
# '_orig' suffix.
full_document = df.join(documents,on='key',how='inner',lsuffix='_orig')
full_document['authors'] = full_document['authors'].apply(_split_authors)
full_document.info()

In [None]:
# Load the topic summary saved earlier and display it.
Topics = pd.read_csv('../Data/topics.csv')
Topics

We need to declare the connection to our GraphDB instance.

In [None]:
# Connection to the Neo4j instance.  Both <...> placeholders must be replaced
# before running: the first sits where the URI expects the port number.
remote_graph = Graph("bolt://localhost:<YOUR-LOCAL-HOST>",user = "neo4j",password = "<YOUR-PASSWORD>")

We create the nodes for the topics with a uniqueness constraint.

In [None]:
# Creating a constraint that already exists raises, which would make this cell
# fail on a second run; report the problem and carry on instead.
try:
    remote_graph.schema.create_uniqueness_constraint('Topic','id_topic')
except Exception as exc:
    print("Uniqueness constraint on Topic.id_topic not created:", exc)

# One Topic node per row of the topic summary.  The nodes are merged on their
# primary label/key (rather than created) so re-running the cell updates the
# existing nodes instead of violating the uniqueness constraint.
for index,row in tqdm(Topics.iterrows(), total = len(Topics)):
    a = Node('Topic', id_topic = row.Dominant_Topic, keywords = row.Topic_Keywords)
    a.__primarylabel__ = 'Topic'
    a.__primarykey__ = 'id_topic'
    remote_graph.merge(a)

We define some constraints for the next entities.

In [None]:
# Uniqueness constraints for the remaining entities.  Creation is best-effort
# (a constraint may already exist from a previous run), but failures are
# reported instead of being swallowed by a bare `except`, which would also
# hide KeyboardInterrupt.
for label, property_key in (('Paper','title'), ('Author','name'), ('Journal','name')):
    try:
        remote_graph.schema.create_uniqueness_constraint(label, property_key)
    except Exception as exc:
        print("Uniqueness constraint on %s.%s not created: %s" % (label, property_key, exc))

In [None]:
# The ingestion loop reads its rows from `stage`.
stage = full_document

# Names of the per-topic score columns (those starting with 'Topic_no').
r = re.compile('Topic_no.*')
cols = list(stage.columns)
topic_columns = [name for name in cols if r.match(name)]

In [None]:
# Relationship types used in the graph.
AUTHORED_BY = Relationship.type("AUTHORED_BY")
PUBLISHED_IN = Relationship.type("PUBLISHED_IN")
DOMINANT_TOPIC = Relationship.type("DOMINANT_TOPIC")

matcher = NodeMatcher(remote_graph)

# One Paper node per row, linked to its authors, its journal and its dominant topic.
# NOTE(review): `row.title` / `row.abstract` are the un-suffixed columns, i.e.
# the ones that came from `documents` - presumably the cleaned and stemmed
# text; confirm whether `title_orig` / `abstract_orig` were intended.
# NOTE(review): the property key 'pusblish_date' looks like a typo of
# 'publish_date'; it is kept because existing queries may rely on it.
for index, row in tqdm(stage.iterrows(), total = len(stage)):
    a = Node("Paper", title = row.title, doi = row.doi, abstract = row.abstract, pmcid = row.pmcid, sha = row.sha,
             license = row.license, pusblish_date = row.publish_time,
             has_full_text = row.has_full_text, full_text_file = row.full_text_file
            )
    a.__primarylabel__ = "Paper"
    a.__primarykey__ = "title"
    # Merge the paper on its own first so it exists in the graph even when it
    # has neither authors nor a journal to be linked to.
    remote_graph.merge(a)
    for author in row.authors:
        b = Node("Author", name = author)
        b.__primarylabel__ = "Author"
        b.__primarykey__ = "name"
        remote_graph.merge(AUTHORED_BY(a, b))
    # A missing journal is read as NaN; skip it instead of merging a Journal
    # node whose name is NaN.
    if pd.notna(row.journal):
        c = Node("Journal", name = row.journal, source_x = row.source_x)
        c.__primarylabel__="Journal"
        c.__primarykey__="name"
        remote_graph.merge(PUBLISHED_IN(a,c))
    # Full topic distribution stored on the paper as a list of plain floats.
    a['topic_score'] = [float(row[topic]) for topic in topic_columns]
    remote_graph.push(a)
    d = matcher.match("Topic", id_topic = row.Dominant_Topic).first()
    if d is None:
        # No Topic node with this id: report it and leave the paper unlinked.
        print("No Topic node with id_topic", row.Dominant_Topic, "- paper left without topic:", row.title)
        continue
    remote_graph.merge(DOMINANT_TOPIC(a,d))