In [None]:
import numpy as np 
import pandas as pd

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import scispacy
import spacy
import en_core_sci_lg

from scipy.spatial.distance import jensenshannon

import joblib

from IPython.display import HTML, display

from ipywidgets import interact, Layout, HBox, VBox, Box
import ipywidgets as widgets
from IPython.display import clear_output

from tqdm import tqdm
from os.path import isfile

import seaborn as sb
import matplotlib.pyplot as plt

In [None]:
import json

# Publications per disease, read from a '#'-separated file. Later cells use its
# "doid" and "id" columns.
pubs = pd.read_csv('data/pubs_by_disease.csv', sep='#')
# Distinct values of the "doid" column; other cells pick sub-diseases by position (doids[0], doids[1], ...).
doids=pubs["doid"].unique()

# Article table (gastric cancer dataset, per the file name), also '#'-separated.
# Later cells read its "id" and "title" columns.
df = pd.read_csv('data/dataset_gastric_cancer.csv', sep='#')

In [None]:
# Load the scispaCy model with the tagger, parser and NER components switched off.
nlp = en_core_sci_lg.load(
    disable=["tagger", "parser", "ner"],
)
# Raise the pipeline's maximum accepted text length to 3,000,000.
nlp.max_length = 3_000_000

def spacy_tokenizer(sentence):
    """Return the lemmas of the tokens of ``sentence`` that survive filtering.

    The text is run through the module-level ``nlp`` pipeline. A token is
    dropped when it looks like a number, is a stop word, is punctuation,
    is whitespace, or has length 1; the lemma of every other token is kept,
    in document order.
    """
    lemmas = []
    for token in nlp(sentence):
        # Same checks, in the same order, as a single short-circuiting `or` chain.
        if token.like_num or token.is_stop or token.is_punct or token.is_space:
            continue
        if len(token) == 1:
            continue
        lemmas.append(token.lemma_)
    return lemmas

# NOTE(review): joblib.load unpickles arbitrary Python objects; only load these
# artifacts from a trusted source. Despite the '.csv' suffix, the vectorizer and
# lda files are read with joblib.load, not parsed as CSV.
vectorizer = joblib.load('reduced/vectorizer.csv')
#data_vectorized = joblib.load('reduced/data_vectorized.csv')
lda = joblib.load('reduced/lda.csv') 
# Topic distribution table; later cells select its rows with boolean masks built
# from df["id"], so it presumably has one row per row of `df`, in the same order — TODO confirm.
doc_topic_dist = pd.read_csv('reduced/doc_topic_dist.csv')  



In [None]:
def print_top_words(model, vectorizer, n_top_words):
    """Print the highest-weighted words of every topic of a fitted topic model.

    Parameters
    ----------
    model
        Object exposing ``components_``: one row of word weights per topic.
    vectorizer
        Object exposing ``get_feature_names_out()`` (scikit-learn >= 1.0) or the
        legacy ``get_feature_names()``; position ``i`` of the returned sequence
        names column ``i`` of ``model.components_``.
    n_top_words : int
        Number of words printed per topic.

    Returns
    -------
    None. One line per topic is written to stdout, followed by a blank line.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # accessor and fall back only for vectorizers that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so walk it backwards to get the largest weights first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        message = "\nTopic #%d: " % topic_idx
        message += " ".join(feature_names[i] for i in top_indices)
        print(message)
    print()

# Print the 25 highest-weighted words of each topic of the currently loaded `lda`,
# using the currently loaded `vectorizer` for the word names.
print_top_words(lda, vectorizer, n_top_words=25)

In [None]:
# Probability of a single article belonging to each of the topics, as a bar chart.
def plot_article_dna(paper_id, width=20):
    """Plot the topic distribution of one paper as a bar chart.

    Parameters
    ----------
    paper_id
        Value looked up in ``df["id"]``.
    width : int, default 20
        Figure width; the height is fixed at 4.

    Returns
    -------
    The matplotlib Axes the bars were drawn on.

    Raises
    ------
    IndexError
        If ``paper_id`` does not occur in ``df["id"]``.
    """
    mask = df["id"] == paper_id
    if not mask.any():
        # Same exception type as the unguarded `.values[0]` lookup, but with a usable message.
        raise IndexError("paper id %r not found in df['id']" % (paper_id,))
    title = df.loc[mask, "title"].values[0]

    # NOTE(review): the mask is built from `df` but applied to `doc_topic_dist`;
    # this assumes both frames share the same row index — confirm.
    ax = doc_topic_dist[mask].T.plot(kind='bar', legend=None, title=title, figsize=(width, 4))
    ax.set_xlabel('Topic')
    return ax

def dna_tabs(paper_ids):
    """Display a tab widget with one tab per paper, each showing that paper's topic chart.

    Parameters
    ----------
    paper_ids
        Iterable of paper ids accepted by ``plot_article_dna``. It is copied
        into a list first, so any iterable (list, pandas Series, ...) works.

    Returns
    -------
    None. The widget is rendered with ``display``; tabs are titled
    'Paper 1', 'Paper 2', ... in input order.
    """
    # Materialize once: positional access on e.g. a pandas Series would be label-based.
    paper_ids = list(paper_ids)
    outs = [widgets.Output() for _ in paper_ids]

    tab = widgets.Tab(children=outs)
    for i in range(len(outs)):
        tab.set_title(i, 'Paper ' + str(i + 1))
    display(tab)

    for out, paper_id in zip(outs, paper_ids):
        # Everything drawn inside the context manager is captured by this tab's Output widget.
        with out:
            plot_article_dna(paper_id)
            # plt.show() accepts no figure/axes argument (its only parameter is
            # the keyword `block`), so it is called without arguments.
            plt.show()

In [None]:
# Papers of the sub-disease doids[0]
papers = pubs[pubs["doid"] == doids[0]]

# Ids of all papers of this sub-disease, first row included, as a plain list.
lista = papers["id"].tolist()

dna_tabs(lista)

In [None]:
# Papers of the sub-disease doids[1]
papers = pubs[pubs["doid"] == doids[1]]

# Ids of all papers of this sub-disease, first row included, as a plain list.
lista = papers["id"].tolist()

dna_tabs(lista)

In [None]:
# Papers of the sub-disease doids[25]
papers = pubs[pubs["doid"] == doids[25]]

# Ids of all papers of this sub-disease, first row included, as a plain list.
lista = papers["id"].tolist()

dna_tabs(lista)

Agora vamos usar um modelo com algum pré-tratamento: remoção das palavras muito frequentes e das pouco frequentes (sabemos que isto pode ter resultados negativos, mas decidimos testar na mesma).

In [None]:
# NOTE(review): the vectorizer load below is commented out, so `vectorizer` keeps
# whatever an earlier cell loaded while `lda` and `doc_topic_dist` are rebound here.
# If the two models were fit on different vocabularies, the word names will not
# match lda.components_ — confirm which vectorizer belongs to this model.
# NOTE(review): joblib.load unpickles arbitrary objects; load only trusted files.
#vectorizer = joblib.load('not_baseline/vectorizer.csv')
data_vectorized = joblib.load('baseline100/data_vectorized.csv')
lda = joblib.load('baseline100/lda.csv') 
# Rebinding `doc_topic_dist` changes what plot_article_dna draws from this point on.
doc_topic_dist = pd.read_csv('baseline100/doc_topic_dist.csv') 

In [None]:
# Print the 5 highest-weighted words of each topic of the currently loaded `lda`.
# NOTE(review): `vectorizer` is not reloaded in the preceding cell — confirm it matches this `lda`.
print_top_words(lda, vectorizer, n_top_words=5)


In [None]:
# Papers of the sub-disease doids[0], plotted with the currently loaded doc_topic_dist
papers = pubs[pubs["doid"] == doids[0]]

# Fill the list with this sub-disease's paper ids; an empty list would make
# dna_tabs render a widget with no tabs at all.
lista = papers["id"].tolist()

dna_tabs(lista)

In [None]:
# Papers of the sub-disease doids[1], plotted with the currently loaded doc_topic_dist
papers = pubs[pubs["doid"] == doids[1]]

# Ids of all papers of this sub-disease, first row included, as a plain list.
lista = papers["id"].tolist()

dna_tabs(lista)

In [None]:
# Papers of the sub-disease doids[25], plotted with the currently loaded doc_topic_dist
papers = pubs[pubs["doid"] == doids[25]]

# Fill the list with this sub-disease's paper ids; an empty list would make
# dna_tabs render a widget with no tabs at all.
lista = papers["id"].tolist()

dna_tabs(lista)

In [28]:
def pretty(doid, frequencia, indent=0):
    """Print the non-zero topic counts of one sub-disease.

    ``frequencia`` is flattened and each position is used as the topic
    number; entries equal to 0 are skipped. A header line with ``doid`` is
    printed first and an empty line last. ``indent`` is accepted but unused.
    """
    # Flatten before printing anything, so a bad `frequencia` fails before the header.
    counts = frequencia.flatten()
    print("Subdoença " + str(doid))
    for topic, count in enumerate(counts):
        if count == 0:
            continue
        print('Tópico ' + str(topic) + " ------ " + str(count))
    print()

# For every sub-disease, count how many of its papers have each topic as their
# dominant (highest-probability) topic, then print the non-zero counts.
# Size the counter from the loaded table instead of a hard-coded 100.
# Assumes doc_topic_dist has exactly one column per topic — TODO confirm.
n_topics = doc_topic_dist.shape[1]

for doid in doids:
    frequencia = np.zeros(n_topics)
    paper_ids = pubs.loc[pubs["doid"] == doid, "id"]
    for paper_id in paper_ids:  # not `id`, so the builtin is not shadowed
        distancias = doc_topic_dist[df["id"] == paper_id]
        if distancias.empty:
            # The paper has no matching row, so there is nothing to count.
            # This is the only case skipped; any other error now surfaces
            # instead of being swallowed by a bare `except`.
            continue
        # Use the first matching row only: argmax over several rows would
        # return a flattened position, not a topic number.
        max_indice = int(np.argmax(distancias.to_numpy()[0]))
        frequencia[max_indice] += 1
    pretty(doid, frequencia)

Subdoença 4716
Tópico 2 ------ 4.0
Tópico 3 ------ 8.0
Tópico 6 ------ 1.0
Tópico 16 ------ 2.0
Tópico 19 ------ 6.0
Tópico 28 ------ 1.0
Tópico 37 ------ 10.0
Tópico 41 ------ 3.0
Tópico 42 ------ 1.0
Tópico 44 ------ 1.0
Tópico 47 ------ 1.0
Tópico 58 ------ 1.0
Tópico 59 ------ 1.0
Tópico 65 ------ 1.0
Tópico 73 ------ 1.0
Tópico 87 ------ 4.0
Tópico 89 ------ 3.0
Tópico 94 ------ 1.0

Subdoença 8025
Tópico 2 ------ 3.0
Tópico 3 ------ 10.0
Tópico 9 ------ 1.0
Tópico 16 ------ 1.0
Tópico 17 ------ 3.0
Tópico 27 ------ 1.0
Tópico 37 ------ 16.0
Tópico 38 ------ 1.0
Tópico 48 ------ 1.0
Tópico 67 ------ 2.0
Tópico 86 ------ 2.0
Tópico 87 ------ 6.0
Tópico 89 ------ 2.0
Tópico 94 ------ 1.0

Subdoença 10538
Tópico 3 ------ 11.0
Tópico 4 ------ 1.0
Tópico 8 ------ 1.0
Tópico 15 ------ 3.0
Tópico 20 ------ 2.0
Tópico 22 ------ 1.0
Tópico 27 ------ 3.0
Tópico 37 ------ 2.0
Tópico 38 ------ 2.0
Tópico 39 ------ 2.0
Tópico 44 ------ 1.0
Tópico 48 ------ 2.0
Tópico 49 ------ 2.0
Tópico 53 --