## Bibliotecas

In [None]:
import glob
import json

import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from matplotlib import pyplot as plt
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## Funções

In [None]:
def _stripped_or_nan(value):
    """Return ``value`` without surrounding whitespace, or NaN when it is not a string."""
    return value.strip() if isinstance(value, str) else np.nan


def load_json_to_df(json_files, df):
    """Read paper JSON files and append one row per file to ``df``.

    Parameters
    ----------
    json_files : iterable of str
        Paths of the JSON files to read.
    df : pandas.DataFrame
        Frame the new rows are appended to. It is not modified in place.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by one row per file with the columns ``paper_id``,
        ``title`` and ``abstract``. A field missing from a file is stored as
        NaN. When ``json_files`` is empty, ``df`` itself is returned.
    """
    records = []

    for file_name in json_files:
        # JSON is UTF-8 encoded; do not depend on the platform default encoding.
        with open(file_name, encoding='utf-8') as json_data:
            data = json.load(json_data)

        # Tolerate a missing or null 'metadata' entry as well as a missing title.
        metadata = data.get('metadata') or {}

        if 'abstract' in data:
            # The abstract is stored as a list of paragraphs; join them into one text.
            paragraphs = [paragraph['text'] for paragraph in data['abstract']]
            abstract = '\n'.join(paragraphs).strip()
        else:
            abstract = np.nan

        records.append({
            'paper_id': _stripped_or_nan(data.get('paper_id')),
            'title': _stripped_or_nan(metadata.get('title')),
            'abstract': abstract,
        })

    if not records:
        return df

    # Build all new rows at once instead of growing the frame file by file.
    new_rows = pd.DataFrame(records, columns=['paper_id', 'title', 'abstract'])

    if df.empty:
        # Nothing to concatenate with; just keep the column layout of ``df``.
        columns = list(df.columns)
        columns += [name for name in new_rows.columns if name not in columns]
        return new_rows.reindex(columns=columns)

    return pd.concat([df, new_rows], ignore_index=True)

In [None]:
def normalize(sentence):
    """Lower-case ``sentence`` and return its kept lemmas joined by single spaces.

    The text is tokenised with the module-level ``nlp`` pipeline. Stop words,
    number-like tokens, punctuation, whitespace tokens and one-character
    tokens are dropped; every remaining token contributes its lemma.
    """
    def is_noise(token):
        # Tokens that carry no content for the downstream bag-of-words models.
        return (token.is_stop or token.like_num or token.is_punct
                or token.is_space or len(token) == 1)

    lemmas = [str(token.lemma_)
              for token in nlp(sentence.lower())
              if not is_noise(token)]
    return ' '.join(lemmas)

In [None]:
# Attributes collected for every paper (one empty column each)
papers_features = {column: [] for column in ('paper_id', 'title', 'abstract')}

# Turn the dictionary into an empty DataFrame with those columns
df = pd.DataFrame(papers_features)

# Recursively list every .json file below the pdf_json directory
json_filenames = glob.glob(
    'C:/Users/sergi/Documents/Cord-19/document_parses/pdf_json//**/*.json',
    recursive=True,
)

# Load the papers into the DataFrame and persist the result as CSV
df = load_json_to_df(json_filenames, df)
df.to_csv(
    'C:/Users/sergi/Documents/Cord-19/cord_df_reduzido.csv',
    header=True,
    index=False,
)

In [None]:
# Drop rows whose abstract is missing, empty or a duplicate of an earlier one.
# The final copy() makes df_cleared an independent frame, so assigning new
# columns to it later does not raise SettingWithCopyWarning.
df_cleared = (
    df.dropna(subset=['abstract'])
      .loc[lambda frame: frame['abstract'] != '']
      .drop_duplicates(['abstract'])
      .copy()
)

## Pré-processamento do texto 

In [None]:
# Pipeline components that are switched off:
#   tagger = used for the full pipeline
#   parser = dependency parsing (how one word is connected to another)
#   ner    = named-entity recognition
# NOTE(review): normalize() relies on token.lemma_; with some spaCy versions the
# lemmatizer depends on the tagger - confirm lemmas are still populated.
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner'])
nlp.max_length = 200000  # performance; per the spaCy docs, longer texts are rejected

# apply() returns a new Series, so the normalised text has to be stored;
# otherwise the vectorizers further down still see the raw abstracts.
df_cleared = df_cleared.assign(abstract=df_cleared['abstract'].apply(normalize))
df_cleared['abstract']

## TOKENIZAÇÃO E PALAVRAS MAIS FREQUENTES

In [None]:
# Collect the abstracts as a plain list of texts
abstracts = df_cleared['abstract'].tolist()

# Build a bag-of-words vocabulary, dropping English stop words and every term
# that appears in more than 70% of the documents (max_df=0.70).
# Without a cap the vocabulary came to 164,179 words, which made clustering
# difficult, so max_features limits it to 2**12 = 4096 words.
cv = CountVectorizer(max_df=0.70, max_features=2**12, stop_words='english')
word_count_vector = cv.fit_transform(abstracts)

# The 20 most frequent words. vocabulary_ maps term -> column index, so its key
# order says nothing about frequency; rank the columns by their total count.
term_totals = np.asarray(word_count_vector.sum(axis=0)).ravel()
terms_by_column = sorted(cv.vocabulary_, key=cv.vocabulary_.get)
more_freq = [terms_by_column[column] for column in term_totals.argsort()[::-1][:20]]

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def sort_coo(coo_matrix):
    """Return the ``(column, value)`` pairs of ``coo_matrix`` in descending order.

    Pairs are ordered by value first and by column index second, both from
    highest to lowest. ``coo_matrix`` only needs ``col`` and ``data``
    attributes of equal length.
    """
    entries = list(zip(coo_matrix.col, coo_matrix.data))
    entries.sort(key=lambda entry: (entry[1], entry[0]), reverse=True)
    return entries

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the features of the first ``topn`` items to their rounded scores.

    Parameters
    ----------
    feature_names : sequence
        Feature name for every column index.
    sorted_items : list of (int, float)
        ``(column index, score)`` pairs, expected in descending score order.
    topn : int, default 10
        Number of leading items to consider.

    Returns
    -------
    dict
        Feature name -> score rounded to 3 decimals, in the order the features
        first appear. When a feature occurs more than once among the leading
        items, the first occurrence (the highest score for descending input)
        is kept, so the result may hold fewer than ``topn`` entries.
    """
    results = {}
    for idx, score in sorted_items[:topn]:
        # setdefault keeps the first score seen instead of letting a later,
        # lower score for the same feature overwrite it.
        results.setdefault(feature_names[idx], round(score, 3))
    return results

# TF-IDF representation of the abstracts: English stop words and terms present
# in more than 85% of the documents are dropped, vocabulary capped at 2**12 terms.
tfidf = TfidfVectorizer(max_df=0.85, max_features=2**12, stop_words='english')
vectorized = tfidf.fit_transform(abstracts)

# All non-zero (column, score) entries of the corpus matrix, highest score first
sorted_items = sort_coo(vectorized.tocoo())

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
keywords = extract_topn_from_vector(feature_names, sorted_items, 20)

## K-Means

In [None]:
# Elbow method: fit one model per candidate number of clusters and record its
# inertia, then plot inertia against the number of clusters.
k_candidates = range(1, 20)  # shared by the loop and the plot so they cannot drift apart

wcss = []
for n_clusters in k_candidates:
    print(n_clusters)  # progress indicator, each fit can take a while
    kmeans = MiniBatchKMeans(init='k-means++', n_clusters=n_clusters, max_iter=300,
                             n_init=10, random_state=6)
    kmeans.fit(vectorized)
    # inertia = sum of squared distances from the points to their centroid
    wcss.append(kmeans.inertia_)

plt.plot(k_candidates, wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

def optimal_number_of_clusters(wcss, first_k=1):
    """Pick the elbow of a WCSS curve.

    The elbow is the point with the largest perpendicular distance to the
    straight line joining the first and the last point of the curve.

    Parameters
    ----------
    wcss : sequence of float
        Within-cluster sum of squares for consecutive cluster counts.
    first_k : int, default 1
        Number of clusters that ``wcss[0]`` belongs to; ``wcss[i]`` is taken
        to belong to ``first_k + i`` clusters.

    Returns
    -------
    int
        The number of clusters at the elbow.

    Raises
    ------
    ValueError
        If ``wcss`` is empty.
    """
    n_points = len(wcss)
    if n_points == 0:
        raise ValueError('wcss must contain at least one value')
    if n_points == 1:
        # A single point defines no line; it is the only possible answer.
        return first_k

    # End points of the reference line, using the real cluster counts as x.
    x1, y1 = first_k, wcss[0]
    x2, y2 = first_k + n_points - 1, wcss[-1]
    denominator = np.hypot(y2 - y1, x2 - x1)

    distances = []
    for offset, y0 in enumerate(wcss):
        x0 = first_k + offset
        numerator = abs((y2 - y1) * x0 - (x2 - x1) * y0 + x2 * y1 - y2 * x1)
        distances.append(numerator / denominator)

    # argmax returns the first maximum, so ties resolve to the smallest k.
    return first_k + int(np.argmax(distances))

print(optimal_number_of_clusters(wcss))

# Project the TF-IDF matrix onto 9 principal components; the scatter plot of
# the clusters uses only the first two of them.
# random_state matches the seed used for the clustering so the projection is
# reproducible when a randomised SVD solver is selected.
from sklearn.decomposition import PCA
pca = PCA(n_components=9, random_state=6)
X_pca = pca.fit_transform(vectorized.toarray())

### MiniBatchKMeans

In [None]:
from sklearn.cluster import MiniBatchKMeans

# Final clustering with 6 clusters, seeded for reproducibility
kmeans = MiniBatchKMeans(init='k-means++', n_clusters=6, max_iter=300, n_init=10,
                         random_state=6)
y_pred = kmeans.fit_predict(vectorized)

# Scatter plot of the papers on the first two principal components, one colour
# per cluster. x and y are passed by keyword because recent seaborn releases
# no longer accept them positionally.
palette = sns.color_palette('bright', len(set(y_pred)))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y_pred, legend='full', palette=palette)
plt.title('Clustered Covid-19 Papers')

# Overlay the cluster centres, projected with the same PCA
centers_on_pcs = pca.transform(kmeans.cluster_centers_)
plt.scatter(x=centers_on_pcs[:, 0], y=centers_on_pcs[:, 1], s=100, c="k", marker="X")

# Keep the cluster label next to each paper
df_cleared['y'] = y_pred