<a href="https://colab.research.google.com/github/srpantano/mestrado/blob/master/Cord19.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BIBLIOTECAS

In [0]:
import numpy as np
import pandas as pd
import glob
import json
import seaborn as sns
import spacy
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA

# FUNÇÕES

In [0]:
def load_json_to_df(json_files, df):
    """Parse paper JSON files and append one row per file to ``df``.

    For every file, the ``paper_id`` (stripped of surrounding whitespace) and
    the ``abstract`` (the ``text`` of each abstract entry joined with newlines,
    then stripped) are collected. A key that is missing from the JSON document
    is stored as ``np.nan``.

    Parameters
    ----------
    json_files : iterable of str
        Paths of the JSON files to read.
    df : pandas.DataFrame
        Frame the new rows are appended to. It is not modified in place.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by one row per file, with a fresh 0..n-1 index.
        When ``json_files`` is empty, ``df`` itself is returned.
    """
    records = []

    for file_name in json_files:
        # JSON is UTF-8 by specification; relying on the platform default
        # encoding corrupts non-ASCII text on some systems (e.g. Windows).
        with open(file_name, encoding='utf-8') as json_data:
            data = json.load(json_data)

        record = {'paper_id': np.nan, 'abstract': np.nan}

        if 'paper_id' in data:
            # Remove leading/trailing whitespace
            record['paper_id'] = data['paper_id'].strip()

        if 'abstract' in data:
            paragraphs = [section['text'] for section in data['abstract']]
            record['abstract'] = '\n'.join(paragraphs).strip()

        records.append(record)

    if not records:
        return df

    # Build all new rows at once: appending row by row copies the whole frame
    # on every iteration, and DataFrame.append no longer exists in pandas 2.x.
    new_rows = pd.DataFrame.from_records(records)

    if len(df) == 0:
        # Nothing to concatenate with; just honour the column layout of df.
        columns = list(df.columns) + [c for c in new_rows.columns if c not in df.columns]
        return new_rows.reindex(columns=columns)

    return pd.concat([df, new_rows], ignore_index=True)

In [0]:
def normalize(sentence):
    """Return a space-separated string of the unique lemmas found in ``sentence``.

    The text is lower-cased and run through the module-level spaCy pipeline
    ``nlp``. Stop words, number-like tokens, punctuation, whitespace tokens and
    single-character tokens are discarded. Each remaining lemma is kept once,
    in order of first appearance.
    """
    def _is_noise(token):
        # Stop word, numeral, punctuation, whitespace or one-character token
        return (token.is_stop or token.like_num or token.is_punct
                or token.is_space or len(token) == 1)

    lemmas = [token.lemma_ for token in nlp(sentence.lower()) if not _is_noise(token)]

    # dict keys keep insertion order, so this de-duplicates without reordering
    unique_lemmas = dict.fromkeys(lemmas)

    return ' '.join(str(lemma) for lemma in unique_lemmas)

In [0]:
# Attributes to be collected for each paper (one empty column per attribute)
papers_features = {column: [] for column in ('paper_id', 'abstract')}

In [0]:
# Turn the attribute dictionary into an (empty) DataFrame with one column per key
df = pd.DataFrame.from_dict(papers_features)

In [0]:
# Recursively collect every .json file under the document_parses directory.
# NOTE(review): the directory is a hard-coded absolute local path, so this cell
# only works on the original author's machine — consider a configurable base path.
json_filenames = glob.glob(f'{"C:/Users/sergi/Documents/Cord-19/document_parses"}//**/*.json', recursive = True)

In [0]:
# Load every collected JSON file into the DataFrame (one row per file)
df = load_json_to_df(json_filenames, df)

In [0]:
#df = pd.read_csv('C:/Users/sergi/Documents/Cord-19/cord_df_reduzido.csv')

#df.to_csv('C:/Users/sergi/Documents/Cord-19/cord_df_reduzido.csv', index=False, header=True)

In [0]:
# Heatmap of df.notnull(): shows, per row and column, whether a value is present.
# The trailing ';' suppresses the text repr of the returned Axes in the notebook.
sns.heatmap(df.notnull(), cmap="Blues");

# Pré-processamento da base

In [0]:
# Keep only rows with a usable abstract: drop missing abstracts, then empty
# strings, then repeated abstracts (the first occurrence is kept).
df_cleared = (
    df
    .dropna(subset=['abstract'])
    .loc[lambda frame: frame['abstract'] != '']
    .drop_duplicates(['abstract'])
)

In [0]:
# Load the small English spaCy model with the tagger, parser and NER components disabled.
# NOTE(review): normalize() reads token.lemma_; on spaCy v3 the English lemmatizer
# presumably depends on tagger output — confirm lemmas are still produced as intended.
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner'])
# Replace every abstract with its normalized form (see normalize)
df_cleared = df_cleared.assign(abstract=df_cleared['abstract'].apply(normalize))

# Análise da base de Abstracts

In [0]:
# Number of words in each normalized abstract.
# str.split() without an argument is used on purpose: split(" ") returns ['']
# for an empty string, which would count an empty abstract as one word.
df_cleared['word_count'] = df_cleared['abstract'].apply(lambda x: len(str(x).split()))
print("Mediana de quantidade de palavras no Abstract por artigo: " + str(df_cleared['word_count'].median()))
print(df_cleared['word_count'].describe())

In [0]:
# Distribution of the number of words per abstract: histogram followed by a boxplot
x = df_cleared['word_count']
n_bins = 20
plt.hist(x, bins=n_bins)
plt.xlabel('Número de palavras no Abstract')
plt.ylabel('Frequência')
plt.show()

boxplot = df_cleared.boxplot(column=['word_count'])

# Keep abstracts with more than 10 and fewer than 400 words (despite the name,
# the upper bound is 400). .copy() makes this an independent frame, so adding a
# column to it later does not raise pandas' SettingWithCopyWarning.
df_wc700 = df_cleared.loc[(df_cleared['word_count'] > 10) & (df_cleared['word_count'] < 400)].copy()
print("Mediana de quantidade de palavras no Abstract por artigo: " + str(df_wc700['word_count'].median()))
print(df_wc700['word_count'].describe())

# TOKENIZAÇÃO E PALAVRAS MAIS FREQUENTES

In [0]:
# Get the filtered abstracts as a plain Python list of texts
abstracts = df_wc700['abstract'].tolist()

In [0]:
# Build a bag-of-words vocabulary and the document-term count matrix for the abstracts.
# NOTE(review): earlier notes here described dropping words present in more than 85% of
# the documents, removing English stop words, and capping the vocabulary with
# max_features (an unrestricted run reportedly returned 164,179 words, which made
# clustering hard). None of those options is active: the call below uses the defaults.
# Previous configuration, kept for reference:
#cv=CountVectorizer(max_df=0.50, max_features=2**20, stop_words = 'english')
cv=CountVectorizer()
word_count_vector=cv.fit_transform(abstracts)

In [0]:
# Total occurrences of every vocabulary term, summed over all documents
sum_words = word_count_vector.sum(axis=0)

# (term, total count) pairs, most frequent first
words_freq = sorted(
    ((term, sum_words[0, column]) for term, column in cv.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

In [0]:
# The ten most frequent words, as (word, count) pairs
most_freqs = words_freq[:10]

# TF-IDF

In [0]:
# Sort the entries of a COO matrix
def sort_coo(coo_matrix):
    """Return the (column, value) pairs of ``coo_matrix`` in descending order.

    Pairs are ordered by value first and by column index second, both from
    highest to lowest. Only the ``col`` and ``data`` attributes are read.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs


# Extract the most relevant terms
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the best-scoring features to their scores.

    Parameters
    ----------
    feature_names : sequence of str
        Feature name for every column index.
    sorted_items : sequence of (int, float)
        (column index, score) pairs, expected in descending score order
        (as produced by ``sort_coo``).
    topn : int, default 10
        Number of leading pairs of ``sorted_items`` to consider.

    Returns
    -------
    dict
        ``{feature name: score rounded to 3 decimals}``. When a feature shows up
        more than once among the considered pairs, the first (highest) score is
        kept; overwriting it with later entries would report the lowest score
        of a "top" feature instead.
    """
    results = {}

    for idx, score in sorted_items[:topn]:
        name = feature_names[idx]
        if name not in results:
            results[name] = round(score, 3)

    return results

# TF-IDF weights for every abstract (rows: documents, columns: vocabulary terms)
tfidf = TfidfVectorizer()
vectorized = tfidf.fit_transform(abstracts)

# Every non-zero (column, weight) entry of the matrix, highest weight first
sorted_items=sort_coo(vectorized.tocoo())

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method on releases that do not have it yet.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = list(tfidf.get_feature_names_out())
else:
    feature_names = tfidf.get_feature_names()
# NOTE(review): 42215 is a hard-coded limit — presumably the vocabulary size of
# the original run; confirm and derive it from the data if so.
keywords=extract_topn_from_vector(feature_names, sorted_items, 42215)

# For each document, the terms with a non-zero TF-IDF weight
more_import_words = tfidf.inverse_transform(vectorized)

# K-Means

In [0]:
# Elbow method: fit MiniBatchKMeans for k = 1..19 and record the inertia of each fit
wcss = []
for i in range(1, 20):
    # Progress indicator: the k currently being fitted
    print(i)
    kmeans = MiniBatchKMeans(init='k-means++', n_clusters=i, max_iter=300, n_init=10, 
                         random_state=6)
    kmeans.fit(vectorized)
    wcss.append(kmeans.inertia_) # inertia = sum of squared distances from the points to their centroid

# Inertia (WCSS) as a function of the number of clusters
plt.plot(range(1, 20), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [0]:
#Calcula o número ideal de clusters
def optimal_number_of_clusters(wcss, first_k=1):
    """Locate the elbow of a WCSS curve.

    The elbow is the point of the curve that lies farthest from the straight
    line joining its first and last points.

    Parameters
    ----------
    wcss : sequence of float
        Within-cluster sum of squares for consecutive cluster counts, i.e.
        ``wcss[j]`` belongs to ``first_k + j`` clusters.
    first_k : int, default 1
        Number of clusters that ``wcss[0]`` was computed for.

    Returns
    -------
    int
        Number of clusters at the elbow. With a single measurement that
        measurement's cluster count is returned.
    """
    n_points = len(wcss)

    # End points of the reference line. The x coordinates are the real cluster
    # counts, derived from the data instead of being hard-coded.
    x1, y1 = first_k, wcss[0]
    x2, y2 = first_k + n_points - 1, wcss[-1]

    if n_points == 1:
        # Both end points coincide: there is no line to measure against
        return first_k

    denominator = np.sqrt((y2 - y1)**2 + (x2 - x1)**2)

    distances = []
    for offset, y0 in enumerate(wcss):
        x0 = first_k + offset
        # Perpendicular distance from (x0, y0) to the line through the end points
        numerator = abs((y2-y1)*x0 - (x2-x1)*y0 + x2*y1 - y2*x1)
        distances.append(numerator/denominator)

    return distances.index(max(distances)) + first_k

# Show the number of clusters suggested by the elbow heuristic
print(optimal_number_of_clusters(wcss))

In [0]:
# Project the TF-IDF vectors onto 9 principal components for plotting
# (the scatter plots further down use only the first two).
# NOTE(review): toarray() materializes the whole sparse matrix as a dense array,
# which can need a lot of memory for a large corpus.
pca = PCA(n_components = 9)
X_pca = pca.fit_transform(vectorized.toarray())

MiniBatchKMeans

In [0]:
# Final clustering: 9 clusters with a fixed random_state
kmeans = MiniBatchKMeans(init='k-means++', n_clusters=9, max_iter=300, n_init=10, 
                         random_state=6)

# Fit on the TF-IDF matrix and keep the cluster label of every abstract
y_pred = kmeans.fit_predict(vectorized)

In [0]:
# One wide figure (height = half the width) holding a single Axes
fig, ax = plt.subplots(figsize=plt.figaspect(0.5))

# Cluster centres expressed in the same principal-component space as X_pca
centers_on_pcs = pca.transform(kmeans.cluster_centers_)

# Documents on the first two components, coloured by predicted cluster;
# the centres are drawn afterwards so they stay on top.
ax.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans.predict(vectorized))
ax.scatter(centers_on_pcs[:, 0], centers_on_pcs[:, 1], marker='o', s=150, edgecolor='k')
plt.show()

In [0]:
# Show the most important words of each cluster
#https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html
# For every centroid, the feature indices ordered from highest to lowest weight
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1] # [:, ::-1] reverses each row

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method on releases that do not have it yet.
if hasattr(tfidf, 'get_feature_names_out'):
    terms = list(tfidf.get_feature_names_out())
else:
    terms = tfidf.get_feature_names()
# Iterate over the centroids that actually exist instead of a hard-coded count
for i in range(order_centroids.shape[0]):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :5]:
        print(' %s' % terms[ind], end='')
    print()

In [0]:
# Scatter plot of the clusters on the first two principal components
palette = sns.color_palette('bright', len(set(y_pred)))
# x and y are passed by keyword: recent seaborn releases no longer accept the
# data vectors as positional arguments.
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y_pred, legend='full', palette=palette)
plt.title('Clustered Covid-19 Papers');
# Cluster centres projected into the same space, drawn as black crosses
centers_on_pcs = pca.transform(kmeans.cluster_centers_)
plt.scatter(x=centers_on_pcs[:,0], y=centers_on_pcs[:,1], s=100, c="k", marker="X")
print(kmeans.labels_)
# Store the cluster label of every abstract next to its text
df_wc700['y'] = y_pred

In [0]:
# Pairwise Euclidean distances between the cluster centroids
from sklearn.metrics.pairwise import euclidean_distances
dists = euclidean_distances(kmeans.cluster_centers_)