# Trabalho BI - Tópicos Avançados - Alocação Latente de Dirichlet (LDA)

## Análise de tópicos utilizando a base de serviços das reclamações fundamentadas que foram audiência no ano de 2017.

In [None]:
import numpy as np
import pandas as pd
import gensim
import nltk


# Sklearn
from sklearn.decomposition import LatentDirichletAllocation #TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer #, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
#from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
#import matplotlib.colors as mcolors

%matplotlib inline

## Importando os Dados

In [None]:
# Load the source dataset: a semicolon-separated CSV encoded as windows-1252.
# NOTE(review): the path is an absolute location on one user's machine; adjust it before running elsewhere.
data = pd.read_csv(
    r'C:\Users\rfsantos\OneDrive - Mongeral Aegon\2020\Modulos\BI\NLP\Trabalho\Arquivos fonte finais\base reclamacoes Procon.csv',
    sep=';',
    encoding='windows-1252',
    keep_default_na=True,
)

## Pré-processamento

In [None]:
# Every row of the "serviço" column is treated as one document.
# Note that this rebinds `data` from the DataFrame to a plain list.
data = data["serviço"].tolist()
print("Temos %d documentos." % len(data))

In [None]:
# Tokenise the documents
def sent_to_words(sentences):
    """Lazily yield one list of tokens per item of ``sentences``.

    Each item is converted with ``str()`` and handed to
    ``gensim.utils.simple_preprocess``; ``deacc=True`` asks gensim to strip
    accent marks from the resulting tokens.
    """
    preprocess = gensim.utils.simple_preprocess
    yield from (preprocess(str(sentence), deacc=True) for sentence in sentences)


# Materialise the generator so the token lists can be reused by later cells
data_words = list(sent_to_words(data))

# Preview the first 50 tokenised documents
print(data_words[:50])

In [None]:
# Stopword removal
def removeStops(texts, stopwords):
    """Drop stopwords from tokenised documents and re-join each one into a string.

    Args:
        texts: iterable of documents, each an iterable of string tokens.
        stopwords: iterable of tokens to discard.

    Returns:
        A list with one space-joined string per input document, in the same
        order. A document made only of stopwords becomes an empty string.
    """
    # Build the lookup set once: membership tests on a list are a linear scan per token.
    stop_set = frozenset(stopwords)
    return [
        " ".join(token for token in sent if token not in stop_set)
        for sent in texts
    ]


# Portuguese stopwords from NLTK, extended with informal/abbreviated forms
stopwords = nltk.corpus.stopwords.words('portuguese')
stopwords += ["nao", "so", "pra", "pro", "pras", "pros", "etc", "outros"]

# The documents were tokenised with deacc=True, so their tokens carry no accent
# marks, while the NLTK list keeps them ("você", "já", "também", ...). Add the
# de-accented variant of every stopword so those tokens are actually removed.
stopwords = sorted(set(stopwords) | {gensim.utils.deaccent(word) for word in stopwords})

data_without_stops = removeStops(data_words, stopwords)

# Preview the first 50 documents after stopword removal
print(data_without_stops[:50])

## Criando a matriz Documento-Palavra

In [None]:
# Bag-of-words vectorizer used to build the document-term matrix
vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=True,                   # lower-case the text before tokenising
    token_pattern='[a-zA-Z0-9]{3,}',  # a token is a run of at least 3 ASCII letters/digits
    min_df=10,                        # ignore terms that appear in fewer than 10 documents
)

# Fit the vocabulary and produce the document-term count matrix
data_vectorized = vectorizer.fit_transform(data_without_stops)

## Usando GridSearch para encontrar melhor modelo LDA


In [None]:
# Search grid: number of topics and learning-rate decay
search_params = {'n_components': [5, 10, 15], 'learning_decay': [.5, .7, .9]}

# Initialise the model.
# learning_decay is only used by the online variational Bayes update; with the
# default learning_method='batch' every learning_decay value would fit the same
# model, so the online method is selected explicitly. A fixed random_state makes
# the comparison between grid points reproducible.
lda = LatentDirichletAllocation(learning_method='online', random_state=42)

# Initialise the grid search; with no `scoring` given, the estimator's own
# score() (approximate log-likelihood) is used to rank the candidates.
model = GridSearchCV(lda, param_grid=search_params)

# Run the grid search
model.fit(data_vectorized)

## Escolhendo o "melhor" modelo

In [None]:
# Best model: the estimator selected by the grid search
best_lda_model = model.best_estimator_

# Hyper-parameter combination of the selected model
print("Melhores parâmetros: ", model.best_params_)

# Best score reported by the grid search (labelled as a log-likelihood in the output)
print("Melhor score de probabilidade logarítmica: ", model.best_score_)

# Perplexity of the selected model, evaluated on the same matrix used for fitting
print("Perplexidade do modelo: ", best_lda_model.perplexity(data_vectorized))

## Comparando os scores de performance dos modelos LDA

In [None]:
# Tabulate the grid-search results
results = pd.DataFrame(model.cv_results_)

# One colour per distinct learning_decay value, so the palette keeps matching
# the hue levels if the search grid is changed.
n_decay_values = results['param_learning_decay'].nunique()
current_palette = sns.color_palette("Set2", n_decay_values)

plt.figure(figsize=(12, 8))

# Mean test score against the number of topics, one line per learning_decay
sns.lineplot(data=results,
             x='param_n_components',
             y='mean_test_score',
             hue='param_learning_decay',
             palette=current_palette,
             marker='o'
            )

plt.show()

## Tópico dominante em cada documento

In [None]:
# Create the document-topic matrix (one row per document, one column per topic)
lda_output = best_lda_model.transform(data_vectorized)

# Column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]

# Index names: one label per row of the document-topic matrix
docnames = ["Doc" + str(i) for i in range(lda_output.shape[0])]

# Build the DataFrame with weights rounded to 2 decimals for display
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Dominant topic of each document, taken from the unrounded weights: rounding
# first can create artificial ties that argmax would resolve to the wrong topic.
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    """Return a CSS colour declaration: green when ``val`` exceeds 0.1, black otherwise."""
    if val > .1:
        shade = 'green'
    else:
        shade = 'black'
    return 'color: {col}'.format(col=shade)

def make_bold(val):
    """Return a CSS font-weight declaration: 700 when ``val`` exceeds 0.1, 400 otherwise."""
    if val > .1:
        thickness = 700
    else:
        thickness = 400
    return 'font-weight: {weight}'.format(weight=thickness)

# Apply the styles to the topic-weight columns only. The 'dominant_topic'
# column holds integer topic indices, so the > 0.1 threshold used by the
# style functions is meaningless there (every index >= 1 would be highlighted).
df_document_topics = (
    df_document_topic.style
    .applymap(color_green, subset=topicnames)
    .applymap(make_bold, subset=topicnames)
)
df_document_topics_first10 = (
    df_document_topic[:10].style
    .applymap(color_green, subset=topicnames)
    .applymap(make_bold, subset=topicnames)
)
# Display the styled first 10 documents as the cell output
df_document_topics_first10

## Quantidade de Documentos em Cada Tópico

In [None]:
# Number of documents whose dominant topic is each topic, most frequent first
df_topic_distribution = (
    df_document_topic['dominant_topic']
    .value_counts()
    .rename_axis('Topic Num')
    .reset_index(name='Num Documents')
)
df_topic_distribution

## Visualizando o modelo LDA com o pyLDAvis

In [None]:
# Enable pyLDAvis rendering inside the notebook
pyLDAvis.enable_notebook() 
# Prepare the visualisation from the selected model, the document-term matrix
# and the fitted vectorizer; mds='tsne' is passed through to pyLDAvis
# (presumably selecting t-SNE for the topic map layout — confirm in its docs).
panel = pyLDAvis.sklearn.prepare(best_lda_model, data_vectorized, vectorizer, mds='tsne')
# Save the visualisation to 'lda.html' as well as displaying it in the cell output
pyLDAvis.save_html(panel, 'lda.html') 
pyLDAvis.display(panel)

## Top 10 palavras por tópico

In [None]:
# Vocabulary in column order of the document-term matrix.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()

# Maps topic index -> list of its top words, highest weight first
topic_words = {}
n_top_words = 10

for topic, comp in enumerate(best_lda_model.components_):
    # np.argsort returns the indices that would sort `comp` in ASCENDING order,
    # e.g. np.argsort([3, 7, 1, 0, 3, 6]) -> [3, 2, 0, 4, 5, 1].
    # Reversing that gives indices ordered from the highest to the lowest
    # weight; the first n_top_words of them are the topic's top words.
    word_idx = np.argsort(comp)[::-1][:n_top_words]

    # Store the words most relevant to the topic
    topic_words[topic] = [vocab[i] for i in word_idx]

    
# Draw one word cloud per topic from its top words
for topic in topic_words:
    # Comma-separated text fed to the word-cloud generator
    words = ', '.join(topic_words[topic])

    cloud1 = WordCloud(
        background_color='black',
        width=1600,
        height=800,
        max_font_size=200,
        max_words=20,
        collocations=False,
    )
    cloud1.generate(words)

    # One figure per topic, titled with the topic index and without axes
    plt.figure(figsize=(20, 10))
    plt.imshow(cloud1, interpolation='bilinear')
    plt.title('Tópico ' + str(topic), fontsize=20)
    plt.axis('off')
    plt.tight_layout(pad=0)

In [None]:
# Show `words`: after the word-cloud loop it holds the comma-separated top words of the last topic
words