## Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# remember to install "LeWagon_FinalProject" package
from LeWagon_FinalProject.data import DataProcessor

# for LDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# for NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

from sklearn.model_selection import GridSearchCV

## Setup "LeWagon_FinalProject" Environment

### Create pyenv

In [None]:
# pyenv virtualenv LeWagon_FinalProject
# pyenv local LeWagon_FinalProject
# pip install --upgrade pip
#pip install -r https://gist.githubusercontent.com/krokrob/53ab953bbec16c96b9938fcaebf2b199/raw/9035bbf12922840905ef1fbbabc459dc565b79a3/minimal_requirements.txt

### Install project package

In [None]:
# Install LeWagon_FinalProject package
#!pip install -e .

### NLTK data

In [None]:
# Download nltk data
#import nltk
#nltk.download('wordnet')
#nltk.download('punkt')
#nltk.download('stopwords')

## Data

In [3]:
# Configure the project's DataProcessor with the raw-data folder and the
# 'articles1' dataset name.
dp = DataProcessor(csv_path='../raw_data/', csv_name='articles1')

# One-off preprocessing step: uncomment and run it once if the processed
# dataset has not been generated yet.
#dp.process_data()

# Load the already-processed dataset into the working frame.
df = dp.load_dataset_processed()

# Sanity check: print the dimensions, then display the first rows.
print(df.shape)
df.head()

(50000, 9)


Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,content
0,0,17283,house republican fret winning health care suit...,New York Times,Carl Hulse,2016-12-31,2016,12,washington congressional republican new fear c...
1,1,17284,rift officer resident killing persist south br...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017,6,bullet shell get counted blood dry votive cand...
2,2,17285,tyrus wong ‘ bambi artist thwarted racial bias...,New York Times,Margalit Fox,2017-01-06,2017,1,walt disney bambi opened 1942 critic praised s...
3,3,17286,among death 2016 heavy toll pop music new york...,New York Times,William McDonald,2017-04-10,2017,4,death may great equalizer necessarily evenhand...
4,4,17287,kim jong un say north korea preparing test lon...,New York Times,Choe Sang-Hun,2017-01-02,2017,1,seoul south korea north korea leader kim said ...


## Base Model - NMF and LDA using sklearn

In [None]:
# Background material for this section:
# https://kitt.lewagon.com/camps/582/lectures/05-ML%2F10-Natural-Language-Processing#source
# https://kitt.lewagon.com/camps/582/lectures/06-Deep-Learning%2F05-Natural-Language-Processing#source
# https://shravan-kuchkula.github.io/topic-modeling/#interactive-data-visualization-showing-relation-between-clustering-sentiment-and-topics

# Earlier configurations, kept for reference: default vectorizers ...
#count_vectorizer = CountVectorizer()
#tfidf_vectorizer = TfidfVectorizer()

# ... and unigram-only vectorizers.
#count_vectorizer = CountVectorizer(min_df=10, max_df=0.95, ngram_range=(1,1), stop_words='english')
#tfidf_vectorizer = TfidfVectorizer(min_df=10, max_df=0.95, ngram_range=(1,1), stop_words='english')

# Active configuration: bigrams only (ngram_range=(2,2)), with min_df=10,
# max_df=0.95 and English stop words removed.
# The count matrix feeds LDA, the TF-IDF matrix feeds NMF.
count_vectorizer = CountVectorizer(min_df=10, max_df=0.95, ngram_range=(2,2), stop_words='english')
tfidf_vectorizer = TfidfVectorizer(min_df=10, max_df=0.95, ngram_range=(2,2), stop_words='english')

# calculate the feature matrices; the content column is cast to unicode
# strings first so both vectorizers receive str input
feature_matrix = count_vectorizer.fit_transform(df['content'].astype('U').values)
tfidf_feature_matrix = tfidf_vectorizer.fit_transform(df['content'].astype('U').values)

# Shapes of the count and TF-IDF matrices, for a quick comparison.
print(feature_matrix.shape)
print(tfidf_feature_matrix.shape)

### NMF

#### Functions

In [None]:
#####################################
## Utility functions to help with NMF
# Code adapted from Sarkar text book
#####################################

# get topics with their terms and weights
def get_topics_terms_weights(weights, feature_names):
    """Pair every topic's terms with their weights, strongest first.

    Args:
        weights: 2-D array with one row of term weights per topic.
        feature_names: vocabulary, aligned with the columns of ``weights``.

    Returns:
        A list with one array per topic. Each array has two columns
        (term, weight) and is ordered by descending absolute weight. Because
        terms and weights are stacked into one array, both columns come back
        as strings.
    """
    vocabulary = np.array(feature_names)

    # argsort is ascending, so each row is reversed to get the largest |weight| first.
    descending_orders = [row[::-1] for row in np.argsort(np.abs(weights))]

    topics = []
    for topic_weights, order in zip(weights, descending_orders):
        ranked_terms = vocabulary[order]
        ranked_weights = topic_weights[order]
        topics.append(np.vstack((ranked_terms, ranked_weights)).T)

    return topics


# prints components of all the topics
# obtained from topic modeling
def print_topics_udf(topics, total_topics=1,
                     weight_threshold=0.0001,
                     display_weights=False,
                     num_terms=None):
    """Print the terms (optionally with weights) of the first topics.

    Args:
        topics: indexable collection of topics; each topic is an iterable of
            (term, weight) pairs whose weight can be converted with ``float``.
        total_topics: number of topics to print, starting at index 0.
        weight_threshold: terms whose absolute weight is below this value are
            dropped (the check uses the unrounded weight).
        display_weights: when True print (term, weight) tuples with the weight
            rounded to 2 decimals, otherwise print the terms only.
        num_terms: maximum number of terms to print per topic; a falsy value
            (None or 0) prints every term that passes the threshold.

    Returns:
        None. The output goes to stdout.
    """
    for index in range(total_topics):
        topic = [(term, round(float(wt), 2))
                 for term, wt in topics[index]
                 if abs(float(wt)) >= weight_threshold]

        # The previous `print(x[:n]) if num_terms else x` only printed when
        # num_terms was set; with the default (None) the terms were evaluated
        # and silently discarded. Slice first, then always print.
        if num_terms:
            topic = topic[:num_terms]

        if display_weights:
            print('Topic #'+str(index+1)+' with weights')
            print(topic)
        else:
            print('Topic #'+str(index+1)+' without weights')
            print([term for term, wt in topic])

# returns (instead of printing) the components of the topics
# obtained from topic modeling
def get_topics_udf(topics, total_topics=1,
                     weight_threshold=0.0001,
                     num_terms=None):
    """Collect the (term, weight) pairs of the first ``total_topics`` topics.

    Args:
        topics: indexable collection of topics; each topic is an iterable of
            (term, weight) pairs whose weight can be converted with ``float``.
        total_topics: number of topics to collect, starting at index 0.
        weight_threshold: pairs whose absolute (unrounded) weight is below this
            value are dropped.
        num_terms: maximum number of pairs kept per topic; a falsy value keeps
            every pair that passes the threshold.

    Returns:
        A list with one entry per topic, each a list of (term, weight) tuples
        with the weight rounded to 2 decimals.
    """
    collected = []

    for position in range(total_topics):
        numeric_pairs = [(term, float(raw_weight))
                         for term, raw_weight in topics[position]]
        kept = [(term, round(weight, 2))
                for term, weight in numeric_pairs
                if not abs(weight) < weight_threshold]

        if num_terms:
            kept = kept[:num_terms]
        collected.append(kept)

    return collected

def getTermsAndSizes(topic_display_list_item):
    """Split a sequence of (term, size) pairs into two parallel lists.

    Returns:
        A ``(terms, sizes)`` tuple of lists, in the original order.
    """
    pairs = [(term, size) for term, size in topic_display_list_item]
    terms = [term for term, _ in pairs]
    sizes = [size for _, size in pairs]
    return terms, sizes

#### Model

In [None]:
# Model configuration: number of topics to extract and number of top terms
# reported per topic in the cells below.
number_of_topics = 30
num_of_terms = 9
#nmf = NMF()
# Seeded (random_state=43) so the factorisation is reproducible.
# NOTE(review): `alpha=` and `get_feature_names()` are spelled differently in
# newer scikit-learn releases (alpha_W/alpha_H, get_feature_names_out) —
# confirm against the scikit-learn version pinned for this project.
nmf = NMF(n_components=number_of_topics, random_state=43,  alpha=0.1, l1_ratio=0.5)
# Fit on the TF-IDF matrix; nmf_output is the transformed (document-side) result.
nmf_output = nmf.fit_transform(tfidf_feature_matrix)

# Vocabulary and the fitted components (the topic-side weights) used by the
# helper functions defined above.
nmf_feature_names = tfidf_vectorizer.get_feature_names()
nmf_weights = nmf.components_

In [None]:
topics = get_topics_terms_weights(nmf_weights, nmf_feature_names)
#print_topics_udf(topics, total_topics=number_of_topics, num_terms=num_of_terms, display_weights=True)

In [None]:
topics_display_list = get_topics_udf(topics, total_topics=number_of_topics, num_terms=num_of_terms)
print(len(topics_display_list[0]))
#topics_display_list

In [None]:
# Flatten the top terms of every NMF topic into one wide row per topic:
# topic, term_0, weight_0, ..., term_{n-1}, weight_{n-1}.
topic_columns = ['topic']
for i in range(num_of_terms):
    topic_columns.append(f'term_{i}')
    topic_columns.append(f'weight_{i}')

# Collect plain dict records and build the frame once. Growing the frame with
# DataFrame.append copied it on every iteration, and that method is no longer
# available in recent pandas releases.
topic_records = []
for i in range(number_of_topics):
    new_topic = {'topic': f'topic_{i}'}
    # A topic can keep fewer than num_of_terms terms once the weight threshold
    # has been applied; the missing term/weight cells are then left as NaN
    # instead of raising IndexError.
    for j, term_and_weight in enumerate(topics_display_list[i][:num_of_terms]):
        new_topic[f'term_{j}'] = term_and_weight[0]
        new_topic[f'weight_{j}'] = term_and_weight[1]
    topic_records.append(new_topic)

df_topics = pd.DataFrame(topic_records, columns=topic_columns)
#df_topics.to_csv('../raw_data/NMFResults.csv', header=True, index=False, encoding='utf-8')
df_topics

In [None]:
'''
for i in range(number_of_topics):
    terms, sizes = getTermsAndSizes(topics_display_list[i])

    num_top_words = num_of_terms
    fontsize_base = num_of_terms / np.max(sizes)

    num_topics = 1

    for t in range(num_topics):
        fig, ax = plt.subplots(1, num_topics, figsize=(6, 12))
        plt.ylim(0, num_top_words + 1.0)
        plt.xticks([])
        plt.yticks([])
        plt.title('Topic #{}'.format(t))

        for i, (word, share) in enumerate(zip(terms, sizes)):
            word = word + " (" + str(share) + ")"
            plt.text(0.3, num_top_words-i-1.0, word, fontsize=fontsize_base*share)

    plt.tight_layout()
    sns.set(rc={'axes.facecolor':'cornflowerblue', 'figure.facecolor':'cornflowerblue'})
'''

### LDA

In [None]:
# TODO(work in progress): the author's notes stopped here. The marker used to be
# the bare text "parei aqui" ("I stopped here"), which is a SyntaxError and made
# the whole cell fail; it is kept only as this comment.
# Instantiate the LDA model
# (the baseline below is disabled by wrapping it in a string literal)
'''lda_model = LatentDirichletAllocation(n_components=30, max_iter=100, learning_method='online', random_state=43,
                                     batch_size=128, evaluate_every=-1, n_jobs=-1)

# fit transform the feature matrix
lda_output = lda_model.fit_transform(feature_matrix)

# display the lda_output and its shape
print(lda_output)
print(lda_output.shape)
'''

In [None]:
# print log-likelihood
#print("Log likelihood: ", lda_model.score(feature_matrix))

In [None]:
# print perplexity
#print("Perplexity: ", lda_model.perplexity(feature_matrix))

In [None]:
# Define Search Param
# Wider grids tried earlier, kept for reference:
#search_params = {'n_components': [2, 3, 4, 5, 10, 15, 20, 25], 'learning_decay': [.5, .7, .9]}

#search_params = {'n_components': [30], 'learning_decay': [.5, .7, .9]}

search_params = {'n_components': [30], 'learning_decay': [.7]}

# Init the model. It is seeded (same seed as the NMF model) so that the grid
# search and every table/plot derived from the best estimator are reproducible
# after Restart & Run All; an unseeded LDA gives different topics on each run.
lda = LatentDirichletAllocation(random_state=43)

# Init Grid Search class
model = GridSearchCV(lda, search_params)

# Fit on the raw count matrix and keep the best estimator for the cells below.
model.fit(feature_matrix)
best_lda_model = model.best_estimator_
print("Best model's params: ", model.best_params_)
print("Best log likelihood score: ", model.best_score_)
print("Model perplexity: ", best_lda_model.perplexity(feature_matrix))

In [None]:
df_cv_results = pd.DataFrame(model.cv_results_)
#df_cv_results.to_csv('../raw_data/LDAGridSearchResults.csv', header=True, index=False, encoding='utf-8')

In [None]:
# Apply the theme BEFORE drawing. It used to be set after the plot, so it only
# affected later figures: the first run and a re-run of this cell looked different.
sns.set(rc={'axes.facecolor':'cornflowerblue', 'figure.facecolor':'cornflowerblue'})
# Mean CV score per number of topics, one line per learning decay.
sns.pointplot(x='param_n_components', y='mean_test_score', hue='param_learning_decay', data=df_cv_results)

In [None]:
best_lda_model

In [None]:
# Create a document to topic matrix
lda_output = best_lda_model.transform(feature_matrix)

In [None]:
# column names: one 'Topic_<i>' label per component of the best LDA model
topicnames = ['Topic_' + str(i) for i in range(best_lda_model.n_components)]

# index names: one 'Doc_<i>' label per document
# (the content list is only used here for its length)
clean_content_text = df['content'].to_list()
docnames = ['Doc_' + str(i) for i in range(len(clean_content_text))]

# create a dataframe of the document-topic output, rounded to 2 decimals
df_document_topic = pd.DataFrame(np.round(lda_output,2), columns=topicnames, index=docnames)

df_document_topic.head()

In [None]:
# dominant topic
# Take the argmax over the topic columns only. Using every column made this cell
# non-idempotent: on a second run the 'dominant_topic' column itself took part
# in the argmax and could win over the topic weights.
df_document_topic['dominant_topic'] = np.argmax(df_document_topic[topicnames].values, axis=1)
df_document_topic.head()

In [None]:
# Apply the theme before drawing so this figure uses it on the first run as well.
sns.set(rc={'axes.facecolor':'cornflowerblue', 'figure.facecolor':'cornflowerblue'})
# Number of documents per dominant topic. The series is passed by keyword:
# newer seaborn releases no longer accept it as a positional argument.
sns.countplot(x=df_document_topic['dominant_topic'])

In [None]:
#components_ contains the word to topic matrix
best_lda_model.components_.shape

In [None]:
# check the shape
feature_matrix.shape

In [None]:
# Topic - Keyword matrix: the raw components_ of the best LDA model
# NOTE(review): `df_topic_keywords` is rebound further down to the table of top
# keywords, so this full matrix is only available until that cell runs.
df_topic_keywords = pd.DataFrame(best_lda_model.components_)

# assign column and index: vocabulary terms as columns, topic labels as rows
df_topic_keywords.columns = count_vectorizer.get_feature_names()
df_topic_keywords.index = topicnames


# check the head
#df_topic_keywords.iloc[:,:10]

In [None]:
# Top `n_words` keywords (20 by default) for each topic of a fitted LDA model
def show_topics(vectorizer=count_vectorizer, lda_model=best_lda_model, n_words=20):
    """Return the highest-weighted vocabulary terms of every topic.

    Args:
        vectorizer: fitted vectorizer that supplies the vocabulary; defaults to
            the notebook's ``count_vectorizer`` as bound when this cell runs.
        lda_model: fitted model exposing ``components_``; defaults to the
            notebook's ``best_lda_model`` as bound when this cell runs.
        n_words: number of terms to keep per topic.

    Returns:
        A list with one array of terms per topic, ordered by descending weight.
    """
    vocabulary = np.array(vectorizer.get_feature_names())
    # Negating the weights turns the ascending argsort into a descending ranking.
    return [vocabulary.take(np.argsort(-component)[:n_words])
            for component in lda_model.components_]

In [None]:
topic_keywords = show_topics(count_vectorizer, best_lda_model, 20)

In [None]:
#topic_keywords

In [None]:
# Tabulate the top keywords: one row per topic, one column per keyword rank.
# This rebinds `df_topic_keywords`, which previously held the full topic-keyword matrix.
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
#df_topic_keywords.to_csv('../raw_data/LDAResults.csv', header=True, index=False, encoding='utf-8')
df_topic_keywords

## BERT

In [None]:
# TODO(work in progress): notes stopped here. The marker used to be the bare text
# "parou aqui" ("stopped here"), which is a SyntaxError and made the whole cell
# fail; it is kept only as this comment.
import tensorflow as tf
import transformers
from transformers import BertTokenizer
# https://huggingface.co/transformers/pretrained_models.html
# https://ipywidgets.readthedocs.io/en/stable/user_install.html
# bert-large-uncased
# !pip install transformers

# text = df['content'].to_list()
text = df['content'].values

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

# Token ids of every document and the length of each encoded sequence.
input_ids = []
lengths = []

for txt in text:
    # Encode one document at a time; sequences are truncated to 512 tokens.
    enconded_sent = tokenizer.encode(str(txt),
                                     add_special_tokens=True,
                                     return_tensors='tf',
                                     max_length=512,
                                     truncation=True
                                    )
    input_ids.append(enconded_sent)
    # The result is indexed at [0] to reach the single encoded sequence.
    lengths.append(len(enconded_sent[0]))

print(len(input_ids))
print(len(lengths))

## BERT topic

In [1]:
# pip install bertopic
# Topic Modeling with BERT Transformers
# https://www.youtube.com/watch?v=TLPmlVeEf1k
# https://maartengr.github.io/BERTopic/index.html
# https://maartengr.github.io/BERTopic/tutorial/visualization/visualization.html
# https://maartengr.github.io/BERTopic/tutorial/topicreduction/topicreduction.html
# https://www.atoti.io/topic-modeling-on-twitter-using-sentence-bert/
# https://hdbscan.readthedocs.io/_/downloads/en/stable/pdf/

from bertopic import BERTopic

# NOTE: this cell needs `DataProcessor` from the imports cell at the top of the
# notebook. The recorded NameError below comes from running this cell first on a
# fresh kernel (execution count 1, before the imports cell).
# NOTE(review): `dp` and `df` are rebound here; `df` now comes from
# load_dataset() rather than load_dataset_processed() — presumably the
# unprocessed articles; confirm against DataProcessor.
dp = DataProcessor(csv_path='../raw_data/', csv_name='articles1')
df = dp.load_dataset()

docs = df['content'].values

# Fit BERTopic with a minimum topic size of 150 documents.
topic_model = BERTopic(min_topic_size=150, language='english', calculate_probabilities=True)
#topic_model = BERTopic(language='english', calculate_probabilities=True)
# `topics` is rebound here as well (it held the NMF topics earlier); the second
# value returned by fit_transform is discarded.
topics, _ = topic_model.fit_transform(docs)

NameError: name 'DataProcessor' is not defined

In [None]:
topic_freq = topic_model.get_topic_freq()

# Rows with Topic == -1 hold the documents that were not assigned to any topic.
# Summing the (possibly empty) selection gives 0 when there is no such row,
# where `.iloc[0]` used to raise IndexError; counting the remaining rows avoids
# the off-by-one of "rows - 1" in that same case.
outlier_mask = topic_freq['Topic'] == -1
outliers = topic_freq.loc[outlier_mask, 'Count'].sum()
classified_topics = int((~outlier_mask).sum())

print(f'{outliers} documents have not been classified')
print(f'The other {topic_freq["Count"].sum() - outliers} documents are {classified_topics} topics')

In [None]:
topic_freq.head()

In [None]:
print(f'There are {topic_freq["Count"].iloc[1]} documents that are talking about topic ID {topic_freq["Topic"].iloc[1]}')

In [None]:
topic_model.get_topic(topic_freq["Topic"].iloc[1])

In [None]:
df_topic_info = topic_model.get_topic_info()

#df_topic_info.to_csv('../raw_data/BERTopicInfo.csv', header=True, index=False, encoding='utf-8')
df_topic_info

In [None]:
# One wide row per BERTopic topic: topic name followed by its terms and weights.
topics = topic_model.get_topics()
# Topic ids are iterated from -1 up to number_of_topics - 1 below, hence the "- 1".
number_of_topics = len(topics)-1
num_of_terms = len(topics[0])

topic_columns = ['topic']
for i in range(num_of_terms):
    topic_columns.append(f'term_{i}')
    topic_columns.append(f'weight_{i}')

# Collect plain dict records and build the frame once. Growing the frame with
# DataFrame.append copied it on every iteration, and that method is no longer
# available in recent pandas releases.
topic_records = []
for i in range(-1,number_of_topics):
    new_topic = {'topic': topic_model.topic_names[i]} #f'topic_{i}'
    for j in range(num_of_terms):
        new_topic[f'term_{j}'] = topics[i][j][0]
        new_topic[f'weight_{j}'] = topics[i][j][1]
    topic_records.append(new_topic)

df_topics = pd.DataFrame(topic_records, columns=topic_columns)

#df_topics.to_csv('../raw_data/BERTopicResults.csv', header=True, index=False, encoding='utf-8')
df_topics

In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_barchart()

In [None]:
topic_model.visualize_heatmap()

In [None]:
#topic_model.visualize_topics_over_time()
#topic_model.topics_per_class(docs, topics, classes)
print(topic_model.topic_sim_matrix.shape)
topic_model.topic_sim_matrix

In [None]:
similar_topics, similarity = topic_model.find_topics(topic_model.topic_names[43], top_n=3)
print(topic_model.topic_names[similar_topics[0]])
topic_model.get_topic(similar_topics[0])

In [None]:
# Look up the topics closest to the search term 'health' (5 are requested) and
# print the name and the terms of the three best matches.
similar_topics, similarity = topic_model.find_topics('health', top_n=5)
for rank in range(3):
    matched_topic = similar_topics[rank]
    print(topic_model.topic_names[matched_topic])
    print(topic_model.get_topic(matched_topic))

In [None]:
# For every topic, list the `top_similarity` topics that best match its name,
# together with the similarity score of each match.
topics = topic_model.get_topics()
number_of_topics = len(topics)-1
top_similarity = 11

topic_columns = ['topic']
for i in range(top_similarity):
    topic_columns.append(f'topic_{i}')
    topic_columns.append(f'weight_{i}')

# Collect plain dict records and build the frame once. Growing the frame with
# DataFrame.append copied it on every iteration, and that method is no longer
# available in recent pandas releases.
similarity_records = []
for i in range(-1,number_of_topics):
    new_topic = {'topic': topic_model.topic_names[i]}
    similar_topics, similarity = topic_model.find_topics(topic_model.topic_names[i], top_n=top_similarity)
    # Iterate over what was actually returned: when fewer than `top_similarity`
    # matches come back, the remaining cells stay NaN instead of raising IndexError.
    matches = zip(similar_topics[:top_similarity], similarity[:top_similarity])
    for j, (similar_topic, score) in enumerate(matches):
        new_topic[f'topic_{j}'] = topic_model.topic_names[similar_topic]
        new_topic[f'weight_{j}'] = round(score, 4)
    similarity_records.append(new_topic)

df_topics = pd.DataFrame(similarity_records, columns=topic_columns)

#df_topics.to_csv('../raw_data/BERTopicSimilarity.csv', header=True, index=False, encoding='utf-8')
df_topics

In [None]:
# topic_model.   <- unfinished attribute lookup left over from exploration; as code it is a SyntaxError, so it is kept only as a comment

In [None]:
def correlation_matrix_to_df(df_corr):
    """Flatten a labelled square matrix into a long table of unique pairs.

    Args:
        df_corr: DataFrame whose first column holds the row labels and whose
            remaining columns hold one value per (row label, column label).
            The row labels are matched against the column names.

    Returns:
        DataFrame with columns ITEM1 (column label), ITEM2 (row label) and CORR
        (the value), sorted by CORR in descending order with a fresh 0..n-1
        index. A row label is skipped once its own column has been processed,
        so each unordered pair is emitted once (the diagonal is included).
    """
    # Read labels and values by POSITION. Indexing a label-indexed row Series
    # with integers (`row[0]`, `row[k]`) relies on a deprecated pandas fallback.
    row_labels = df_corr.iloc[:, 0].tolist()

    processed_columns = set()  # set membership instead of an O(n) list scan
    first_items = []
    second_items = []
    values = []

    for k in range(1, df_corr.shape[1]):
        column_label = df_corr.columns[k]
        column_values = df_corr.iloc[:, k].tolist()
        for row_label, value in zip(row_labels, column_values):
            if row_label not in processed_columns:
                first_items.append(column_label)
                second_items.append(row_label)
                values.append(value)
        processed_columns.add(column_label)

    df_res = pd.DataFrame({'ITEM1': first_items,
                           'ITEM2': second_items,
                           'CORR': values})
    # sort_values already returns a new frame; reset the index without mutation.
    return df_res.sort_values(by='CORR', ascending=False).reset_index(drop=True)

In [None]:
# Export the topic-to-topic similarity matrix as a labelled table:
# one row and one column per topic name.
corr_matrix = topic_model.topic_sim_matrix

topics = topic_model.get_topics()
number_of_topics = len(topics)-1

topic_columns = ['topic']
for i in range(-1,number_of_topics):
    topic_columns.append(topic_model.topic_names[i])

# Collect plain dict records and build the frame once. Growing the frame with
# DataFrame.append copied it on every iteration, and that method is no longer
# available in recent pandas releases.
# The per-row find_topics() call that used to sit in this loop was dropped: its
# results were never read here, so it only added cost.
similarity_records = []
for i in range(-1,number_of_topics):
    new_topic = {'topic': topic_model.topic_names[i]}
    for j in range(-1,number_of_topics):
        # NOTE(review): i and j start at -1, so the first lookup is
        # corr_matrix[-1, ...], i.e. the LAST row/column if this is a NumPy
        # array. Confirm that the row order of topic_sim_matrix matches the
        # topic ids used for the names.
        new_topic[topic_model.topic_names[j]] = round(corr_matrix[i,j], 4)
    similarity_records.append(new_topic)

df_similarity = pd.DataFrame(similarity_records, columns=topic_columns)

df_similarity.to_csv('../raw_data/BERTopicSimilarity.csv', header=True, index=False, encoding='utf-8')
df_similarity

In [None]:
df_similarity_simpl = correlation_matrix_to_df(df_similarity)
#df_similarity_simpl.to_csv('../raw_data/BERTopicSimilaritySimpl.csv', header=True, index=False, encoding='utf-8')
df_similarity_simpl

In [None]:
from sentence_transformers import SentenceTransformer

# Encode every document with a pretrained sentence-embedding model.
# NOTE(review): `model` is rebound here; in the LDA section the same name held
# the GridSearchCV object, so cells reading `model.cv_results_` must run before
# this one.
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
embeddings = model.encode(docs)
embeddings.shape

In [None]:
def get_most_relevant_documents(cluster_id, condensed_tree):
    """Return the ids of the points at the core of a cluster's leaf clusters.

    Args:
        cluster_id: id of the cluster in the condensed tree; must be > -1.
        condensed_tree: condensed tree object exposing ``_raw_tree`` with the
            fields 'parent', 'child', 'child_size' and 'lambda_val'.

    Returns:
        1-D integer array with the 'child' entries that have the largest
        lambda value inside every leaf cluster found under ``cluster_id``.

    Raises:
        AssertionError: if ``cluster_id`` is not greater than -1.
    """
    # Imported here so the function does not depend on a later cell having
    # imported hdbscan. NOTE: `_recurse_leaf_dfs` and `_raw_tree` are private
    # hdbscan names.
    import hdbscan

    assert cluster_id > -1, "The topic's label should be greater than -1!"

    raw_tree = condensed_tree._raw_tree

    # Just the cluster elements of the tree, excluding singleton points
    cluster_tree = raw_tree[raw_tree['child_size'] > 1]

    # Get the leaf cluster nodes under the cluster we are considering
    leaves = hdbscan.plots._recurse_leaf_dfs(cluster_tree, cluster_id)

    # Now collect up the last remaining points of each leaf cluster (the heart of the leaf)
    result = np.array([])

    for leaf in leaves:
        in_leaf = raw_tree['parent'] == leaf
        max_lambda = raw_tree['lambda_val'][in_leaf].max()
        points = raw_tree['child'][in_leaf & (raw_tree['lambda_val'] == max_lambda)]
        result = np.hstack((result, points))

    # `np.int` was only an alias of the builtin and has been removed from NumPy;
    # the builtin `int` gives the same dtype.
    return result.astype(int)

In [None]:
clusterer = topic_model.hdbscan_model
tree = clusterer.condensed_tree_
clusters = tree._select_clusters()
len(clusters)

In [None]:
# hdbscan is imported here, after get_most_relevant_documents was defined; that
# function looks the module up at call time, so this import must run before the
# call below.
import hdbscan
# Get the clusterer model, the clusters' tree and the clusters (topics ids)
clusterer = topic_model.hdbscan_model
tree = clusterer.condensed_tree_
clusters = tree._select_clusters()

# Get the ids of the most relevant documents (exemplars) for the cluster stored
# at position 1 of `clusters`
c_exemplars = get_most_relevant_documents(clusters[1], tree)
c_exemplars

In [None]:
len(c_exemplars)

In [None]:
print(topic_model.topic_names[1])
docs[9458]

In [None]:
# Exploratory, step-by-step copy of the body of get_most_relevant_documents for
# the cluster at position 1; it prints intermediate values and does not
# accumulate a result.
cluster_id = clusters[1]
condensed_tree = tree


raw_tree = condensed_tree._raw_tree

# Just the cluster elements of the tree, excluding singleton points
cluster_tree = raw_tree[raw_tree['child_size'] > 1]

# Get the leaf cluster nodes under the cluster we are considering
leaves = hdbscan.plots._recurse_leaf_dfs(cluster_tree, cluster_id)
print(leaves)

# Now collect up the last remaining points of each leaf cluster (the heart of the leaf)
# (`result` stays empty in this cell: the stacking step below is commented out)
result = np.array([])


for leaf in leaves:
    # Largest lambda value among the entries whose parent is this leaf.
    max_lambda = raw_tree['lambda_val'][raw_tree['parent'] == leaf].max()
    print(f'leaf = {leaf} max_lambda = {max_lambda}')
    # Entries of this leaf that sit at that largest lambda value.
    points = raw_tree[['child', 'lambda_val']][(raw_tree['parent'] == leaf) & (raw_tree['lambda_val'] == max_lambda)]
    #points = raw_tree[['child', 'lambda_val']][(raw_tree['parent'] == leaf)]
    #print(points)
    #result = np.hstack((result, points))

#raw_tree[['parent', 'child', 'lambda_val']]

In [None]:
#!pip install networkx

In [None]:
#!pip install networkx
import networkx
# Re-fetch the clusterer, its condensed tree and the selected clusters.
clusterer = topic_model.hdbscan_model
tree = clusterer.condensed_tree_
clusters = tree._select_clusters()

# Convert the condensed tree into a networkx graph for plotting.
xx = clusterer.condensed_tree_.to_networkx()
#type(clusterer)
xx

In [None]:
#xx.number_of_nodes()
# Draw the condensed-tree graph with a spring layout; each edge is drawn with a
# line width equal to its 'weight' attribute.
plt.figure(figsize=(12,12))
edges = xx.edges()
pos = networkx.spring_layout(xx, k = 0.5) # k regulates the distance between nodes
weights = [xx[u][v]['weight'] for u,v in edges]
networkx.draw(xx, with_labels=True, node_color='skyblue', font_weight='bold',  width=weights, pos=pos)


## Test