## Imports

In [1]:
import pandas as pd
import numpy as np
from LeWagon_FinalProject.data import DataProcessor
from bertopic import BERTopic
import hdbscan
from sklearn.metrics.pairwise import cosine_similarity

2021-07-20 19:04:04.647904: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


## Functions

In [2]:
def generate_docs(df_, number_of_docs):
    """Select the leading documents, cache them as CSV and return them.

    Args:
        df_: DataFrame that contains at least the 'date' and 'content' columns.
        number_of_docs: Upper bound of the row slice that is kept; also used
            as the suffix of the output file name.

    Returns:
        A fresh DataFrame with the 'date' and 'content' columns of the kept
        rows, re-indexed from 0.

    Side effects:
        Writes '../raw_data/BERTDocsContent_<number_of_docs>.csv'.
    """
    wanted_columns = df_[['date', 'content']]
    subset = wanted_columns[0:number_of_docs].copy()
    # Drop the original index so the row number doubles as the document id.
    subset = subset.reset_index(drop=True)

    target_path = f'../raw_data/BERTDocsContent_{str(number_of_docs)}.csv'
    subset.to_csv(target_path, header=True, index=False, encoding='utf-8')
    return subset.copy()

In [3]:
def generate_topic_info(bert_model, number_of_docs):
    """Export the model's topic overview to CSV and return a copy of it.

    Args:
        bert_model: Object exposing ``get_topic_info()`` that returns a
            DataFrame (a fitted BERTopic model in this notebook).
        number_of_docs: Suffix used in the output file name.

    Returns:
        A copy of the DataFrame returned by ``bert_model.get_topic_info()``.

    Side effects:
        Writes '../raw_data/BERTopicInfo_<number_of_docs>.csv'.
    """
    overview = bert_model.get_topic_info()
    target_path = f'../raw_data/BERTopicInfo_{str(number_of_docs)}.csv'
    overview.to_csv(target_path, header=True, index=False, encoding='utf-8')
    return overview.copy()

In [4]:
def generate_terms(bert_model, number_of_docs):
    """Flatten the per-topic term weights into a long table and cache it.

    Args:
        bert_model: Object exposing ``get_topics()`` (mapping of topic id to a
            sequence of ``(term, weight)`` entries) and a ``topic_names``
            mapping of topic id to display name.
        number_of_docs: Suffix used in the output file name.

    Returns:
        DataFrame with columns 'topic' (topic name), 'term' and 'weight'
        (rounded to 6 decimals), one row per (topic, term), ordered by
        ascending topic id.

    Side effects:
        Writes '../raw_data/BERTopicTerms_<number_of_docs>.csv'.
    """
    topics = bert_model.get_topics()
    topic_columns = ['topic', 'term', 'weight']

    # Use the model that was passed in (not a notebook-level global) and walk
    # the topic ids that actually exist, so the function neither requires a
    # topic 0 nor an outlier topic -1.
    records = [
        {
            'topic': bert_model.topic_names[topic_id],
            'term': entry[0],
            'weight': round(entry[1], 6),
        }
        for topic_id in sorted(topics)
        for entry in topics[topic_id]
    ]

    # Build the frame once; appending row by row is quadratic and
    # DataFrame.append no longer exists in pandas >= 2.0.
    df_topics = pd.DataFrame(records, columns=topic_columns)

    df_topics.to_csv(f'../raw_data/BERTopicTerms_{str(number_of_docs)}.csv', header=True, index=False, encoding='utf-8')
    return df_topics

In [5]:
def correlation_matrix_to_df(df_corr):
    """Turn a wide similarity table into a long (topic1, topic2, similarity) table.

    Args:
        df_corr: DataFrame whose first column holds the row item and whose
            remaining columns each hold the similarity of every row item to
            the item named by the column label.

    Returns:
        DataFrame with columns 'topic1' (column label), 'topic2' (value of the
        first column) and 'similarity', sorted by similarity in descending
        order with a fresh 0..n-1 index. All pairs are kept, including
        self-pairs and both orientations of each pair.
    """
    # Access columns strictly by position: integer keys on a label-indexed
    # row Series (the former `row[0]` / `row[k]`) are deprecated in pandas and
    # resolve to the wrong cell when the column labels are integers.
    row_items = df_corr.iloc[:, 0].tolist()

    items_1 = []
    items_2 = []
    similarities = []
    for position in range(1, df_corr.shape[1]):
        column_values = df_corr.iloc[:, position].tolist()
        items_1.extend([df_corr.columns[position]] * len(column_values))
        items_2.extend(row_items)
        similarities.extend(column_values)

    df_res = pd.DataFrame({'topic1': items_1,
                           'topic2': items_2,
                           'similarity': similarities})
    return df_res.sort_values(by='similarity', ascending=False).reset_index(drop=True)

In [6]:
def generate_topic_similarity(bert_model, number_of_docs):
    """Export pairwise topic similarities as a long table and return it.

    Args:
        bert_model: Object exposing ``topic_sim_matrix`` (square, indexable as
            ``matrix[row, col]``), ``get_topics()`` (mapping keyed by topic
            id) and ``topic_names`` (topic id -> display name).
        number_of_docs: Suffix used in the output file name.

    Returns:
        DataFrame with columns 'topic1', 'topic2' and 'similarity' as produced
        by ``correlation_matrix_to_df``.

    Side effects:
        Writes '../raw_data/BERTopicSimilarity_<number_of_docs>.csv'.
    """
    corr_matrix = bert_model.topic_sim_matrix
    topic_names = bert_model.topic_names

    # Map topic ids to matrix positions explicitly. Indexing the matrix with
    # the raw ids is wrong as soon as the outlier topic -1 exists: -1 selects
    # the LAST row/column and every other label is shifted by one.
    # NOTE(review): assumes the matrix rows/columns follow ascending topic id
    # (how BERTopic builds its c-TF-IDF matrix) — confirm for the installed
    # BERTopic version.
    topic_ids = sorted(bert_model.get_topics())

    topic_columns = ['topic'] + [topic_names[topic_id] for topic_id in topic_ids]

    records = []
    for row_pos, row_id in enumerate(topic_ids):
        record = {'topic': topic_names[row_id]}
        for col_pos, col_id in enumerate(topic_ids):
            record[topic_names[col_id]] = round(corr_matrix[row_pos, col_pos], 6)
        records.append(record)

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    df_similarity = pd.DataFrame(records, columns=topic_columns)

    df_topic_similarity = correlation_matrix_to_df(df_similarity)
    df_topic_similarity.to_csv(f'../raw_data/BERTopicSimilarity_{str(number_of_docs)}.csv', header=True, index=False, encoding='utf-8')
    return df_topic_similarity.copy()

In [7]:
def get_topic_documents(cluster_id, condensed_tree):
    """Collect the entries that sit under the leaf clusters of one cluster.

    Args:
        cluster_id: Node id of the cluster inside the condensed tree. Values
            <= -1 produce empty results instead of raising.
        condensed_tree: Object exposing ``_raw_tree``, a structured array with
            the fields 'parent', 'child', 'child_size' and 'lambda_val'
            (an hdbscan condensed tree in this notebook).

    Returns:
        Tuple ``(children, lambda_vals)``: an int64 array with the 'child'
        entries and a float64 array with the matching 'lambda_val' entries.
    """
    if cluster_id <= -1:
        return np.array([]).astype(np.int64), np.array([]).astype(np.float64)

    raw_tree = condensed_tree._raw_tree

    # Restrict to cluster-to-cluster edges (child_size > 1) to find the leaves.
    cluster_edges = raw_tree[raw_tree['child_size'] > 1]
    leaf_ids = hdbscan.plots._recurse_leaf_dfs(cluster_edges, cluster_id)

    # Start from an empty chunk so the result is well defined without leaves.
    child_chunks = [np.array([])]
    lambda_chunks = [np.array([])]
    for leaf_id in leaf_ids:
        # Every entry whose parent is this leaf is kept (no filtering on the
        # maximum lambda value).
        under_leaf = raw_tree['parent'] == leaf_id
        child_chunks.append(raw_tree['child'][under_leaf])
        lambda_chunks.append(raw_tree['lambda_val'][under_leaf])

    children = np.hstack(child_chunks).astype(np.int64)
    lambda_vals = np.hstack(lambda_chunks).astype(np.float64)
    return children, lambda_vals

In [8]:
def generate_topic_documents(bert_model, number_of_docs):
    """Export, per selected cluster, the documents found under its leaves.

    Args:
        bert_model: Object exposing ``hdbscan_model`` (with a
            ``condensed_tree_`` attribute) and ``topic_names``.
        number_of_docs: Suffix used in the output file name.

    Returns:
        DataFrame with columns 'topic' (topic name), 'document' (value
        returned by ``get_topic_documents``) and 'lambda_val' (rounded to 6
        decimals).

    Side effects:
        Writes '../raw_data/BERTopicDocuments_<number_of_docs>.csv'.
    """
    tree = bert_model.hdbscan_model.condensed_tree_
    clusters = tree._select_clusters()

    relevant_columns = ['topic', 'document', 'lambda_val']

    records = []
    for cluster_pos, cluster_id in enumerate(clusters):
        rel_docs, lambda_vals = get_topic_documents(cluster_id, tree)
        # NOTE(review): the i-th selected cluster is labelled with
        # topic_names[i]; this presumes the model numbers its topics in the
        # same order as the selected clusters — confirm, since a re-ordering
        # of topics inside the model would mislabel the rows.
        topic_name = bert_model.topic_names[cluster_pos]
        records.extend(
            {
                'topic': topic_name,
                'document': document,
                'lambda_val': round(lambda_val, 6),
            }
            for document, lambda_val in zip(rel_docs, lambda_vals)
        )

    # Build the frame once; appending one row at a time is quadratic and
    # DataFrame.append was removed in pandas 2.0.
    df_rel_docs = pd.DataFrame(records, columns=relevant_columns)

    df_rel_docs.to_csv(f'../raw_data/BERTopicDocuments_{str(number_of_docs)}.csv', header=True, index=False, encoding='utf-8')
    return df_rel_docs

In [9]:
def get_cosine_similarity(feature_vec_1, feature_vec_2):
    """Return the cosine similarity of two vectors as a scalar.

    Both inputs must support ``reshape``; each is flattened into a single
    row before being handed to ``cosine_similarity``.
    """
    row_1 = feature_vec_1.reshape(1, -1)
    row_2 = feature_vec_2.reshape(1, -1)
    # The pairwise result is a 1x1 matrix; unwrap its only element.
    pairwise = cosine_similarity(row_1, row_2)
    return pairwise[0][0]

In [10]:
def generate_documents_similarity(bert_model, docs, number_of_docs):
    """Embed the documents and export their pairwise cosine similarity.

    Args:
        bert_model: Object whose ``embedding_model.embedding_model`` exposes
            ``encode(docs)`` (presumably a sentence-embedding backend wrapped
            by BERTopic — confirm for the installed version).
        docs: Collection of document texts passed to ``encode``.
        number_of_docs: Suffix used in the output file names.

    Returns:
        The matrix returned by ``cosine_similarity(embeddings, embeddings)``,
        i.e. one row and one column per document.

    Side effects:
        Writes the matrix twice: as comma-separated text to
        '../raw_data/BERTopicDocumentsSimilarity_<number_of_docs>.csv' and in
        NumPy binary format to the matching '.npy' file.
    """
    encoder = bert_model.embedding_model.embedding_model

    # Create documents embeddings
    embeddings = encoder.encode(docs)
    doc_sim_matrix = cosine_similarity(embeddings, embeddings)
    np.savetxt(f'../raw_data/BERTopicDocumentsSimilarity_{str(number_of_docs)}.csv', doc_sim_matrix, delimiter=',')
    np.save(f'../raw_data/BERTopicDocumentsSimilarity_{str(number_of_docs)}.npy', doc_sim_matrix)
    return doc_sim_matrix

"\n    sim_columns = ['cosine_similarity', 'document1', 'document2']\n    df_sim_docs = pd.DataFrame(columns=sim_columns)\n    temp_columns = ['cosine_similarity']\n    for i in range(0, len(docs)):\n        docs_sim = df_documents_similarity[i]\n        df_sim_docs_temp = pd.DataFrame(data = docs_sim, columns=temp_columns)\n        df_sim_docs_temp['document1'] = i\n        df_sim_docs_temp['document2'] = df_sim_docs_temp.index\n        df_sim_docs = df_sim_docs.append(df_sim_docs_temp, ignore_index=True)\n    \n    #df_sim_docs.to_csv(f'../raw_data/BERTopicDocumentsSimilarity_{str(number_of_docs)}.csv', header=True, index=False, encoding='utf-8')     \n    return df_sim_docs.copy()\n"

## Generate data

In [11]:
# Number of rows to keep; also the suffix of every file written or read below.
number_of_docs = 3_000

# Load the source dataset through the project's DataProcessor
# (named 'articles1' under '../raw_data/'; the file format is decided by DataProcessor).
dp = DataProcessor(csv_path='../raw_data/', csv_name='articles1')
df = dp.load_dataset()

In [12]:
# Keep the 'date' and 'content' columns of the first `number_of_docs` rows
# and cache them as BERTDocsContent_<n>.csv.
df_docs = generate_docs(df, number_of_docs)
print(df_docs.shape)
df_docs.head(1)

(3000, 2)


Unnamed: 0,date,content
0,2016-12-31,WASHINGTON — Congressional Republicans have...


In [13]:
# Load a previously saved BERTopic model from disk.
# NOTE(review): the '3000' in the file name presumably matches number_of_docs — confirm.
topic_model = BERTopic.load('../raw_data/bert_model_2_2_3000')

In [14]:
# Export the model's topic overview to BERTopicInfo_<n>.csv and preview it.
df_topics_info = generate_topic_info(topic_model, number_of_docs)
print(df_topics_info.shape)
df_topics_info.head(1)

(62, 3)


Unnamed: 0,Topic,Count,Name
0,-1,1143,-1_he said_mr trump_he was_that he


In [15]:
# Export one row per (topic, term) with its weight to BERTopicTerms_<n>.csv.
df_terms = generate_terms(topic_model, number_of_docs)
print(df_terms.shape)
df_terms.head(1)

(620, 3)


Unnamed: 0,topic,term,weight
0,-1_he said_mr trump_he was_that he,he said,0.00368


In [16]:
# Export the topic-to-topic similarities in long format to BERTopicSimilarity_<n>.csv.
df_topic_similarity = generate_topic_similarity(topic_model, number_of_docs)
print(df_topic_similarity.shape)
df_topic_similarity.head(1)

(3844, 3)


Unnamed: 0,topic1,topic2,similarity
0,-1_he said_mr trump_he was_that he,-1_he said_mr trump_he was_that he,1.0


In [17]:
# Export the documents found under each selected cluster to BERTopicDocuments_<n>.csv.
df_topic_documents = generate_topic_documents(topic_model, number_of_docs)
print(df_topic_documents.shape)
df_topic_documents.head(1)

(1709, 3)


Unnamed: 0,topic,document,lambda_val
0,0_the new_new york_york times_climate change,2204,6.0245


In [18]:
%%time
# Embed every document and compute the document-to-document cosine similarity.
# The cell is timed because, per the recorded output, it is the slow step of this notebook.
docs = df_docs['content'].values
matrix_documents_similarity = generate_documents_similarity(topic_model, docs, number_of_docs)
print(matrix_documents_similarity.shape)
matrix_documents_similarity

(3000, 3000)
CPU times: user 9min 38s, sys: 16 s, total: 9min 54s
Wall time: 1min 45s


array([[ 1.0000004 ,  0.04089347, -0.04203755, ...,  0.21397781,
         0.17506757,  0.2757412 ],
       [ 0.04089347,  0.9999999 ,  0.14799452, ...,  0.08324938,
        -0.0160458 ,  0.20328471],
       [-0.04203755,  0.14799452,  1.0000001 , ...,  0.18919231,
         0.17196642,  0.16383162],
       ...,
       [ 0.21397781,  0.08324938,  0.18919231, ...,  1.0000001 ,
         0.6301272 ,  0.59342474],
       [ 0.17506757, -0.0160458 ,  0.17196642, ...,  0.6301272 ,
         0.9999997 ,  0.65144277],
       [ 0.2757412 ,  0.20328471,  0.16383162, ...,  0.59342474,
         0.65144277,  1.0000001 ]], dtype=float32)

## Load processed data

In [19]:
# Re-declared here, presumably so this section can run without the
# "Generate data" cells above — confirm intent.
number_of_docs = 3_000

# Reload the cached documents written by generate_docs.
df_docs_content = pd.read_csv(f'../raw_data/BERTDocsContent_{str(number_of_docs)}.csv')
print(df_docs_content.shape)
df_docs_content.head(1)

(3000, 2)


Unnamed: 0,date,content
0,2016-12-31,WASHINGTON — Congressional Republicans have...


In [20]:
# Reload the cached topic overview written by generate_topic_info.
df_topic_info = pd.read_csv(f'../raw_data/BERTopicInfo_{str(number_of_docs)}.csv')
print(df_topic_info.shape)
df_topic_info.head(1)

(62, 3)


Unnamed: 0,Topic,Count,Name
0,-1,1143,-1_he said_mr trump_he was_that he


In [21]:
# Reload the cached topic/document table written by generate_topic_documents.
df_topic_docs = pd.read_csv(f'../raw_data/BERTopicDocuments_{str(number_of_docs)}.csv')
print(df_topic_docs.shape)
df_topic_docs.head(1)

(1709, 3)


Unnamed: 0,topic,document,lambda_val
0,0_the new_new york_york times_climate change,2204,6.0245


In [22]:
# Reload the cached topic/term weights written by generate_terms.
df_topic_terms = pd.read_csv(f'../raw_data/BERTopicTerms_{str(number_of_docs)}.csv')
print(df_topic_terms.shape)
df_topic_terms.head(1)

(620, 3)


Unnamed: 0,topic,term,weight
0,-1_he said_mr trump_he was_that he,he said,0.00368


In [23]:
# Reload the cached topic similarity table written by generate_topic_similarity.
df_topic_sim = pd.read_csv(f'../raw_data/BERTopicSimilarity_{str(number_of_docs)}.csv')
print(df_topic_sim.shape)
df_topic_sim.head(1)

(3844, 3)


Unnamed: 0,topic1,topic2,similarity
0,-1_he said_mr trump_he was_that he,-1_he said_mr trump_he was_that he,1.0


In [24]:
# Reload the document similarity matrix from the NumPy binary file written by
# generate_documents_similarity. The text (CSV) copy can be read with the
# commented-out np.loadtxt call instead.
#matrix_docs_similarity = np.loadtxt(f'../raw_data/BERTopicDocumentsSimilarity_{str(number_of_docs)}.csv', delimiter=',')
matrix_docs_similarity = np.load(f'../raw_data/BERTopicDocumentsSimilarity_{str(number_of_docs)}.npy')
print(matrix_docs_similarity.shape)
matrix_docs_similarity

(3000, 3000)


array([[ 1.0000004 ,  0.04089347, -0.04203755, ...,  0.21397781,
         0.17506757,  0.2757412 ],
       [ 0.04089347,  0.9999999 ,  0.14799452, ...,  0.08324938,
        -0.0160458 ,  0.20328471],
       [-0.04203755,  0.14799452,  1.0000001 , ...,  0.18919231,
         0.17196642,  0.16383162],
       ...,
       [ 0.21397781,  0.08324938,  0.18919231, ...,  1.0000001 ,
         0.6301272 ,  0.59342474],
       [ 0.17506757, -0.0160458 ,  0.17196642, ...,  0.6301272 ,
         0.9999997 ,  0.65144277],
       [ 0.2757412 ,  0.20328471,  0.16383162, ...,  0.59342474,
         0.65144277,  1.0000001 ]], dtype=float32)