## Imports

In [56]:
import pandas as pd
import numpy as np
from LeWagon_FinalProject.data import DataProcessor
from bertopic import BERTopic
import hdbscan
from sklearn.metrics.pairwise import cosine_similarity

## Functions

In [57]:
def generate_docs(df_, number_of_docs):
    """Save and return the leading rows of the ``date``/``content`` columns.

    Args:
        df_: DataFrame containing at least the columns ``date`` and ``content``.
        number_of_docs: Number of leading rows to keep; also embedded in the
            name of the CSV file written under ``../raw_data/``.

    Returns:
        A copy of the trimmed two-column frame with a fresh 0..n-1 index.
    """
    # NOTE(review): this writes "BERTDocsContent_*", while
    # get_topic_start_end_dates reads "BERTopicDocsContent_*" - confirm which
    # file name is intended.
    two_columns = df_[['date', 'content']]
    subset = two_columns[0:number_of_docs].copy().reset_index(drop=True)
    subset.to_csv(
        f'../raw_data/BERTDocsContent_{str(number_of_docs)}.csv',
        header=True,
        index=False,
        encoding='utf-8',
    )
    return subset.copy()

In [58]:
def generate_topic_info(bert_model, number_of_docs):
    """Export the model's topic overview table to CSV and return a copy of it.

    Args:
        bert_model: Object exposing ``get_topic_info()`` that returns a DataFrame.
        number_of_docs: Suffix used in the output file name
            (``../raw_data/BERTopicInfo_<number_of_docs>.csv``).

    Returns:
        A copy of the DataFrame returned by ``bert_model.get_topic_info()``.
    """
    topic_info = bert_model.get_topic_info()
    output_path = f'../raw_data/BERTopicInfo_{str(number_of_docs)}.csv'
    topic_info.to_csv(output_path, header=True, index=False, encoding='utf-8')
    return topic_info.copy()

In [59]:
def generate_terms(bert_model, number_of_docs):
    """Flatten every topic's (term, weight) pairs into a long table and save it.

    Args:
        bert_model: Object exposing ``get_topics()`` (a mapping from topic id to
            a sequence of ``(term, weight)`` entries) and ``topic_names``
            (indexable by topic id).
        number_of_docs: Suffix used in the output file name
            (``../raw_data/BERTopicTerms_<number_of_docs>.csv``).

    Returns:
        A DataFrame with columns ``topic`` (the topic's name), ``term`` and
        ``weight`` (rounded to 6 decimals), one row per term, with topics in
        ascending id order.
    """
    topics = bert_model.get_topics()
    # Names come from the model passed in, not from a notebook-level global.
    topic_names = bert_model.topic_names

    topic_columns = ['topic', 'term', 'weight']

    # Iterate over the topic ids that actually exist, so a model without the
    # -1 topic does not raise KeyError. Rows are collected in a list and the
    # frame is built once, because DataFrame.append was removed in pandas 2.0.
    rows = [
        {
            'topic': topic_names[topic_id],
            'term': entry[0],
            'weight': round(entry[1], 6),
        }
        for topic_id in sorted(topics)
        for entry in topics[topic_id]
    ]
    df_topics = pd.DataFrame(rows, columns=topic_columns)

    df_topics.to_csv(f'../raw_data/BERTopicTerms_{str(number_of_docs)}.csv', header=True, index=False, encoding='utf-8')
    return df_topics.copy()

In [60]:
def correlation_matrix_to_df(df_corr):
    """Convert a wide similarity table into a long (topic1, topic2, similarity) table.

    Args:
        df_corr: DataFrame whose first column holds the row labels and whose
            remaining columns each hold the values for the topic named by the
            column header.

    Returns:
        A DataFrame with columns ``topic1`` (column header), ``topic2`` (row
        label) and ``similarity``, sorted by ``similarity`` in descending order
        with a fresh 0..n-1 index. Every (column, row) pair is kept, including
        self-pairs and both orientations of each pair.
    """
    # Positional access via .iloc: plain ``row[0]`` on a label-indexed Series
    # relies on a deprecated positional fallback.
    row_labels = df_corr.iloc[:, 0].tolist()

    topic1 = []
    topic2 = []
    similarities = []
    for position in range(1, df_corr.shape[1]):
        topic1.extend([df_corr.columns[position]] * len(row_labels))
        topic2.extend(row_labels)
        similarities.extend(df_corr.iloc[:, position].tolist())

    df_res = pd.DataFrame({'topic1': topic1,
                           'topic2': topic2,
                           'similarity': similarities})
    df_res = df_res.sort_values(by='similarity', ascending=False)
    return df_res.reset_index(drop=True)

In [61]:
def generate_topic_similarity(bert_model, number_of_docs):
    """Export pairwise topic similarities as a long table and return it.

    Args:
        bert_model: Object exposing ``topic_sim_matrix`` (indexable as
            ``matrix[i, j]``), ``get_topics()`` and ``topic_names`` (indexable
            by topic id).
        number_of_docs: Suffix used in the output file name
            (``../raw_data/BERTopicSimilarity_<number_of_docs>.csv``).

    Returns:
        A copy of the long-format table produced by
        ``correlation_matrix_to_df``, i.e. one row per (topic1, topic2) pair.
    """
    corr_matrix = bert_model.topic_sim_matrix
    topic_names = bert_model.topic_names

    # Topic ids are taken to run from -1 up to len(topics) - 2.
    number_of_topics = len(bert_model.get_topics()) - 1
    topic_ids = range(-1, number_of_topics)

    topic_columns = ['topic'] + [topic_names[i] for i in topic_ids]

    # NOTE(review): the matrix is indexed directly with topic ids, so id -1
    # addresses the LAST row/column if corr_matrix is a NumPy array - confirm
    # this matches how topic_sim_matrix is ordered.
    rows = []
    for i in topic_ids:
        row = {'topic': topic_names[i]}
        for j in topic_ids:
            row[topic_names[j]] = round(corr_matrix[i, j], 6)
        rows.append(row)

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    df_similarity = pd.DataFrame(rows, columns=topic_columns)

    df_topic_similarity = correlation_matrix_to_df(df_similarity)
    df_topic_similarity.to_csv(f'../raw_data/BERTopicSimilarity_{str(number_of_docs)}.csv', header=True, index=False, encoding='utf-8')
    return df_topic_similarity.copy()

In [62]:
def get_topic_documents(cluster_id, condensed_tree):
    """Collect the children (and their lambda values) of every leaf under a cluster.

    Args:
        cluster_id: Node id of the cluster in the condensed tree. Ids of -1 or
            lower yield empty results.
        condensed_tree: Object exposing ``_raw_tree``, a structured array with
            the fields ``parent``, ``child``, ``lambda_val`` and ``child_size``.

    Returns:
        A tuple ``(children, lambda_vals)``: an ``int64`` array of child ids and
        a ``float64`` array of the matching ``lambda_val`` entries.
    """
    # Both accumulators start with an empty array so the empty case and the
    # final dtype casts behave the same whether or not any leaf is found.
    child_parts = [np.array([])]
    lambda_parts = [np.array([])]

    if cluster_id <= -1:
        return (np.hstack(child_parts).astype(np.int64),
                np.hstack(lambda_parts).astype(np.float64))

    raw_tree = condensed_tree._raw_tree

    # Keep only rows whose child is itself a cluster (more than one point).
    cluster_tree = raw_tree[raw_tree['child_size'] > 1]

    # Leaf clusters below the requested cluster (private hdbscan helper).
    leaf_ids = hdbscan.plots._recurse_leaf_dfs(cluster_tree, cluster_id)

    for leaf_id in leaf_ids:
        # Every row whose parent is this leaf, not only those at the maximum lambda.
        belongs_to_leaf = raw_tree['parent'] == leaf_id
        child_parts.append(raw_tree['child'][belongs_to_leaf])
        lambda_parts.append(raw_tree['lambda_val'][belongs_to_leaf])

    return (np.hstack(child_parts).astype(np.int64),
            np.hstack(lambda_parts).astype(np.float64))

In [63]:
def generate_topic_documents(bert_model, number_of_docs):
    """Export, per selected cluster, the documents found under its leaf clusters.

    Args:
        bert_model: Object exposing ``hdbscan_model`` (with a
            ``condensed_tree_``), ``get_topics()`` and ``topic_names``.
        number_of_docs: Suffix used in the output file name
            (``../raw_data/BERTopicDocuments_<number_of_docs>.csv``).

    Returns:
        A copy of a DataFrame with columns ``topic`` (topic name), ``document``
        (child id from the condensed tree) and ``lambda_val`` (rounded to 6
        decimals).
    """
    clusterer = bert_model.hdbscan_model
    tree = clusterer.condensed_tree_
    clusters = tree._select_clusters()  # private hdbscan API

    relevant_columns = ['topic', 'document', 'lambda_val']

    # When there are as many selected clusters as topics, names are looked up
    # one position earlier (cluster i -> topic_names[i - 1]); otherwise cluster
    # i maps to topic_names[i].
    # NOTE(review): confirm this offset rule against the model's topic ids.
    name_offset = -1 if len(clusters) == len(bert_model.get_topics()) else 0

    rows = []
    for position, cluster_id in enumerate(clusters):
        rel_docs, lambda_vals = get_topic_documents(cluster_id, tree)
        if len(rel_docs) == 0:
            continue
        topic_name = bert_model.topic_names[position + name_offset]
        rows.extend(
            {
                'topic': topic_name,
                'document': document,
                'lambda_val': round(lambda_val, 6),
            }
            for document, lambda_val in zip(rel_docs, lambda_vals)
        )

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # appending row by row re-copies the whole frame each time.
    df_rel_docs = pd.DataFrame(rows, columns=relevant_columns)

    df_rel_docs.to_csv(f'../raw_data/BERTopicDocuments_{str(number_of_docs)}.csv', header=True, index=False, encoding='utf-8')
    return df_rel_docs.copy()

In [64]:
def get_cosine_similarity(feature_vec_1, feature_vec_2):
    """Return the cosine similarity between two vectors as a single scalar.

    Each argument is reshaped into a single-row 2-D array before being passed
    to ``cosine_similarity``; the only entry of the resulting matrix is returned.
    """
    row_1 = feature_vec_1.reshape(1, -1)
    row_2 = feature_vec_2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row_1, row_2)
    return similarity_matrix[0][0]

In [65]:
def generate_documents_similarity(bert_model, docs, number_of_docs):
    """Embed the documents, compute their pairwise cosine similarities, and save them.

    Args:
        bert_model: Object whose ``embedding_model.embedding_model`` exposes
            ``encode(docs)``.
        docs: Documents to embed.
        number_of_docs: Suffix used in both output file names under
            ``../raw_data/`` (one ``.csv``, one ``.npy``).

    Returns:
        The document-by-document cosine similarity matrix.
    """
    backend = bert_model.embedding_model

    # Embed every document, then compare each embedding with every other one.
    doc_vectors = backend.embedding_model.encode(docs)
    doc_sim_matrix = cosine_similarity(doc_vectors, doc_vectors)

    # Persist the same matrix twice: as comma-separated text and as a NumPy binary.
    np.savetxt(
        f'../raw_data/BERTopicDocumentsSimilarity_{str(number_of_docs)}.csv',
        doc_sim_matrix,
        delimiter=',',
    )
    np.save(
        f'../raw_data/BERTopicDocumentsSimilarity_{str(number_of_docs)}.npy',
        doc_sim_matrix,
    )
    return doc_sim_matrix
'''
    sim_columns = ['cosine_similarity', 'document1', 'document2']
    df_sim_docs = pd.DataFrame(columns=sim_columns)
    temp_columns = ['cosine_similarity']
    for i in range(0, len(docs)):
        docs_sim = df_documents_similarity[i]
        df_sim_docs_temp = pd.DataFrame(data = docs_sim, columns=temp_columns)
        df_sim_docs_temp['document1'] = i
        df_sim_docs_temp['document2'] = df_sim_docs_temp.index
        df_sim_docs = df_sim_docs.append(df_sim_docs_temp, ignore_index=True)
    
    #df_sim_docs.to_csv(f'../raw_data/BERTopicDocumentsSimilarity_{str(number_of_docs)}.csv', header=True, index=False, encoding='utf-8')     
    return df_sim_docs.copy()
'''

"\n    sim_columns = ['cosine_similarity', 'document1', 'document2']\n    df_sim_docs = pd.DataFrame(columns=sim_columns)\n    temp_columns = ['cosine_similarity']\n    for i in range(0, len(docs)):\n        docs_sim = df_documents_similarity[i]\n        df_sim_docs_temp = pd.DataFrame(data = docs_sim, columns=temp_columns)\n        df_sim_docs_temp['document1'] = i\n        df_sim_docs_temp['document2'] = df_sim_docs_temp.index\n        df_sim_docs = df_sim_docs.append(df_sim_docs_temp, ignore_index=True)\n    \n    #df_sim_docs.to_csv(f'../raw_data/BERTopicDocumentsSimilarity_{str(number_of_docs)}.csv', header=True, index=False, encoding='utf-8')     \n    return df_sim_docs.copy()\n"

In [66]:
def get_topic_start_end_dates(mode_index, topic_index):
    """Look up the date range and document count of one topic from the saved CSVs.

    Args:
        mode_index: Suffix identifying which set of ``../raw_data/`` CSV files
            (topic info, topic documents, document contents) to read.
        topic_index: Value to match in the ``Topic`` column of the topic info file.

    Returns:
        A tuple ``(start_date, end_date, number_topic_docs)``: the minimum and
        maximum of the ``date`` column over the topic's documents, and the
        number of rows listed for the topic in the documents file.

    Raises:
        IndexError: If no row of the topic info file matches ``topic_index``.
    """
    topic_info = pd.read_csv(f'../raw_data/BERTopicInfo_{str(mode_index)}.csv')
    matching_names = topic_info.loc[topic_info['Topic'] == topic_index, 'Name']
    topic_name = matching_names.values[0]

    topic_docs = pd.read_csv(f'../raw_data/BERTopicDocuments_{str(mode_index)}.csv')
    topic_docs = topic_docs.loc[topic_docs['topic'] == topic_name]

    # NOTE(review): parse_dates=True asks pandas to parse the index, not the
    # 'date' column, so the dates are presumably compared as plain strings -
    # confirm that is intended.
    contents = pd.read_csv(f'../raw_data/BERTopicDocsContent_{str(mode_index)}.csv', parse_dates=True)

    # Document ids are matched against the row index of the contents file.
    in_topic = contents.index.isin(topic_docs['document'].values)
    topic_dates = contents.loc[in_topic, 'date']

    return topic_dates.min(), topic_dates.max(), len(topic_docs)

## Generate data

In [None]:
'''dp1 = DataProcessor(csv_path='../raw_data/', csv_name='articles1')
df1 = dp1.load_dataset()

dp2 = DataProcessor(csv_path='../raw_data/', csv_name='articles2')
df2 = dp2.load_dataset()

dp3 = DataProcessor(csv_path='../raw_data/', csv_name='articles3')
df3 = dp3.load_dataset()
'''

In [None]:
'''df_all = df1.copy()
df_all = df_all.append(df2, ignore_index=True)
df_all = df_all.append(df3, ignore_index=True)
df_all = df_all.sort_values(by=['year', 'month'], ascending=True).reset_index(drop=True)
'''

In [None]:
'''print(df_all.shape)
#df_all[(df_all['year'].isna())].dropna(inplace=True)
df_all.dropna(subset=['year', 'month'], inplace=True)
print(df_all.shape)
df_all
'''

In [None]:
'''df_all = df_all[['id', 'title', 'year', 'month', 'content']].copy()
df_all = df_all[df_all['year'] >= 2015].copy()
df_all
'''

In [None]:
#df_all.to_csv(f'../raw_data/dataset_work.csv', header=True, index=True, encoding='utf-8')

In [None]:
'''import requests
url = 'https://bucketapipython-guadc7haza-uc.a.run.app/data'

params = {'filename': 'dataset_work.csv', 'data': df_all.to_json()}

x = requests.post(url, params=params)
print(x.text)
'''

In [None]:
'''import requests
url = 'https://bucketapipython-guadc7haza-uc.a.run.app/data'
params = {'filename': 'dataset_work', 'extension': 'csv'}
x = requests.get(url, params=params)
x'''

In [71]:
# Upper bound used by the document slice taken in a later cell.
number_of_docs = 3_000

# Load the working dataset through the project's DataProcessor
# (presumably ../raw_data/political_dataset.csv - confirm against DataProcessor).
dp = DataProcessor(csv_path='../raw_data/', csv_name='political_dataset')
df = dp.load_dataset()

In [72]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,title,year,month,content
0,415,36361,2015: Sold Out South Carolina Tea Party Conven...,2015.0,1.0,"MYRTLE BEACH, South Carolina — The South Ca..."
1,417,57593,Narendra Modi Fast Facts,2015.0,1.0,(CNN) Here is a look at the life of India’s P...
2,418,59225,Little Richard Fast Facts,2015.0,1.0,(CNN) Here is a look at the life of ”Archit...
3,420,60219,"Cycling’s marathon man attempts 75,000 miles i...",2015.0,1.0,(CNN) While many people are recovering from a...
4,422,60223,Cops: Georgia police chief on leave after wife...,2015.0,1.0,(CNN) Magazines and websites regularly rank P...


In [82]:
# Take the 'content' column from row 2000 up to (not including) number_of_docs.
docs = df['content'][2000:number_of_docs]

In [87]:
from nltk import word_tokenize
from nltk.corpus import stopwords

# English stop words, held in a set for fast membership tests.
# (Portuguese stop words were once added here as well; currently disabled.)
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Tokenize ``text`` and rejoin, with single spaces, the tokens not in ``stop_words``.

    The membership test is an exact string match against ``stop_words``.
    """
    kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]
    return ' '.join(kept_tokens)


# Strip stop words from every document before topic modelling.
docs = docs.apply(remove_stopwords)

In [88]:
docs.head()

2000    It looks like Kentucky Gov . Matt Bevin ( R ) ...
2001    TPM pleased announce winners Ninth Annual Gold...
2002    Going back post yesterday US murder rate , I w...
2003    As I ’ mentioned number posts years , I believ...
2004    Pundits seem problem getting heads around . Do...
Name: content, dtype: object

In [110]:
from sentence_transformers import SentenceTransformer

# Sentence encoder used for the document embeddings.
# Alternative tried previously: SentenceTransformer("paraphrase-MiniLM-L6-v2")
sentence_model = SentenceTransformer("paraphrase-mpnet-base-v2")

# Topic model over bigrams only, with topics of at least 20 documents.
topic_model = BERTopic(
    embedding_model=sentence_model,
    min_topic_size=20,
    language='english',
    calculate_probabilities=False,
    n_gram_range=(2, 2),
)

# Left as the last expression so the notebook displays the fit result.
topic_model.fit_transform(np.array(docs))

([-1,
  -1,
  1,
  1,
  3,
  10,
  0,
  -1,
  -1,
  -1,
  9,
  -1,
  -1,
  -1,
  -1,
  6,
  -1,
  -1,
  8,
  8,
  1,
  9,
  -1,
  -1,
  8,
  8,
  3,
  3,
  1,
  6,
  -1,
  1,
  1,
  0,
  9,
  -1,
  -1,
  -1,
  3,
  7,
  -1,
  7,
  -1,
  8,
  -1,
  1,
  -1,
  1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  1,
  -1,
  -1,
  -1,
  1,
  1,
  -1,
  -1,
  -1,
  1,
  9,
  1,
  -1,
  0,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  0,
  -1,
  -1,
  1,
  -1,
  0,
  -1,
  -1,
  -1,
  9,
  -1,
  -1,
  -1,
  0,
  9,
  -1,
  1,
  -1,
  10,
  -1,
  -1,
  -1,
  8,
  -1,
  -1,
  6,
  6,
  -1,
  10,
  -1,
  1,
  1,
  1,
  -1,
  0,
  3,
  -1,
  8,
  -1,
  1,
  0,
  0,
  0,
  9,
  -1,
  -1,
  -1,
  1,
  -1,
  -1,
  1,
  8,
  5,
  -1,
  -1,
  1,
  -1,
  6,
  10,
  11,
  -1,
  6,
  5,
  -1,
  1,
  -1,
  0,
  4,
  6,
  9,
  0,
  -1,
  0,
  -1,
  5,
  5,
  0,
  7,
  6,
  -1,
  0,
  0,
  1,
  0,
  1,
  5,
  0,
  0,
  2,
  -1,
  -1,
  -1,
  0,
  -1

In [112]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,378,-1_donald trump_united states_breitbart news_t...
1,0,112,0_islamic state_breitbart london_asylum seeker...
2,1,76,1_gun control_second amendment_background chec...
3,2,58,2_fox news_primary debate_news channel_gop fro...
4,3,55,3_bill clinton_hillary clinton_sexual assault_...
5,4,53,4_ted cruz_donald trump_sen ted_york values
6,5,49,5_academy awards_pinkett smith_jada pinkett_pe...
7,6,42,6_ted cruz_iowa caucus_trump cruz_among evange...
8,7,39,7_bernie sanders_hillary clinton_sanders says_...
9,8,38,8_republican party_schlafly said_billionaires ...


In [107]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,309,-1_donald trump_ted cruz_marco rubio_republica...
1,0,218,0_planned parenthood_breitbart news_zika virus...
2,1,115,1_hillary clinton_bill clinton_bernie sanders_...
3,2,108,2_islamic state_new year eve_breitbart london_...
4,3,81,3_academy awards_pinkett smith_people color_ja...
5,4,49,4_gun control_executive gun_awr hawkins twitte...
6,5,43,5_breitbart news_milo yiannopoulos_january 201...
7,6,30,6_hillary clinton_classified information_priva...
8,7,24,7_state union_nuclear deal_january 13 2016_sta...
9,8,23,8_natural born_born citizen_natural born citiz...


In [94]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,284,-1_ted cruz_marco rubio_republican presidentia...
1,0,240,0_donald trump_united states_planned parenthoo...
2,1,118,1_islamic state_new year_year eve_new year eve
3,2,117,2_hillary clinton_bill clinton_bernie sanders_...
4,3,81,3_academy awards_pinkett smith_jada pinkett sm...
5,4,49,4_gun control_awr hawkins_executive gun_follow...
6,5,35,5_breitbart news_milo yiannopoulos_breitbart t...
7,6,30,6_hillary clinton_private server_secretary sta...
8,7,23,7_state union_nuclear deal_january 13 2016_uni...
9,8,23,8_natural born_born citizen_natural born citiz...


In [95]:
# https://intellica-ai.medium.com/comparison-of-different-word-embeddings-on-text-similarity-a-use-case-in-nlp-e83e08469c1c
from sklearn.metrics.pairwise import cosine_similarity

def get_cosine_similarity(feature_vec_1, feature_vec_2):
    """Return the cosine similarity between two vectors as a single scalar.

    Each argument is reshaped into a single-row 2-D array before being passed
    to ``cosine_similarity``; the only entry of the resulting matrix is returned.
    """
    row_1 = feature_vec_1.reshape(1, -1)
    row_2 = feature_vec_2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row_1, row_2)
    return similarity_matrix[0][0]

In [100]:
%%time
# Try different distances
# Encode the documents with the embedding backend held by the fitted topic model.
emb = topic_model.embedding_model
vec = emb.embedding_model.encode(np.array(docs))


CPU times: user 1min 36s, sys: 7.55 s, total: 1min 43s
Wall time: 18.9 s


In [101]:
vec

array([[-0.05773637,  0.22130534,  0.28068772, ..., -0.02574816,
         0.3718867 ,  0.19233046],
       [ 0.12606996,  0.03008237,  0.06505708, ..., -0.41795808,
         0.01794901,  0.12056918],
       [ 0.38137358, -0.09893577, -0.29961956, ..., -0.36677417,
        -0.29485905, -0.06710646],
       ...,
       [-0.24397522, -0.2544394 , -0.01839405, ...,  0.09266283,
        -0.05090453, -0.06360812],
       [ 0.3810828 , -0.08977187, -0.17027706, ...,  0.32137737,
        -0.14986171,  0.15133472],
       [-0.08440732, -0.14206052, -0.05113106, ...,  0.02968092,
         0.00608663,  0.12419046]], dtype=float32)

In [103]:
doc_sim_matrix = cosine_similarity(vec,vec)

In [96]:
topic_model.embedding_model

<bertopic.backend._sentencetransformers.SentenceTransformerBackend at 0x7f28082acca0>

In [None]:
# NOTE(review): `i` is not assigned at top level by any earlier cell in this
# notebook (only inside functions and in later loops), so this call relies on
# leftover kernel state - confirm which suffix is intended.
df_topics_info = generate_topic_info(topic_model, i)


In [None]:
%%time

# Master switch: the whole generation pass below is skipped unless this is True.
can_execute = False

if can_execute:
    # Rows per window, and how many earlier rows each later window reaches back to include.
    number_of_docs = 3000
    number_of_docs_back = 1500
    number_of_iterations = df.shape[0]//number_of_docs

    start_pos = 0
    end_pos = number_of_docs-1
    for i in range(0, number_of_iterations):
        # On the final iteration, move end_pos to the last row index so the
        # rows left over by the integer division are covered.
        if i == (number_of_iterations-1):
            if end_pos < (df.shape[0]-1):
                end_pos += (df.shape[0]-1) - end_pos

        # Windows after the first also include the previous number_of_docs_back rows.
        # NOTE(review): slices exclude their end position, so the row at end_pos
        # itself is not part of this window - confirm whether that is intended.
        # NOTE(review): the rows are sorted by a 'date' column; confirm the
        # loaded dataset actually provides it.
        if i > 0:
            df_docs = df[(start_pos-number_of_docs_back):end_pos].sort_values(by='date', ascending=True).reset_index(drop=True)
        else:
            df_docs = df[start_pos:end_pos].sort_values(by='date', ascending=True).reset_index(drop=True)

        #if i > 12:    
        docs = df_docs['content'].values
        print('starting transform...')
        # A fresh model per window; no embedding_model is passed here.
        topic_model = BERTopic(min_topic_size=30, language='english', calculate_probabilities=False, n_gram_range=(2,2))
        topic_model.fit_transform(docs)
        #print(len(topics))
        #print(topics)
        #break
        # Persist the fitted model so later cells can reload it by run number.
        topic_model.save('../raw_data/BERTopic_model_2_2_run_'+str(i))
        print('end transform...')

        # Save the window's documents under the same run number as the model.
        df_docs.to_csv(f'../raw_data/BERTopicDocsContent_{str(i)}.csv', header=True, index=False, encoding='utf-8')

        df_topics_info = generate_topic_info(topic_model, i)

        df_terms = generate_terms(topic_model, i)

        # The remaining exports are currently disabled.
        #df_topic_similarity = generate_topic_similarity(topic_model, i)

        #df_topic_documents = generate_topic_documents(topic_model, i)

        #matrix_documents_similarity = generate_documents_similarity(topic_model, docs, i)

        # Advance the window by one full step.
        start_pos += number_of_docs
        end_pos += number_of_docs

        # Drop references to the per-window objects before the next iteration.
        docs = None
        df_docs = None
        df_topics_info = None
        df_terms = None
        df_topic_similarity = None
        df_topic_documents = None
        matrix_documents_similarity = None

        print(f'done {str(i)} of {str(number_of_iterations-1)}')

## Check term

In [None]:
%%time

# Number of saved per-window models to reload (run numbers 0 to 15).
number_of_iterations = 16

# Reload every saved model from ../raw_data, in run order.
models = []
for i in range(0,number_of_iterations):
    topic_model = BERTopic.load('../raw_data/BERTopic_model_2_2_run_'+str(i))
    models.append(topic_model)

In [None]:
%%time

# For every reloaded model, find the topics most similar to each search term
# and record their document count and date range.
top_n_sim = 50
search_terms = ['trump', 'climate change', 'biden', 'hillary']

term_similarity_columns = ['mode_index', 'bert_model', 'search_term', 'topic', 'topic_docs', 'topic_start_date', 'topic_end_date', 'similarity']

# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame per row.
term_similarity_rows = []

for model_ind, topic_model in enumerate(models):
    # Sentinel meaning "no topic looked up yet for this model"; the CSV lookup
    # below is only repeated when the topic id changes.
    topic_ind = -1010
    for search_term in search_terms:
        similar_topics, similarity = topic_model.find_topics(search_term, top_n=top_n_sim)
        for topic_id, score in zip(similar_topics, similarity):
            # Stop at the first score under the threshold (this presumes the
            # scores arrive in descending order - confirm).
            if score < 0.7:
                break
            if topic_ind != topic_id:
                start_date, end_date, number_topic_docs = get_topic_start_end_dates(model_ind, topic_id)
                topic_ind = topic_id

            term_similarity_rows.append({
                'mode_index': model_ind,
                'bert_model': 'BERTopic_model_2_2_run_'+str(model_ind),
                'search_term': search_term,
                'topic': topic_id,
                'topic_docs': number_topic_docs,
                'topic_start_date': start_date,
                'topic_end_date': end_date,
                'similarity': round(score, 6),
            })

df_term_similarity = pd.DataFrame(term_similarity_rows, columns=term_similarity_columns)
df_term_similarity = df_term_similarity.sort_values(by=['search_term', 'topic_start_date'], ascending=True).reset_index(drop=True)
df_term_similarity.to_csv('../raw_data/BERTopicTermModelSimilarity.csv', header=True, index=False, encoding='utf-8')
print(df_term_similarity.shape)

In [None]:
df_term_similarity

In [None]:
# search_terms = ['trump', 'climate change', 'biden', 'hillary']
df_term_similarity[df_term_similarity['search_term']=='climate change']

In [None]:
stopped here

## Validate n_gram

In [None]:
dp = DataProcessor(csv_path='../raw_data/', csv_name='dataset_work')
df = dp.load_dataset()
df = df.sort_values(by=['year', 'month'], ascending=True).reset_index(drop=True)

In [None]:
df_ = df[df['year'] == 2015].copy()
df_

In [None]:
%%time

# Every valid (min_n, max_n) n-gram range with 1 <= min_n <= max_n <= 4.
# Ranges whose lower bound exceeds the upper bound, such as (2, 1), are
# rejected by the underlying vectorizer, so they are not generated.
n_grams = [(min_n, max_n) for min_n in range(1, 5) for max_n in range(min_n, 5)]

# The document set is the same for every n-gram range, so prepare it once.
df_docs = df_.sort_values(by=['year', 'month'], ascending=True).reset_index(drop=True)
docs = df_docs['content'].values

for n_gram in n_grams:
    n_gram_txt = f'{str(n_gram[0])}_{str(n_gram[1])}'

    print('starting transform...')
    topic_model = BERTopic(min_topic_size=30, language='english', calculate_probabilities=False, n_gram_range=n_gram)
    topic_model.fit_transform(docs)

    # Persist the model and its inputs under the n-gram label.
    topic_model.save('../raw_data/BERTopic_model_'+str(n_gram_txt))
    print('end transform...')

    df_docs.to_csv(f'../raw_data/BERTopicDocsContent_{n_gram_txt}.csv', header=True, index=False, encoding='utf-8')

    df_topics_info = generate_topic_info(topic_model, n_gram_txt)

    df_terms = generate_terms(topic_model, n_gram_txt)

    # Release the per-run tables before the next fit.
    del df_topics_info
    del df_terms

    print(f'done n_gram: {n_gram_txt}')

# As before, the working document set does not outlive the sweep.
del docs
del df_docs

In [None]:
# Collect the first row of each n-gram run's topic-info file (presumably the
# -1 outlier topic - confirm) and tag it with the n-gram label.
first_rows = []
for n_gram in n_grams:
    n_gram_txt = f'{str(n_gram[0])}_{str(n_gram[1])}'
    df_topic_info = pd.read_csv(f'../raw_data/BERTopicInfo_{n_gram_txt}.csv')
    first_row = df_topic_info.head(1).copy()
    first_row['n_gram'] = n_gram_txt
    first_rows.append(first_row)

# One concat at the end replaces DataFrame.append, which was removed in pandas 2.0.
df_topic_nones = pd.concat(first_rows, ignore_index=True)

df_topic_nones.to_csv('../raw_data/BERTopicResult_n_gram.csv', header=True, index=False, encoding='utf-8')
df_topic_nones

In [None]:
df_topic_nones = pd.read_csv('../raw_data/BERTopicResult_n_gram.csv')
df_topic_nones