In [1]:
import pandas as pd
import numpy as np
import os
import json
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic


In [2]:
# Input locations: both CSV files live in the second-iteration working directory.
read_path = r"./second_iteration_files"
answers_path = os.path.join(read_path, "answers_table_before_bertopic.csv")
questions_path = os.path.join(read_path, "filtered_questions.csv")

# Load the answers table and the questions table into DataFrames.
df_answers = pd.read_csv(answers_path)
df_questions = pd.read_csv(questions_path)

1. Run BERTopic on the answers

In [3]:
# Inspect the answers table as loaded, before any rows are filtered out.
df_answers

Unnamed: 0.2,Unnamed: 0.1,Answer,followerCount,name,upvoteCount,answerCount,answerViews,Unnamed: 0,question
0,6,This is a square:image from wikimediaWhy am I ...,"9,554 followers",Glenn Luk,6.6K,427,7.9M,6.0,https://www.quora.com/What-are-some-reasons-ot...
1,7,It is more a question of nomenclature than of ...,"3,120 followers",Becca Royal-Gordon,1.1K,4.3K,12M,7.0,https://www.quora.com/What-are-some-reasons-ot...
2,8,"Cray vector Computers through the cray C90, an...","9,921 followers",Brett Bergan,1.3K,7K,82.9M,8.0,https://www.quora.com/What-are-some-reasons-ot...
3,12,Dynamic Programming Practice ProblemsDynamic P...,"224,569 followers",Gayle Laakmann McDowell,1K,1.2K,49.7M,12.0,https://www.quora.com/What-are-some-reasons-ot...
4,13,Dynamic programming is a very specific topic i...,817 followers,Sumeet Raj Thakker,32.1K,0,0,13.0,https://www.quora.com/What-are-some-reasons-ot...
...,...,...,...,...,...,...,...,...,...
1870,12,I had to watch Interstellar two times to figur...,UNKNOWN,Robert Frost,928,9.4K,195.4M,12.0,https://www.quora.com/Why-cant-todays-supercom...
1871,13,I’ll try to answer one question at a time.I’ll...,12 followers,Husain Khambaty,808,106,2M,13.0,https://www.quora.com/Why-cant-todays-supercom...
1872,15,Because it is entirely a legend. China gave st...,UNKNOWN,James A McCoy Jr.,918,234,318.2K,15.0,https://www.quora.com/Why-cant-todays-supercom...
1873,16,“Forced” ? Like putting a gun to an American e...,18 followers,Asim Qureshi,12.7K,996,155.4M,16.0,https://www.quora.com/Why-cant-todays-supercom...


In [4]:
# Keep only the rows whose 'Answer' cell is present (drops NaN answers).
has_answer = df_answers['Answer'].notna()
df_answers = df_answers[has_answer]

In [5]:
# Inspect the answers table again after rows with a missing 'Answer' were dropped.
df_answers

Unnamed: 0.2,Unnamed: 0.1,Answer,followerCount,name,upvoteCount,answerCount,answerViews,Unnamed: 0,question
0,6,This is a square:image from wikimediaWhy am I ...,"9,554 followers",Glenn Luk,6.6K,427,7.9M,6.0,https://www.quora.com/What-are-some-reasons-ot...
1,7,It is more a question of nomenclature than of ...,"3,120 followers",Becca Royal-Gordon,1.1K,4.3K,12M,7.0,https://www.quora.com/What-are-some-reasons-ot...
2,8,"Cray vector Computers through the cray C90, an...","9,921 followers",Brett Bergan,1.3K,7K,82.9M,8.0,https://www.quora.com/What-are-some-reasons-ot...
3,12,Dynamic Programming Practice ProblemsDynamic P...,"224,569 followers",Gayle Laakmann McDowell,1K,1.2K,49.7M,12.0,https://www.quora.com/What-are-some-reasons-ot...
4,13,Dynamic programming is a very specific topic i...,817 followers,Sumeet Raj Thakker,32.1K,0,0,13.0,https://www.quora.com/What-are-some-reasons-ot...
...,...,...,...,...,...,...,...,...,...
1870,12,I had to watch Interstellar two times to figur...,UNKNOWN,Robert Frost,928,9.4K,195.4M,12.0,https://www.quora.com/Why-cant-todays-supercom...
1871,13,I’ll try to answer one question at a time.I’ll...,12 followers,Husain Khambaty,808,106,2M,13.0,https://www.quora.com/Why-cant-todays-supercom...
1872,15,Because it is entirely a legend. China gave st...,UNKNOWN,James A McCoy Jr.,918,234,318.2K,15.0,https://www.quora.com/Why-cant-todays-supercom...
1873,16,“Forced” ? Like putting a gun to an American e...,18 followers,Asim Qureshi,12.7K,996,155.4M,16.0,https://www.quora.com/Why-cant-todays-supercom...


In [6]:
# Select the 'Answer' column: this is the text passed to the topic model below.
ans_text = df_answers['Answer']

In [7]:
# Convert the answer texts to a plain Python list for BERTopic.
# Derived directly from df_answers (instead of `ans_text = ans_text.to_list()`)
# so that re-running this cell is safe: a list has no .to_list(), and the
# self-referential form raised AttributeError on a second execution.
ans_text = df_answers['Answer'].to_list()

In [8]:
# Build the answers topic model: BERTopic is given a CountVectorizer that is
# configured with the built-in English stop-word list.
vectorizer_model = CountVectorizer(stop_words="english")

topic_model = BERTopic(vectorizer_model=vectorizer_model)
# Fit on the list of answer texts; returns the per-document topic assignments
# and probabilities (neither `topics` nor `probs` is used by the cells shown).
# NOTE(review): no random seed is set anywhere in this notebook; presumably the
# topic ids/counts can change between runs -- confirm if reproducibility matters.
topics, probs = topic_model.fit_transform(ans_text)



In [9]:
# Per-topic overview table (columns: Topic, Count, Name).
topic_model.get_topic_info() 

Unnamed: 0,Topic,Count,Name
0,-1,288,-1_people_like_just_moon
1,0,720,0_fusion_energy_nuclear_power
2,1,183,1_cryptocurrency_crypto_money_market
3,2,86,2_iot_internet_devices_things
4,3,62,3_rocket_engines_engine_speed
5,4,51,4_computers_memory_supercomputer_quantum
6,5,45,5_mining_bitcoin_miners_hardware
7,6,42,6_brain_didnt_parents_told
8,7,40,7_china_chinese_trade_american
9,8,37,8_musk_elon_tesla_spacex


In [10]:
# Top terms of topic 0 together with their weights.
topic_model.get_topic(0)

[('fusion', 0.04330904014075987),
 ('energy', 0.027968241972122006),
 ('nuclear', 0.02650575653010653),
 ('power', 0.02472774139936822),
 ('reactor', 0.021314431583450234),
 ('fission', 0.020122772937801063),
 ('reactors', 0.01793406088950095),
 ('fuel', 0.016117834580117667),
 ('plasma', 0.014529775883755787),
 ('reaction', 0.012357160237739166)]

In [11]:
# Copy the fitted answers model's attributes into plain variables; the save and
# export cells below read these variables rather than `topic_model` itself.
# Top-n terms per topic and their respective c-TF-IDF values.
topic_rep_dict = topic_model.topic_representations_

# Mapping of topic id -> label string (e.g. '0_fusion_energy_nuclear_power').
topic_rep_labels = topic_model.topic_labels_
# Representative example documents for each topic.
topic_rep_docs = topic_model.representative_docs_

In [12]:
# Show the topic id -> label mapping of the answers model.
topic_rep_labels

{-1: '-1_people_like_just_moon',
 0: '0_fusion_energy_nuclear_power',
 1: '1_cryptocurrency_crypto_money_market',
 2: '2_iot_internet_devices_things',
 3: '3_rocket_engines_engine_speed',
 4: '4_computers_memory_supercomputer_quantum',
 5: '5_mining_bitcoin_miners_hardware',
 6: '6_brain_didnt_parents_told',
 7: '7_china_chinese_trade_american',
 8: '8_musk_elon_tesla_spacex',
 9: '9_fusion_energy_confinement_magnetic',
 10: '10_youre_net_garagebuild_outif',
 11: '11_sun_interior_300000_want',
 12: '12_india_budget_indian_tax',
 13: '13_space_galaxy_universe_zoom',
 14: '14_iot_data_technology_stack',
 15: '15_remarkable_core_sun_scale',
 16: '16_thanks_question_meanif_gets',
 17: '17_python_dp_programming_language',
 18: '18_invention_going_reality_virtual',
 19: '19_crypto_dont_soif_thatjust',
 20: '20_bitcoin_green_cryptocurrencies_crypto',
 21: '21_mars_earth_speed_travel',
 22: '22_billionaires_oceans_breakthrough_technological'}

In [13]:
pip install --upgrade six

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Defaulting to user installation because normal site-packages is not writeable
--- Logging error ---
Traceback (most recent call last):
  File "/home/ravit/.local/lib/python3.9/site-packages/pip/_internal/utils/logging.py", line 177, in emit
    self.console.print(renderable, overflow="ignore", crop=False, style=style)
  File "/home/ravit/.local/lib/python3.9/site-packages/pip/_vendor/rich/console.py", line 1673, in print
    extend(render(renderable, render_options))
  File "/home/ravit/.local/lib/python3.9/site-packages/pip/_vendor/rich/console.py", line 1305, in render
    for render_output in iter_render:
  File "/home/ravit/.local/lib/python3.9/site-packages/pip/_internal/utils/logging.py", line 134, in 

In [14]:
# Persist the answers-model dictionaries as JSON, one file per dictionary.
for out_name, payload in (
    ('answers_topic_representations_second_iteration', topic_rep_dict),
    ('answers_topic_rep_labels_second_iteration', topic_rep_labels),
):
    with open(out_name, 'w') as f:
        json.dump(payload, f)
 

In [15]:
# Preview the representative documents as a table: after the transpose each
# row is a topic id and each column is one example document.
pd.DataFrame(topic_rep_docs).T

Unnamed: 0,0,1,2
4,"Cray vector Computers through the cray C90, an...","The correct answer is ""finish in ample time"".T...","They are infinitely faster, but they don't exi..."
17,There is a terrific demand for embedded progra...,Electrical Engineering and Computer Science st...,They graduate with a CS degree. Then they eith...
0,Depends on its operating temperature. Between ...,"Well, everything that works “never worked” unt...",I think that Cleanliness is a relative word. F...
5,If you compare with investing in Bitcoin buyin...,Bitcoin mining is a great way to earn cryptocu...,Bitcoin mining is a great way to earn cryptocu...
19,"Simple answer?NO!If you don't know, in depth, ...",It depends on a lot of factors - but if you ha...,"Simple answer?NO!If you don't know, in depth, ..."
14,Business and IT are interrelated to a reasonab...,"Due to our innate fascination with novelty, pe...",The Internet has already changed the world in ...
18,Not specific question - some guys here point t...,I think it's finally going to be robots.A bunc...,"Robotic Process Automation, HyperAutomation, A..."
2,The Sky is the limit.There is no place or fiel...,With the emergence of IoT (Internet of Things)...,The Sky is the limit.There is no place or fiel...
22,If you could create aan AI system that uses ma...,This would have to be in the following field.I...,I’m going to talk about the idea of blue ocean...
16,Thanks for the A2A.2.3.Thanks for the A2A.2.3....,And some that haven't:,Thanks for the A2A. There really isn't anythin...


In [16]:
# Tabulate the representative answer documents, then flip the table so that
# topic ids become the row index.
rep_docs_by_column = pd.DataFrame(topic_rep_docs)
df_rep_docs = rep_docs_by_column.transpose()

In [17]:
# List the topic ids that have representative documents.
# Outliers are excluded from the examples: topic -1 does not appear in the index.
set(df_rep_docs.index)

{0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22}

In [18]:
# Order the rows by topic id (rebinding the name instead of sorting in place).
df_rep_docs = df_rep_docs.sort_index()

In [19]:
# Write the representative answer documents (row index = topic id) to CSV.
df_rep_docs.to_csv('answers_representative_sentences_second_iter.csv')

2. Run BERTopic on the questions


In [20]:
# Inspect the questions table as loaded; the 'question' column holds Quora URLs.
df_questions

Unnamed: 0.1,Unnamed: 0,question
0,6,https://www.quora.com/What-are-some-reasons-ot...
1,7,https://www.quora.com/What-are-some-reasons-ot...
2,8,https://www.quora.com/What-are-some-reasons-ot...
3,12,https://www.quora.com/What-are-some-reasons-ot...
4,13,https://www.quora.com/What-are-some-reasons-ot...
...,...,...
1870,12,https://www.quora.com/Why-cant-todays-supercom...
1871,13,https://www.quora.com/Why-cant-todays-supercom...
1872,15,https://www.quora.com/Why-cant-todays-supercom...
1873,16,https://www.quora.com/Why-cant-todays-supercom...


In [22]:
# Remove rows whose 'question' cell is NaN (this cell filters the question
# column, not the Answer column).
df_questions = df_questions[df_questions['question'].notna()]

In [23]:
# Inspect the questions table again after rows with a missing 'question' were dropped.
df_questions

Unnamed: 0.1,Unnamed: 0,question
0,6,https://www.quora.com/What-are-some-reasons-ot...
1,7,https://www.quora.com/What-are-some-reasons-ot...
2,8,https://www.quora.com/What-are-some-reasons-ot...
3,12,https://www.quora.com/What-are-some-reasons-ot...
4,13,https://www.quora.com/What-are-some-reasons-ot...
...,...,...
1870,12,https://www.quora.com/Why-cant-todays-supercom...
1871,13,https://www.quora.com/Why-cant-todays-supercom...
1872,15,https://www.quora.com/Why-cant-todays-supercom...
1873,16,https://www.quora.com/Why-cant-todays-supercom...


In [24]:
# Select the 'question' column (Quora URLs) for the clean-up step below.
ques_text = df_questions['question']

In [27]:
# Convert the question URLs to a plain Python list.
# Derived directly from df_questions (instead of `ques_text = ques_text.to_list()`)
# so that re-running this cell is safe: a list has no .to_list(), and the
# self-referential form raised AttributeError on a second execution.
ques_text = df_questions['question'].to_list()

In [36]:
# Turn each Quora URL into plain question text: strip the site prefix and
# replace the hyphens of the URL slug with spaces.
# NOTE(review): other path segments are kept as-is (the representative
# questions shown later still start with 'unanswered/') -- confirm this is intended.
new_questions = [
    url.replace('https://www.quora.com/', '').replace('-', ' ')
    for url in ques_text
]


In [38]:
# Build a second BERTopic model for the questions, again with a CountVectorizer
# configured with the built-in English stop-word list.
# NOTE: this rebinds `vectorizer_model` and `topic_model`; from here on these
# names refer to the questions model, not the answers model fitted earlier.
vectorizer_model = CountVectorizer(stop_words="english")

topic_model = BERTopic(vectorizer_model=vectorizer_model)
# Fit on the cleaned question strings produced from the URLs.
topics, probs = topic_model.fit_transform(new_questions)

In [39]:
# Per-topic overview table of the questions model (columns: Topic, Count, Name).
topic_model.get_topic_info() 

Unnamed: 0,Topic,Count,Name
0,-1,48,-1_embedded_systems_non_blockchain
1,0,57,0_generation_gaming_initiatives_screams
2,1,43,1_outer_modify_jet_function
3,2,39,2_technologically_manage_humans_destroy
4,3,37,3_large_scale_production_viable
...,...,...,...
82,81,12,81_stop_destroying_thermonuclear_planet
83,82,12,82_minimal_barriers_product_look
84,83,12,83_friendly_eco_currency_green
85,84,11,84_definition_internet_things_theory


In [44]:
# Top terms of question topic 10 together with their weights.
topic_model.get_topic(10)

[('effectively', 0.31570784821965975),
 ('doing', 0.18661823056472043),
 ('apollo', 0.18661823056472043),
 ('work', 0.1677333853126321),
 ('contribute', 0.09191058637653123),
 ('wendelstein', 0.09191058637653123),
 ('successful', 0.09191058637653123),
 ('regarding', 0.09191058637653123),
 ('humanity', 0.09191058637653123),
 ('germanys', 0.09191058637653123)]

In [45]:
# Copy the fitted questions model's attributes into q_-prefixed variables,
# keeping them separate from the answers' topic_rep_* variables.
# Top-n terms per topic and their respective c-TF-IDF values.
q_topic_rep_dict = topic_model.topic_representations_

# Topic labels of the questions model.
q_topic_rep_labels = topic_model.topic_labels_
# Representative example questions for each topic.
q_topic_rep_docs = topic_model.representative_docs_

In [46]:
# Persist the questions-model dictionaries as JSON, one file per dictionary.
for out_name, payload in (
    ('questions_representations_second_iteration', q_topic_rep_dict),
    ('questions_rep_labels_second_iteration', q_topic_rep_labels),
):
    with open(out_name, 'w') as f:
        json.dump(payload, f)
 

In [47]:
# Preview the representative questions as a table: after the transpose each
# row is a topic id and each column is one example question.
pd.DataFrame(q_topic_rep_docs).T

Unnamed: 0,0,1,2
56,What are some reasons other than cost to not r...,Why do we need a 64bit processor Would it be p...,What are some reasons other than cost to not r...
19,unanswered/What are the pros and cons of using...,unanswered/What is the best way to use solar e...,unanswered/What are the pros and cons of using...
49,unanswered/Can you make money with solar power...,unanswered/Can you make money with solar power...,unanswered/Can you make money with solar power...
28,How can I mine Bitcoin with my car,How can I mine Bitcoin with my car,How can I mine Bitcoin with my car
68,Do you know a way to get free cryptocurrency,Do you know a way to get free cryptocurrency,Do you know a way to get free cryptocurrency
...,...,...,...
74,What do you think the future on Earth will loo...,What do you think the future on Earth will loo...,What do you think the future on Earth will loo...
40,What are some good investment strategies that ...,What are some good investment strategies that ...,What are some good investment strategies that ...
63,In layman s terms what is spatial computing an...,In layman s terms what is spatial computing an...,In layman s terms what is spatial computing an...
2,How much more technologically advanced was the...,How much more technologically advanced was the...,If humans manage not to destroy themselves wha...


In [50]:
# Representative documents of the *questions* model, one row per topic id.
# This must be built from q_topic_rep_docs: using the answers' topic_rep_docs
# here would export the answer examples under the questions file name.
df_q_rep_docs = pd.DataFrame(q_topic_rep_docs).T

In [51]:

# Write df_q_rep_docs to CSV in the current working directory.
df_q_rep_docs.to_csv('questions_representative_sentences_second_iter.csv')