In [1]:
from bertopic import BERTopic
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer
import pandas as pd
from pathlib import Path
import hopsworks

# Special thanks to Maarten Grootendorst https://maartengr.github.io/BERTopic/index.html
# Even though I had already made a working version of the same architecture before finding this library,
# it allowed for easy visualizations and allows for neat code that I don't have to maintain myself

In [2]:
USE_HOPSWORKS = True

if USE_HOPSWORKS:
    # Get data from feature store
    project = hopsworks.login()
    feature_store = project.get_feature_store()
    article_feature_group = feature_store.get_feature_group(name="articles_daily_cleaned", version=1)
    data = article_feature_group.read()
else:
    data_path = ( Path.cwd() / "data/RoundedCleanedArticles.csv").resolve()
    data = pd.read_csv(data_path, encoding='utf-8')

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/5270




Connected. Call `.close()` to terminate connection gracefully.
2022-12-25 10:07:21,025 INFO: USE `scalablemltask1_featurestore`
2022-12-25 10:07:21,534 INFO: SELECT `fg0`.`title` `title`, `fg0`.`url` `url`, `fg0`.`publishedat` `publishedat`, `fg0`.`title_stance` `title_stance`, `fg0`.`title_topic` `title_topic`
FROM `scalablemltask1_featurestore`.`articles_daily_cleaned_1` `fg0`


In [3]:
sentence_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
embeddings = sentence_model.encode(data['title_topic'])

cluster_model = AgglomerativeClustering(linkage='ward', distance_threshold=1.5, n_clusters=None)
topic_model = BERTopic(hdbscan_model=cluster_model).fit(data['title_topic'], embeddings)
topics, probs = topic_model.fit_transform(data['title_topic'])

topic_labels = topic_model.generate_topic_labels(nr_words=3,
                                                 topic_prefix=False,
                                                 word_length=15,
                                                 separator=", ")
topic_model.set_topic_labels(topic_labels)
# topic_model.save('topic_model')

topic_model.get_topic_info()

2022-12-25 10:07:23,204 INFO: Load pretrained SentenceTransformer: paraphrase-multilingual-mpnet-base-v2
2022-12-25 10:07:25,874 INFO: Use pytorch device: cpu


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2022-12-25 10:07:31,186 INFO: Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2022-12-25 10:07:31,404 INFO: Use pytorch device: cpu


Unnamed: 0,Topic,Count,Name,CustomName
0,0,11,0_winter_storm_snow_cnn,"winter, storm, snow"
1,1,8,1_arizona_lake_kari_election,"arizona, lake, kari"
2,2,7,2_nfl_16_week_vs,"nfl, 16, week"
3,3,7,3_bbc_english_space_strikes,"bbc, english, space"
4,4,6,4_risk_verge_250_mass,"risk, verge, 250"
5,5,4,5_suspension_bauer_trevor_espn,"suspension, bauer, trevor"
6,6,3,6_women_taliban_lanez_support,"women, taliban, lanez"
7,7,3,7_yahoo_voices_entertainment_fascinated,"yahoo, voices, entertainment"


In [4]:
topic_labels_series = pd.Series(topic_labels)
docs_topic = topic_labels_series[topics].tolist()
data['predicted_topic'] = docs_topic
display(data)

if USE_HOPSWORKS:
    # Save result to feature_store
    article_cleaned_feature_store = feature_store.get_or_create_feature_group(
        name="articles_topic",
        version=1,
        primary_key=["url"],
        description="Articles with predicted topic")
    article_cleaned_feature_store.insert(data, write_options={"wait_for_job" : False})
else:
    data_path = ( Path.cwd() / "data/RoundedWithTopic.csv").resolve()
    data.to_csv(data_path, encoding='utf-8', index=False)

Unnamed: 0,title,url,publishedat,title_stance,title_topic,predicted_topic
0,Taliban minister defends closing universities ...,https://www.theguardian.com/world/2022/dec/23/...,2022-12-23T01:50:00Z,taliban minister defends closing universities ...,taliban minister defends closing universities ...,"women, taliban, lanez"
1,Russia considers ways to return space crew aft...,https://www.aljazeera.com/news/2022/12/23/russ...,2022-12-23T03:59:49Z,russia considers ways to return space crew aft...,russia considers ways return space crew capsul...,"bbc, english, space"
2,What could power grid 'rotating outages' look ...,https://abc6onyourside.com/news/local/power-el...,2022-12-24T22:44:53Z,what could power grid rotating outages look li...,could power grid rotating outages look like ab...,"winter, storm, snow"
3,A powerful winter storm claims at least 22 liv...,https://www.cnn.com/2022/12/24/weather/christm...,2022-12-25T02:54:00Z,a powerful winter storm claims at least 22 liv...,powerful winter storm claims least 22 lives ac...,"winter, storm, snow"
4,James Webb telescope: Amazing images show the ...,https://news.yahoo.com/james-webb-telescope-am...,2022-12-25T00:22:00Z,james webb telescope amazing images show the u...,james webb telescope amazing images show unive...,"yahoo, voices, entertainment"
5,Elon Musk Warns Against Margin Debt on Risk of...,https://finance.yahoo.com/news/elon-musk-warns...,2022-12-24T23:51:44Z,elon musk warns against margin debt on risk of...,elon musk warns margin debt risk market mass p...,"risk, verge, 250"
6,Jan. 6 House committee releases final report o...,https://www.cnbc.com/2022/12/22/trump-capitol-...,2022-12-23T02:48:00Z,jan 6 house committee releases final report on...,jan 6 house committee releases final report tr...,"risk, verge, 250"
7,NASA Explores a Winter Wonderland on Mars – Ot...,https://scitechdaily.com/nasa-explores-a-winte...,2022-12-24T22:25:39Z,nasa explores a winter wonderland on mars othe...,nasa explores winter wonderland mars otherworl...,"winter, storm, snow"
8,Dodgers' Trevor Bauer reinstated after suspens...,https://www.espn.com/mlb/story/_/id/35306975/d...,2022-12-23T04:24:35Z,dodgers trevor bauer reinstated after suspensi...,dodgers trevor bauer reinstated suspension cut...,"suspension, bauer, trevor"
9,Paris shooting: Aftermath of violent unrest fo...,https://www.bbc.com/news/av/world-europe-64087161,2022-12-24T22:09:40Z,paris shooting aftermath of violent unrest fol...,paris shooting aftermath violent unrest follow...,"bbc, english, space"


Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/5270/fs/5190/fg/9576


Uploading Dataframe: 0.00% |          | Rows 0/49 | Elapsed Time: 00:00 | Remaining Time: ?

Launching offline feature group backfill job...
Backfill Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/5270/jobs/named/articles_topic_1_offline_fg_backfill/executions


In [7]:
topic_model.visualize_documents(data['title_topic'])

Batches:   0%|          | 0/2 [00:00<?, ?it/s]


distutils Version classes are deprecated. Use packaging.version instead.



In [16]:
topic_model.visualize_hierarchy(hierarchical_topics=topic_model.hierarchical_topics(data['title_topic']))

100%|██████████| 7/7 [00:00<00:00, 420.26it/s]

scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


distutils Version classes are deprecated. Use packaging.version instead.



In [17]:
topic_model.visualize_barchart()


distutils Version classes are deprecated. Use packaging.version instead.



In [21]:
topic_model.visualize_heatmap()


distutils Version classes are deprecated. Use packaging.version instead.

