In [1]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%run -i "../util/lang_utils.ipynb"

In [3]:
stop_words = stopwords.words('english')
stop_words.append("said")
stop_words.append("mr")
bbc_df = pd.read_csv("../data/bbc-text.csv")

In [4]:
bbc_df["text"] = bbc_df["text"].apply(lambda x: word_tokenize(x))
bbc_df["text"] = bbc_df["text"].apply(lambda x: [w for w in x if w not in stop_words])
bbc_df["text"] = bbc_df["text"].apply(lambda x: " ".join(x))

In [5]:
from sklearn.model_selection import train_test_split


In [6]:
bbc_train, bbc_test = train_test_split(bbc_df, test_size=0.1)
print(len(bbc_train))
print(len(bbc_test))

2002
223


In [7]:
docs = bbc_train["text"].values

In [8]:
topic_model = BERTopic(nr_topics=6)
topics, probs = topic_model.fit_transform(docs)

In [9]:
print(topic_model.get_topic_info())

   Topic  Count                              Name  \
0     -1    382   -1_would_people_government_also   
1      0    463          0_england_game_win_first   
2      1    345          1_film_best_music_awards   
3      2    318          2_us_year_growth_company   
4      3    298  3_people_mobile_technology_games   
5      4    196     4_labour_would_election_party   

                                      Representation  \
0  [would, people, government, also, new, year, u...   
1  [england, game, win, first, club, world, cup, ...   
2  [film, best, music, awards, show, year, award,...   
3  [us, year, growth, company, oil, economy, mark...   
4  [people, mobile, technology, games, digital, m...   
5  [labour, would, election, party, blair, govern...   

                                 Representative_Docs  
0  [blair backs pre-election budget tony blair ba...  
1  [year remember irish used one subliminal momen...  
2  [scissor sisters triumph brits us band scissor...  
3  [battered do

In [10]:
print(topic_model.get_topic(0))

[('england', 0.023882879830085108), ('game', 0.022824291952794805), ('win', 0.021422342095741027), ('first', 0.01881690915579156), ('club', 0.01842068028302617), ('world', 0.01729838849979552), ('cup', 0.017288295887117396), ('last', 0.01677588343337484), ('players', 0.016386487687859514), ('two', 0.016129300776117832)]


In [11]:
print(topic_model.get_topic(1))

[('film', 0.04745498817893915), ('best', 0.03934320902746103), ('music', 0.025786629436500442), ('awards', 0.022804999411063932), ('show', 0.0220996115801044), ('year', 0.020093915306918897), ('award', 0.019662224688114443), ('also', 0.01950071915962781), ('us', 0.01898980147684509), ('one', 0.018728960364724132)]


In [12]:
print(topic_model.get_topic(2))

[('us', 0.03442785652146797), ('year', 0.02285801104946136), ('growth', 0.020083672012198246), ('company', 0.01927454953163546), ('oil', 0.018494142807536872), ('economy', 0.018430934948056105), ('market', 0.017972873698366133), ('economic', 0.01640454047459571), ('also', 0.016050564522382082), ('firm', 0.015960979404641)]


In [13]:
print(topic_model.get_topic(3))

[('people', 0.030169913063230375), ('mobile', 0.02436720294319817), ('technology', 0.02196414658665488), ('games', 0.020674472434570275), ('digital', 0.018445582384291344), ('music', 0.018081527661181953), ('software', 0.018060108655872854), ('users', 0.01794705313066185), ('one', 0.017469592810325858), ('microsoft', 0.016636415635247488)]


In [14]:
print(topic_model.get_topic(4))

[('labour', 0.04642588696549814), ('would', 0.040303135646170725), ('election', 0.03670207291976916), ('party', 0.035440804065683146), ('blair', 0.034108498926513), ('government', 0.03300809240953809), ('brown', 0.024968597565362342), ('people', 0.023719549728019693), ('minister', 0.023108783090207228), ('prime', 0.020058421585192977)]


In [15]:
topic_model.generate_topic_labels(nr_words=5, topic_prefix=True, separator='_')

['-1_would_people_government_also_new',
 '0_england_game_win_first_club',
 '1_film_best_music_awards_show',
 '2_us_year_growth_company_oil',
 '3_people_mobile_technology_games_digital',
 '4_labour_would_election_party_blair']

In [16]:
def get_prediction(input_text, model):
    pred = model.transform(input_text)
    pred = pred[0][0]
    return pred

In [17]:
bbc_test["prediction"] = bbc_test["text"].apply(lambda x: get_prediction(x, topic_model))
topic_mapping = {0:"sport", 1:"politics", 2:"entertainment", 3:"tech", 4:"business", -1:"discard"}

In [18]:
bbc_test["pred_category"] = bbc_test["prediction"].apply(lambda x: topic_mapping[x])
test_data = bbc_test.loc[bbc_test['prediction'] != -1]
print(classification_report(test_data["category"], test_data["pred_category"]))

               precision    recall  f1-score   support

     business       0.00      0.00      0.00        18
entertainment       0.00      0.00      0.00        32
     politics       0.00      0.00      0.00        12
        sport       0.97      1.00      0.98        60
         tech       0.95      0.95      0.95        21

     accuracy                           0.56       143
    macro avg       0.38      0.39      0.39       143
 weighted avg       0.55      0.56      0.55       143



In [19]:
new_input = bbc_test["text"].iloc[0]
print(new_input)

us top supercomputing charts us pushed japan top supercomputing chart ibm prototype blue gene/l machine . assembled lawrence livermore national laboratory us department energy . ibm test results show blue gene/l managed speeds 70.72 teraflops . previous top machine japan nec earth simulator clocked 35.86. top 500 list announced monday officially charts fastest computers world . announced every six months worked using officially recognised mathematical speed test called linpack measures calculations per second . completed 2005 blue gene/l powerful current prototype . next year final blue gene four times year going real step hard beat erich strohmaier one co-founders top500 list . help scientists work safety security reliability requirements us nuclear weapons stockpile without need underground nuclear testing . also cut amount heat generated massive power big problem supercomputers . second place silicon graphics columbia supercomputer based us space agency ( nasa ) ames research center

In [20]:
print(topic_model.transform(new_input))

([3], array([0.39134575]))


In [21]:
topics, similarity = topic_model.find_topics("sports", top_n=5)
sim_topics = list(zip(topics, similarity))
print(sim_topics)

[(0, 0.2894483034907077), (3, 0.05359171384613123), (1, -0.01738426625881717), (2, -0.03615969447429008), (-1, -0.044434461546758564)]


In [22]:
topics, similarity = topic_model.find_topics("business and economics", top_n=5)
sim_topics = list(zip(topics, similarity))
print(sim_topics)

[(2, 0.2784268133891955), (-1, 0.18159661798947715), (3, 0.15479618553333446), (4, 0.048951239146058964), (0, 0.009589394053705187)]


In [23]:
input_text = """YouTube removed a snippet of code that publicly disclosed whether a channel receives ad payouts, 
obscuring which creators benefit most from the platform."""
topics, similarity = topic_model.find_topics(input_text, top_n=5)
sim_topics = list(zip(topics, similarity))
print(sim_topics)

[(3, 0.26466095795295136), (1, 0.1491637613404677), (-1, 0.12320145493528459), (2, 0.10905987107405501), (4, 0.07399604595045894)]
