In [1]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%run -i "../util/lang_utils.ipynb"

In [3]:
bbc_df = pd.read_csv("../data/bbc-text.csv")
print(bbc_df)

           category                                               text
0              tech  tv future in the hands of viewers with home th...
1          business  worldcom boss  left books alone  former worldc...
2             sport  tigers wary of farrell  gamble  leicester say ...
3             sport  yeading face newcastle in fa cup premiership s...
4     entertainment  ocean s twelve raids box office ocean s twelve...
...             ...                                                ...
2220       business  cars pull down us retail figures us retail sal...
2221       politics  kilroy unveils immigration policy ex-chatshow ...
2222  entertainment  rem announce new glasgow concert us band rem h...
2223       politics  how political squabbles snowball it s become c...
2224          sport  souness delight at euro progress boss graeme s...

[2225 rows x 2 columns]


In [4]:
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(bbc_df["text"], convert_to_tensor=True)



In [5]:
clusters = util.community_detection(embeddings, threshold=0.7, min_community_size=10)
print(clusters)

[[1553, 2059, 192, 1208, 493, 827, 1594, 1082, 516, 1938, 1650, 530, 883, 638, 1359, 1154, 2152, 117, 1257, 1898, 168], [178, 1813, 76, 290, 1810, 518, 337, 1172, 1242, 1151, 1057, 1981, 755, 923, 1942, 1560, 497, 1882, 1105], [150, 1645, 1636, 503, 281, 1940, 1633, 758, 1971, 376, 1405, 1156, 900, 1946], [1824, 1014, 2024, 1440, 1018, 565, 389, 1917, 1588, 399, 1259, 791, 1288], [1004, 901, 1621, 1580, 1499, 1751, 1037, 1323, 1534, 2178, 373, 1041], [1244, 42, 2128, 1063, 1597, 2104, 1292, 1915, 959, 2081, 1304], [767, 787, 186, 1625, 1651, 193, 1171, 2148, 1797, 1284], [1476, 2129, 388, 134, 1069, 682, 1680, 2186, 2198, 2106]]


In [8]:
from collections import Counter

def get_most_frequent_words(text, top_n=20):
    # Lowercase
    text = text.lower()

    # Remove punctuation and numbers
    text = re.sub(r"[^a-z\s]", "", text)

    # Tokenize
    words = text.split()

    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    words = [w for w in words if w not in stop_words and len(w) > 2]

    # Count frequency
    counter = Counter(words)

    return counter.most_common(top_n)


In [9]:
def print_words_by_cluster(clusters, input_df):
    for i, cluster in enumerate(clusters):
        print(f"\nCluster {i+1}, {len(cluster)} elements ")
        sentences = input_df.iloc[cluster]["text"]
        all_text = " ".join(sentences)
        freq_words = get_most_frequent_words(all_text)
        print(freq_words)

In [10]:
print_words_by_cluster(clusters, bbc_df)


Cluster 1, 21 elements 
[('labour', 139), ('brown', 136), ('blair', 125), ('said', 125), ('election', 87), ('minister', 72), ('prime', 68), ('chancellor', 67), ('would', 65), ('party', 53), ('new', 46), ('campaign', 45), ('told', 41), ('book', 35), ('government', 35), ('gordon', 33), ('next', 32), ('claims', 32), ('tony', 29), ('milburn', 29)]

Cluster 2, 19 elements 
[('yukos', 175), ('said', 79), ('russian', 78), ('oil', 73), ('court', 55), ('gazprom', 55), ('rosneft', 50), ('russia', 47), ('yugansk', 46), ('company', 45), ('bankruptcy', 44), ('auction', 43), ('firm', 41), ('unit', 40), ('sale', 40), ('khodorkovsky', 33), ('government', 32), ('tax', 32), ('baikal', 30), ('yuganskneftegas', 29)]

Cluster 3, 14 elements 
[('kenteris', 59), ('greek', 52), ('thanou', 51), ('iaaf', 48), ('said', 39), ('athens', 35), ('tests', 34), ('drugs', 28), ('olympics', 28), ('charges', 25), ('also', 24), ('decision', 24), ('test', 24), ('athletics', 22), ('missing', 22), ('tribunal', 22), ('sprinte