In [1]:
import re
import string
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%run -i "../util/lang_utils.ipynb"

In [3]:
bbc_df = pd.read_csv("../data/bbc-text.csv")
print(bbc_df)

           category                                               text
0              tech  tv future in the hands of viewers with home th...
1          business  worldcom boss  left books alone  former worldc...
2             sport  tigers wary of farrell  gamble  leicester say ...
3             sport  yeading face newcastle in fa cup premiership s...
4     entertainment  ocean s twelve raids box office ocean s twelve...
...             ...                                                ...
2220       business  cars pull down us retail figures us retail sal...
2221       politics  kilroy unveils immigration policy ex-chatshow ...
2222  entertainment  rem announce new glasgow concert us band rem h...
2223       politics  how political squabbles snowball it s become c...
2224          sport  souness delight at euro progress boss graeme s...

[2225 rows x 2 columns]


In [4]:
bbc_train, bbc_test = train_test_split(bbc_df, test_size=0.1)
print(len(bbc_train))
print(len(bbc_test))

2002
223


In [5]:
documents = bbc_train['text'].values
model = SentenceTransformer('all-MiniLM-L6-v2')
encoded_data = model.encode(documents)
km = KMeans(n_clusters=5, n_init='auto', init='k-means++')
km.fit(encoded_data)



0,1,2
,n_clusters,5
,init,'k-means++'
,n_init,'auto'
,max_iter,300
,tol,0.0001
,verbose,0
,random_state,
,copy_x,True
,algorithm,'lloyd'


In [7]:
from collections import Counter
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def print_most_common_words_by_cluster(documents, kmeans_model, top_n=10):
    labels = kmeans_model.labels_

    for cluster_id in set(labels):
        print(f"\nCluster {cluster_id}")

        cluster_docs = [doc for doc, label in zip(documents, labels) if label == cluster_id]
        all_text = " ".join(cluster_docs)

        # Clean text
        all_text = all_text.lower()
        all_text = re.sub(r"[^a-z\s]", "", all_text)

        # Remove stopwords
        words = [w for w in all_text.split() if w not in ENGLISH_STOP_WORDS and len(w) > 2]

        # Count frequencies
        freq = Counter(words).most_common(top_n)

        print(freq)


In [8]:
print_most_common_words_by_cluster(documents, km, 5)


Cluster 0
[('said', 1993), ('government', 649), ('labour', 639), ('people', 563), ('blair', 511)]

Cluster 1
[('said', 1513), ('year', 574), ('company', 390), ('new', 382), ('market', 377)]

Cluster 2
[('said', 714), ('film', 711), ('best', 591), ('year', 350), ('music', 349)]

Cluster 3
[('said', 866), ('game', 446), ('england', 417), ('win', 368), ('world', 365)]

Cluster 4
[('said', 1489), ('people', 885), ('new', 481), ('technology', 471), ('mobile', 421)]


In [9]:
bbc_test["prediction"] = bbc_test["text"].apply(lambda x: km.predict(model.encode([x]))[0])
print(bbc_test)

           category                                               text  \
817        politics  no more concessions  on terror charles clarke ...   
1050  entertainment  tough schedule delays elliot show preview perf...   
1388  entertainment  jungle tv show ratings drop by 4m the finale o...   
1316          sport  hearts 2-1 livingston hearts wrapped up their ...   
1803  entertainment  super size me wins writers  award super size m...   
...             ...                                                ...   
43            sport  disappointed scott in solid start allan scott ...   
1143       politics  visa row mandarin made sir john the top civil ...   
1107       politics  blair stresses prosperity goals tony blair say...   
888   entertainment  aaliyah claim dismissed by court late r&b star...   
151            tech  slim playstation triples sales sony playstatio...   

      prediction  
817            0  
1050           2  
1388           2  
1316           3  
1803           2

In [10]:
topic_mapping = {0:"tech", 1:"sport", 2:"entertainment", 3:"politics", 4:"business"}
bbc_test["pred_category"] = bbc_test["prediction"].apply(lambda x: topic_mapping[x])
print(classification_report(bbc_test["category"], bbc_test["pred_category"]))

               precision    recall  f1-score   support

     business       0.00      0.00      0.00        43
entertainment       0.94      1.00      0.97        34
     politics       0.00      0.00      0.00        47
        sport       0.00      0.00      0.00        55
         tech       0.00      0.00      0.00        44

     accuracy                           0.15       223
    macro avg       0.19      0.20      0.19       223
 weighted avg       0.14      0.15      0.15       223



In [11]:
new_example = """Manchester United players slumped to the turf 
at full-time in Germany on Tuesday in acknowledgement of what their 
latest pedestrian first-half display had cost them. The 3-2 loss at 
RB Leipzig means United will not be one of the 16 teams in the draw 
for the knockout stages of the Champions League. And this is not the 
only price for failure. The damage will be felt in the accounts, in 
the dealings they have with current and potentially future players 
and in the faith the fans have placed in manager Ole Gunnar Solskjaer. 
With Paul Pogba's agent angling for a move for his client and ex-United 
defender Phil Neville speaking of a "witchhunt" against his former team-mate 
Solskjaer, BBC Sport looks at the ramifications and reaction to a big loss for United."""

In [12]:
predictions = km.predict(model.encode([new_example]))
print(predictions[0])

3
