### create a content-based recommender system using SBERT embeddings from article metadata and KMeans clustering to group articles into similar topic categories

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast
from sklearn.cluster import MiniBatchKMeans
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import torch
import os
print("PyTorch version: ",torch.__version__)

np.set_printoptions(threshold=np.inf)
pd.set_option('display.max_colwidth', None)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["OMP_NUM_THREADS"] = "1"  # reduces load



PyTorch version:  2.7.1


In [2]:
news_df = pd.read_csv('news.tsv', sep='\t', header = None)
news_df.columns = ["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]

news_df = news_df.sample(n=10000, random_state=42) #randomly sample 10k rows from the news articles dataset
news_df.reset_index(drop = True, inplace = True)

In [3]:
news_df.info() #10K unique articles with a small fraction of the abstract column having null vlaues

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   news_id            10000 non-null  object
 1   category           10000 non-null  object
 2   subcategory        10000 non-null  object
 3   title              10000 non-null  object
 4   abstract           9470 non-null   object
 5   url                10000 non-null  object
 6   title_entities     9999 non-null   object
 7   abstract_entities  9999 non-null   object
dtypes: object(8)
memory usage: 625.1+ KB


In [4]:
news_df.head()

Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N7433,news,newsus,The 3 best Greek spots in Aurora,Hoodline crunched the numbers to find the top Greek spots around Aurora. Here's a rundown of three of the top places.,https://assets.msn.com/labs/mind/BBWHLdc.html,[],[]
1,N43326,sports,more_sports,Officials investigating stabbing death of Alexander Correctional Institute inmate,,https://assets.msn.com/labs/mind/AAJD8WV.html,"[{""Label"": ""Prison"", ""Type"": ""C"", ""WikidataId"": ""Q40357"", ""Confidence"": 1.0, ""OccurrenceOffsets"": [52], ""SurfaceForms"": [""Correctional Institute""]}]",[]
2,N45745,finance,finance-top-stocks,"SmileDirectClub tanks on California bill, bringing loss since September IPO to nearly 60%",Shares of SmileDirectClub sank to a new all-time low during Monday's trading session after a California bill regulating the dental industry was extended through 2024.,https://assets.msn.com/labs/mind/AAILU2e.html,"[{""Label"": ""SmileDirectClub"", ""Type"": ""N"", ""WikidataId"": ""Q60751931"", ""Confidence"": 1.0, ""OccurrenceOffsets"": [0], ""SurfaceForms"": [""SmileDirectClub""]}, {""Label"": ""Initial public offering"", ""Type"": ""U"", ""WikidataId"": ""Q185142"", ""Confidence"": 1.0, ""OccurrenceOffsets"": [72], ""SurfaceForms"": [""IPO""]}]","[{""Label"": ""SmileDirectClub"", ""Type"": ""N"", ""WikidataId"": ""Q60751931"", ""Confidence"": 1.0, ""OccurrenceOffsets"": [10], ""SurfaceForms"": [""SmileDirectClub""]}]"
3,N5234,travel,traveltips,The world's 50 most reliable airlines,"No one wants to kick off their holiday with delays, cancellations or bad customer service, so if you're keen for your next journey to be smooth sailing or smooth flying, rather travel insurance provider Get Going Travel Insurance has got you covered. They've examined the reliability of more than 100 of the world's airlines, looking at the percentage of cancellations and delays, as well as customer and safety ratings, and ranked the top 50 from worst to best. Here's our lowdown on the results.",https://assets.msn.com/labs/mind/AAJub6N.html,[],"[{""Label"": ""Get Going"", ""Type"": ""N"", ""WikidataId"": ""Q62076002"", ""Confidence"": 1.0, ""OccurrenceOffsets"": [207], ""SurfaceForms"": [""Get Going""]}, {""Label"": ""Travel insurance"", ""Type"": ""U"", ""WikidataId"": ""Q1270407"", ""Confidence"": 1.0, ""OccurrenceOffsets"": [181, 217], ""SurfaceForms"": [""travel insurance"", ""Travel Insurance""]}]"
4,N58530,sports,basketball_ncaa,High school football: Breaking down the Class 3A playoff bracket,"The Class 3A football playoffs begin Friday across the state. Here is a breakdown of the bracket. Player to watch Chase Ricke, Lincoln Christian The senior quarterback has been phenomenal this season. He has led an offense that is averaging 51 points per contest and could lead the Bulldogs to an upset against Heritage Hall in the semifinals. Team to watch Perkins-Tryon The Demons' ...",https://assets.msn.com/labs/mind/BBWAERc.html,"[{""Label"": ""South African Class 3A 4-8-2"", ""Type"": ""V"", ""WikidataId"": ""Q1722959"", ""Confidence"": 1.0, ""OccurrenceOffsets"": [40], ""SurfaceForms"": [""Class 3A""]}]","[{""Label"": ""Heritage Hall School"", ""Type"": ""F"", ""WikidataId"": ""Q5738839"", ""Confidence"": 0.999, ""OccurrenceOffsets"": [311], ""SurfaceForms"": [""Heritage Hall""]}, {""Label"": ""South African Class 3A 4-8-2"", ""Type"": ""V"", ""WikidataId"": ""Q1722959"", ""Confidence"": 1.0, ""OccurrenceOffsets"": [4], ""SurfaceForms"": [""Class 3A""]}]"


### use the sentence transformers library (which uses Pytorch and Hugging Face Transformers under the hood) to get sentence embeddings 

In [5]:
model = SentenceTransformer('all-MiniLM-L6-v2')  # lightweight version


In [6]:
news_df.dropna(inplace = True)
news_df.reset_index(drop = True, inplace = True)

In [7]:
news_df.head()

Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N7433,news,newsus,The 3 best Greek spots in Aurora,Hoodline crunched the numbers to find the top Greek spots around Aurora. Here's a rundown of three of the top places.,https://assets.msn.com/labs/mind/BBWHLdc.html,[],[]
1,N45745,finance,finance-top-stocks,"SmileDirectClub tanks on California bill, bringing loss since September IPO to nearly 60%",Shares of SmileDirectClub sank to a new all-time low during Monday's trading session after a California bill regulating the dental industry was extended through 2024.,https://assets.msn.com/labs/mind/AAILU2e.html,"[{""Label"": ""SmileDirectClub"", ""Type"": ""N"", ""WikidataId"": ""Q60751931"", ""Confidence"": 1.0, ""OccurrenceOffsets"": [0], ""SurfaceForms"": [""SmileDirectClub""]}, {""Label"": ""Initial public offering"", ""Type"": ""U"", ""WikidataId"": ""Q185142"", ""Confidence"": 1.0, ""OccurrenceOffsets"": [72], ""SurfaceForms"": [""IPO""]}]","[{""Label"": ""SmileDirectClub"", ""Type"": ""N"", ""WikidataId"": ""Q60751931"", ""Confidence"": 1.0, ""OccurrenceOffsets"": [10], ""SurfaceForms"": [""SmileDirectClub""]}]"
2,N5234,travel,traveltips,The world's 50 most reliable airlines,"No one wants to kick off their holiday with delays, cancellations or bad customer service, so if you're keen for your next journey to be smooth sailing or smooth flying, rather travel insurance provider Get Going Travel Insurance has got you covered. They've examined the reliability of more than 100 of the world's airlines, looking at the percentage of cancellations and delays, as well as customer and safety ratings, and ranked the top 50 from worst to best. Here's our lowdown on the results.",https://assets.msn.com/labs/mind/AAJub6N.html,[],"[{""Label"": ""Get Going"", ""Type"": ""N"", ""WikidataId"": ""Q62076002"", ""Confidence"": 1.0, ""OccurrenceOffsets"": [207], ""SurfaceForms"": [""Get Going""]}, {""Label"": ""Travel insurance"", ""Type"": ""U"", ""WikidataId"": ""Q1270407"", ""Confidence"": 1.0, ""OccurrenceOffsets"": [181, 217], ""SurfaceForms"": [""travel insurance"", ""Travel Insurance""]}]"
3,N58530,sports,basketball_ncaa,High school football: Breaking down the Class 3A playoff bracket,"The Class 3A football playoffs begin Friday across the state. Here is a breakdown of the bracket. Player to watch Chase Ricke, Lincoln Christian The senior quarterback has been phenomenal this season. He has led an offense that is averaging 51 points per contest and could lead the Bulldogs to an upset against Heritage Hall in the semifinals. Team to watch Perkins-Tryon The Demons' ...",https://assets.msn.com/labs/mind/BBWAERc.html,"[{""Label"": ""South African Class 3A 4-8-2"", ""Type"": ""V"", ""WikidataId"": ""Q1722959"", ""Confidence"": 1.0, ""OccurrenceOffsets"": [40], ""SurfaceForms"": [""Class 3A""]}]","[{""Label"": ""Heritage Hall School"", ""Type"": ""F"", ""WikidataId"": ""Q5738839"", ""Confidence"": 0.999, ""OccurrenceOffsets"": [311], ""SurfaceForms"": [""Heritage Hall""]}, {""Label"": ""South African Class 3A 4-8-2"", ""Type"": ""V"", ""WikidataId"": ""Q1722959"", ""Confidence"": 1.0, ""OccurrenceOffsets"": [4], ""SurfaceForms"": [""Class 3A""]}]"
4,N17133,news,newspolitics,"At a UFC event, Trump receives second round of boos in a week",Trump can't seem to escape boos at sporting events.,https://assets.msn.com/labs/mind/AAJMx3j.html,"[{""Label"": ""Donald Trump"", ""Type"": ""P"", ""WikidataId"": ""Q22686"", ""Confidence"": 0.985, ""OccurrenceOffsets"": [16], ""SurfaceForms"": [""Trump""]}, {""Label"": ""Ultimate Fighting Championship"", ""Type"": ""O"", ""WikidataId"": ""Q186471"", ""Confidence"": 0.999, ""OccurrenceOffsets"": [5], ""SurfaceForms"": [""UFC""]}]","[{""Label"": ""Donald Trump"", ""Type"": ""P"", ""WikidataId"": ""Q22686"", ""Confidence"": 0.985, ""OccurrenceOffsets"": [0], ""SurfaceForms"": [""Trump""]}]"


In [8]:
#create helper function to extract the labels and surface forms from the title_entities and abstract columns
def extract_entities(ent_list: str) -> list:
    master_list = []
    for item in ast.literal_eval(ent_list):
        master_list.append(item["Label"])
        if item["SurfaceForms"]:
            master_list.append(item["SurfaceForms"][0])
    
    return list(set(master_list))

In [9]:
news_df['title_entities'] = news_df['title_entities'].apply(extract_entities)
news_df['abstract_entities'] = news_df['abstract_entities'].apply(extract_entities)


In [10]:
news_df.head()

Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N7433,news,newsus,The 3 best Greek spots in Aurora,Hoodline crunched the numbers to find the top Greek spots around Aurora. Here's a rundown of three of the top places.,https://assets.msn.com/labs/mind/BBWHLdc.html,[],[]
1,N45745,finance,finance-top-stocks,"SmileDirectClub tanks on California bill, bringing loss since September IPO to nearly 60%",Shares of SmileDirectClub sank to a new all-time low during Monday's trading session after a California bill regulating the dental industry was extended through 2024.,https://assets.msn.com/labs/mind/AAILU2e.html,"[Initial public offering, SmileDirectClub, IPO]",[SmileDirectClub]
2,N5234,travel,traveltips,The world's 50 most reliable airlines,"No one wants to kick off their holiday with delays, cancellations or bad customer service, so if you're keen for your next journey to be smooth sailing or smooth flying, rather travel insurance provider Get Going Travel Insurance has got you covered. They've examined the reliability of more than 100 of the world's airlines, looking at the percentage of cancellations and delays, as well as customer and safety ratings, and ranked the top 50 from worst to best. Here's our lowdown on the results.",https://assets.msn.com/labs/mind/AAJub6N.html,[],"[Travel insurance, travel insurance, Get Going]"
3,N58530,sports,basketball_ncaa,High school football: Breaking down the Class 3A playoff bracket,"The Class 3A football playoffs begin Friday across the state. Here is a breakdown of the bracket. Player to watch Chase Ricke, Lincoln Christian The senior quarterback has been phenomenal this season. He has led an offense that is averaging 51 points per contest and could lead the Bulldogs to an upset against Heritage Hall in the semifinals. Team to watch Perkins-Tryon The Demons' ...",https://assets.msn.com/labs/mind/BBWAERc.html,"[South African Class 3A 4-8-2, Class 3A]","[Heritage Hall, Heritage Hall School, South African Class 3A 4-8-2, Class 3A]"
4,N17133,news,newspolitics,"At a UFC event, Trump receives second round of boos in a week",Trump can't seem to escape boos at sporting events.,https://assets.msn.com/labs/mind/AAJMx3j.html,"[Donald Trump, Ultimate Fighting Championship, Trump, UFC]","[Donald Trump, Trump]"


In [11]:
news_df['text'] = 'Category: ' + news_df['category'] + " Subcategory: " + news_df['subcategory'] + ' Title: ' + news_df['title'] + ' Abstract: ' + news_df['abstract'] + ' Title Entities: ' + news_df['title_entities'].astype(str) + ' Abstract Entities: ' + news_df['abstract_entities'].astype(str)

In [12]:
news_df['text'][0]

"Category: news Subcategory: newsus Title: The 3 best Greek spots in Aurora Abstract: Hoodline crunched the numbers to find the top Greek spots around Aurora. Here's a rundown of three of the top places. Title Entities: [] Abstract Entities: []"

In [13]:
embeddings = model.encode(news_df['text'].tolist(), show_progress_bar=True)


Batches:   0%|          | 0/296 [00:00<?, ?it/s]

In [14]:
news_df['embeddings'] = list(embeddings)

X = np.vstack(news_df['embeddings'].values)  # if stored in df
# Or just use `X = embeddings` if it's already a list of vectors
print(X.shape)
# Choose number of clusters 
num_clusters = 10
kmeans = MiniBatchKMeans(n_clusters=num_clusters, n_init=10, random_state=42)
news_df['cluster'] = kmeans.fit_predict(X)


(9469, 384)


In [15]:
news_df.head()

Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities,text,embeddings,cluster
0,N7433,news,newsus,The 3 best Greek spots in Aurora,Hoodline crunched the numbers to find the top Greek spots around Aurora. Here's a rundown of three of the top places.,https://assets.msn.com/labs/mind/BBWHLdc.html,[],[],Category: news Subcategory: newsus Title: The 3 best Greek spots in Aurora Abstract: Hoodline crunched the numbers to find the top Greek spots around Aurora. Here's a rundown of three of the top places. Title Entities: [] Abstract Entities: [],"[-0.0028714268, -0.012406576, -0.010510382, -0.00033832458, 0.01477537, 0.003346398, 0.04620887, -0.047169324, 0.027122928, -0.025972482, -0.05574391, -0.012776277, -0.048546232, 0.017060187, 0.036839932, -0.016281392, 0.016000574, 0.0031876867, -0.012342488, -0.1242588, 0.018897204, -0.001264528, 0.06556245, 0.030269928, 0.0693234, 0.0034582098, -0.07075851, -0.038983785, -0.0046672653, -0.1321, -0.0058786455, 0.020214334, 0.07407098, -0.0020228084, 0.023499567, -0.0060727503, -0.015522124, -0.08856686, 0.048578344, 0.082162276, 0.0120161995, -0.0061363988, 0.056455944, -0.050618887, -0.05251461, 0.06010838, -0.03539341, -0.01179861, 0.03655261, 0.04411695, -0.04429494, -0.06139248, -0.06448932, 0.058929652, -0.028792704, 0.106542625, -0.0668507, -0.08875269, -0.030622179, -0.06840991, 0.00020722735, 0.06198799, 0.04557125, 0.055675644, -0.012440562, -0.0074731335, -0.016691072, 0.0015629217, -0.038325388, -0.03853756, 0.08749337, 0.027386313, -0.007979394, -0.0071949293, -0.04485557, 0.07752511, 0.15247214, -0.05627357, -0.032708365, -0.058071278, 0.011052652, 0.055300158, 0.009719215, 0.03882131, -0.0012946435, 0.014018182, -0.037162032, -0.07451183, -0.026181875, 0.02916354, 0.011100049, -0.047739465, 0.061230067, 0.009274515, -0.0356204, 0.08921005, 0.021686004, -0.023303313, 0.015107069, 0.04051619, ...]",4
1,N45745,finance,finance-top-stocks,"SmileDirectClub tanks on California bill, bringing loss since September IPO to nearly 60%",Shares of SmileDirectClub sank to a new all-time low during Monday's trading session after a California bill regulating the dental industry was extended through 2024.,https://assets.msn.com/labs/mind/AAILU2e.html,"[Initial public offering, SmileDirectClub, IPO]",[SmileDirectClub],"Category: finance Subcategory: finance-top-stocks Title: SmileDirectClub tanks on California bill, bringing loss since September IPO to nearly 60% Abstract: Shares of SmileDirectClub sank to a new all-time low during Monday's trading session after a California bill regulating the dental industry was extended through 2024. Title Entities: ['Initial public offering', 'SmileDirectClub', 'IPO'] Abstract Entities: ['SmileDirectClub']","[0.019418871, -0.029889008, -0.02698203, 0.027121253, -0.029645966, 0.012073582, -0.005036822, 0.037253927, -0.011163449, -0.039691936, -0.0041397237, 0.04525067, -0.04478833, 0.0036313056, 0.007907383, -0.013749907, -0.026034128, 0.0003514292, -0.01106555, -0.0040224846, -0.028081397, 0.04063494, -0.06111155, 0.005763905, 0.04990365, -0.004318754, -0.04789995, 0.023376856, -0.06318574, -0.05825793, -0.0153263165, 0.08158974, 0.08669919, -0.032437764, 0.059725158, -0.009012225, -0.057826467, -0.018305201, 0.07916929, -0.0047352007, 0.015348422, -0.062417656, -0.04443955, 0.0085130865, -0.0322787, 0.021740917, 0.00490971, 0.021154415, 0.04189495, 0.0293044, -0.039586022, -0.07448827, 0.0023341614, 0.036699083, -0.049893796, -0.0069751134, -0.08411986, -0.08076229, 0.061851405, 0.044762813, 0.06668875, 0.033188257, 0.045183964, 0.059578713, 0.024957282, 0.012474416, -0.0007240179, 0.04541393, -0.05268118, -0.017998595, 0.060655754, -0.015770406, -0.017802276, 0.009141741, -0.0077556535, -0.02584622, 0.09475208, 0.06983568, 0.07649976, -0.086786434, -0.0113150235, -0.017288432, -0.04400228, 0.0018659553, -0.031613324, 0.04837111, -0.00892245, -0.019164756, -0.06492086, 0.03872528, -0.03801895, -0.025267648, 0.15879062, -0.078855984, -0.021105418, 0.016392928, -0.020452559, -0.04327261, 0.03659211, 0.04267475, ...]",3
2,N5234,travel,traveltips,The world's 50 most reliable airlines,"No one wants to kick off their holiday with delays, cancellations or bad customer service, so if you're keen for your next journey to be smooth sailing or smooth flying, rather travel insurance provider Get Going Travel Insurance has got you covered. They've examined the reliability of more than 100 of the world's airlines, looking at the percentage of cancellations and delays, as well as customer and safety ratings, and ranked the top 50 from worst to best. Here's our lowdown on the results.",https://assets.msn.com/labs/mind/AAJub6N.html,[],"[Travel insurance, travel insurance, Get Going]","Category: travel Subcategory: traveltips Title: The world's 50 most reliable airlines Abstract: No one wants to kick off their holiday with delays, cancellations or bad customer service, so if you're keen for your next journey to be smooth sailing or smooth flying, rather travel insurance provider Get Going Travel Insurance has got you covered. They've examined the reliability of more than 100 of the world's airlines, looking at the percentage of cancellations and delays, as well as customer and safety ratings, and ranked the top 50 from worst to best. Here's our lowdown on the results. Title Entities: [] Abstract Entities: ['Travel insurance', 'travel insurance', 'Get Going']","[0.019200498, -0.013533507, -0.01395454, 0.03915099, 0.017132845, 0.026899269, 0.1528933, 0.006459362, -0.009931688, 0.0065448433, 0.03939432, 0.016975325, 0.0073282784, 0.06772342, -0.019284323, -0.030575242, 0.03367165, -0.030578006, -0.062013235, -0.027846819, -0.07969087, 0.070592724, 0.010382161, 0.040707875, -0.025858581, -0.004059862, -0.003717365, 0.02157166, -0.05792055, -0.021673795, -0.06752408, 0.016718205, -0.07263573, -0.012943322, -0.017434785, -0.05450153, -0.12901703, -0.10100388, -0.009552786, 0.048831757, -0.0119774835, 0.01735975, 0.025723586, 0.0027186815, -0.025585268, -0.07214427, -0.016462198, 0.030921848, -0.014462702, 0.1056655, 0.0008873331, 0.0068694293, 0.048645377, -0.058006726, -0.009493116, -0.021355636, -0.1098802, -0.015639553, -0.08917588, 0.0011317007, -0.034879368, -0.0012863075, -0.014105497, 0.045950845, -0.054415908, 0.05967065, -0.010533398, 0.03129959, -0.0184637, -0.008568929, -0.043403767, -0.055907052, -0.049958963, 0.09983528, 0.015706968, 0.040398624, 0.08284105, -0.040848803, -0.02787046, 0.0018757512, 0.024497269, -0.0048443736, 0.0054136664, 0.024494626, 0.08552736, -0.013539632, 0.011181781, 0.014476082, -0.09661596, -0.078500025, 0.011174522, -0.0292577, 0.12034076, 0.024287825, -0.021351228, 0.057209734, -0.01648401, -0.08420281, -0.03936952, -0.0020698884, ...]",3
3,N58530,sports,basketball_ncaa,High school football: Breaking down the Class 3A playoff bracket,"The Class 3A football playoffs begin Friday across the state. Here is a breakdown of the bracket. Player to watch Chase Ricke, Lincoln Christian The senior quarterback has been phenomenal this season. He has led an offense that is averaging 51 points per contest and could lead the Bulldogs to an upset against Heritage Hall in the semifinals. Team to watch Perkins-Tryon The Demons' ...",https://assets.msn.com/labs/mind/BBWAERc.html,"[South African Class 3A 4-8-2, Class 3A]","[Heritage Hall, Heritage Hall School, South African Class 3A 4-8-2, Class 3A]","Category: sports Subcategory: basketball_ncaa Title: High school football: Breaking down the Class 3A playoff bracket Abstract: The Class 3A football playoffs begin Friday across the state. Here is a breakdown of the bracket. Player to watch Chase Ricke, Lincoln Christian The senior quarterback has been phenomenal this season. He has led an offense that is averaging 51 points per contest and could lead the Bulldogs to an upset against Heritage Hall in the semifinals. Team to watch Perkins-Tryon The Demons' ... Title Entities: ['South African Class 3A 4-8-2', 'Class 3A'] Abstract Entities: ['Heritage Hall', 'Heritage Hall School', 'South African Class 3A 4-8-2', 'Class 3A']","[-0.11511074, -0.032843888, -0.058183517, -0.09700683, 0.031531267, 0.024347998, -0.04937914, -0.032756545, 0.032205123, 0.098657, -0.12369766, -0.06454328, 0.009443382, -0.03749836, 0.013233498, -0.047897894, -0.035472263, 0.0062342254, -0.0016823766, -0.053756066, 0.010293754, 0.0057448614, -0.015800154, -0.0011301448, 0.07033864, 0.014964783, 0.0015336461, -0.038836885, -0.09564163, -0.048459098, 0.05126467, -0.07150712, 0.026527457, 0.0056894575, -0.0030998567, -0.07493184, -0.0036630973, 0.068029456, 0.015554497, 0.030898973, 0.022943959, -0.014581212, 0.06408882, -0.032759454, 0.012383873, -0.082730286, -0.065757036, -0.053441633, 0.020372462, -0.026028145, -0.04913079, 0.03433102, -0.106079735, 0.0025703048, -0.015425894, 0.09379104, 0.019927124, 0.009212188, -0.0038376779, -0.04347766, 0.03260555, -0.014695522, -0.009275464, 0.00027226922, -0.010878201, -0.047620278, -0.082213104, 0.05097574, 0.0035189907, 0.062314842, 0.037955347, -0.0023546943, -0.065135404, -0.0075012315, 0.06403424, 0.098092146, 0.048093792, 0.07579037, 0.06053895, 0.016412703, -0.03385757, -0.119783185, 0.021407314, 0.010809777, 0.022538684, -0.013410404, -0.049665295, -0.05362427, 0.025445268, 0.04267426, -0.014480574, -0.08447785, 0.047228977, -0.01142409, -0.009638873, 0.030639807, -0.07297145, -0.13954657, 0.04095479, 0.09963459, ...]",9
4,N17133,news,newspolitics,"At a UFC event, Trump receives second round of boos in a week",Trump can't seem to escape boos at sporting events.,https://assets.msn.com/labs/mind/AAJMx3j.html,"[Donald Trump, Ultimate Fighting Championship, Trump, UFC]","[Donald Trump, Trump]","Category: news Subcategory: newspolitics Title: At a UFC event, Trump receives second round of boos in a week Abstract: Trump can't seem to escape boos at sporting events. Title Entities: ['Donald Trump', 'Ultimate Fighting Championship', 'Trump', 'UFC'] Abstract Entities: ['Donald Trump', 'Trump']","[0.06410005, 0.0478221, 0.005382711, 0.025049195, 0.026418125, 0.047663104, 0.011788153, -0.11290559, 0.08634132, -0.016913367, -0.058042254, -0.10954013, -0.071645126, 0.10605225, 0.05072567, 0.0084997425, -0.0073748236, 0.0718592, -0.007780189, -0.028232418, 0.03235928, 0.04068845, 0.020330027, 0.013648945, 0.009199657, -0.03637407, -0.07184759, -0.014539548, -0.040090308, -0.09570119, -0.0131613845, -0.019567035, 0.013519175, 0.05427861, -0.013042894, -0.05049865, -0.0804474, 0.01239617, 0.054220293, 0.03651505, -0.014377174, -0.118406475, 0.0018455836, -0.053126827, -0.0024862352, 0.03925861, -0.06613436, 0.007411063, 0.06715203, 0.007992903, 0.015450345, -0.04548756, 0.048847057, 0.07809236, 0.060614273, 0.0068170917, -0.025693024, -0.0032833132, -0.03363651, 0.011615113, 0.01687941, 0.008215265, -0.0045137927, 0.07281191, -0.021110581, -0.0068338355, -0.07022012, 0.06244115, 0.023751901, 0.05078435, 0.08151241, 0.038846605, 0.013501798, 0.0038909789, 0.058685355, 0.0039621745, 0.057987265, 0.007603529, 0.09440943, -0.01535063, -0.024639739, 0.0019124557, -0.004427786, -0.1111408, 0.079125345, 0.044695646, -0.034941237, -0.049271483, -0.14406645, 0.045466863, -0.080632724, -0.08208613, 0.13012627, 0.05978401, -0.055564348, 0.00025743956, -0.014270926, -0.0063318615, -0.023633746, 0.023869338, ...]",4


In [16]:
print(news_df['cluster'].value_counts())

cluster
3    1647
7    1289
2    1202
1    1199
9    1010
0     852
8     763
4     635
6     471
5     401
Name: count, dtype: int64


In [17]:
for c in range(num_clusters):
    print(f"\nCluster {c}")
    print(news_df[news_df['cluster'] == c]['title'].head(5))  



Cluster 0
7                                         Mom gets to hear son's heartbeat again 5 years after his death
12                                  San Antonio man considered habitual DWI offender on trial for murder
17    California boy, 13, complained of bullying at school days before fatal sucker punch, attorney says
23                                       Pinellas Elementary Teacher Accused Of Showing Up To Work Drunk
26                      Police car rolls over woman's leg, leaving her critically injured in South Shore
Name: title, dtype: object

Cluster 1
14          'The Irishman' Lenser on How the Mind of the Killer Influenced the Cinematography
56                                17 Winter Date Ideas That Will Melt Your Cold, Frozen Heart
62    Jennifer Lopez shares her #MeToo experience at costume fitting: 'I stood up for myself'
67                                 Instagrammers Help Discover 'Japan Pig' Seahorse In Taiwan
74                             'Self-Partnered' Is

In [29]:
# 4. Recommendation function — top 5 in same cluster by cosine similarity

def recommend_articles_mind(article_idx, top_n=5):
    # Check article exists and has cluster
    if article_idx not in news_df.index or pd.isna(news_df.loc[article_idx, 'cluster']):
        return "Article index invalid or missing cluster."

    target_cluster = news_df.loc[article_idx, 'cluster']
    cluster_indices = news_df[news_df['cluster'] == target_cluster].index.tolist()

    # Get embeddings for those indices (careful if indices are not continuous)
    cluster_embeddings = embeddings[[news_df['text'].tolist().index(news_df.loc[i, 'text']) for i in cluster_indices]]
    query_embedding = embeddings[news_df['text'].tolist().index(news_df.loc[article_idx, 'text'])].reshape(1, -1)

    similarities = cosine_similarity(query_embedding, cluster_embeddings)[0]

    # Sort top_n excluding self
    sorted_idx = np.argsort(similarities)[::-1]
    sorted_idx = [i for i in sorted_idx if cluster_indices[i] != article_idx][:top_n]

    recommended_indices = [cluster_indices[i] for i in sorted_idx]

    recommendations = news_df.loc[recommended_indices][['title', 'category', 'abstract']]
    recommendations['similarity'] = similarities[sorted_idx]

    return recommendations.reset_index(drop=True)

print("Recommendations for article:", news_df.loc[200, 'title'])
pd.DataFrame(recommend_articles_mind(article_idx=200, top_n=5))


Recommendations for article: Couple's haunting wedding photo captures California wildfire raging nearby


Unnamed: 0,title,category,abstract,similarity
0,Hillside brush fire in Los Angeles threatens affluent Pacific Palisades homes; evacuations lifted,news,California firefighters are wrestling with the Palisades Fire burning in a Los Angeles hillside and threatening multimillion-dollar homes.,0.68686
1,Paradise rebuilds but danger may still lurk year after fire,news,"There was ""no way in hell"" Victoria Sinclaire was rebuilding in Paradise. She'd thought she was going to die during the six hours it took her to escape the deadliest and most destructive wildfire in California history.",0.629955
2,Authorities: 3 deaths tied to Southern California wildfires,news,Crews are battling a brush fire that prompted evacuations and closed Interstate 210 in the Sylmar neighborhood of the San Fernando Valley in Los Angeles. Several vehicles burned in a nearby industrial complex. (Oct. 11),0.614801
3,Time-lapse shows Kincade wildfire,news,"Mandatory evacuations have been ordered in parts of Sonoma county, California as the Kincade fire continues to burn thousands of acres.",0.612559
4,One person dead as small plane crashes into house outside of Los Angeles,news,"The craft came down on a single-story home in Upland, California.",0.600008
