In [2]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm








In [3]:
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

In [10]:
def generateLabelEmbeddings(labels, model):
    embeddings = model.encode(labels, show_progress_bar=True)
    return embeddings

In [5]:
def generateDataEmbeddings(dataframe:pd.DataFrame, model:SentenceTransformer, columns:list):
    embeddings = model.encode(dataframe[columns].values, show_progress_bar=True)
    return embeddings
    

In [6]:
def readData(file:str)->pd.DataFrame:
    data = pd.read_csv("articles.csv")
    data = data.drop(columns='sentiment')
    data = data.dropna()
    return data

In [7]:
def getLabels(label_data):
    labels = label_data.category.to_list()
    return labels

In [8]:
def eval_performance(y_true, y_pred, labels):
    pred_label = [labels[pred] for pred in y_pred]
    performance = classification_report(y_true, pred_label, labels=labels)
    return performance

In [9]:
def getPredictions(data_embeddings, label_embeddings):
    sim_matrix = cosine_similarity(data_embeddings, label_embeddings)
    y_pred = np.argmax(sim_matrix, axis = 1)
    return y_pred

In [11]:
data = readData("articles.csv")
labels = getLabels(data)

In [63]:
new_labels = data.category.unique().tolist()
new_labels.append('Crime')

In [64]:
type(new_labels)

list

In [65]:
new_labels.remove("Uber")
new_labels.remove("Google")
new_labels.remove("Apple")
new_labels.remove("Meta")

In [66]:
print(new_labels)

['Sports', 'Finance', 'Fiction', 'Food', 'Business', 'Science', 'Fashion', 'Lifestyle', 'Entertainment', 'Gaming', 'Politics', 'Travel', 'Culture', 'Education', 'Automotive', 'Environment', 'Health', 'Tech', 'History', 'Crime']


In [67]:
data_embeddings = generateDataEmbeddings(data, model, ["title", "content"])
label_embeddings = generateLabelEmbeddings(new_labels, model)

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches: 100%|██████████| 3/3 [00:00<00:00,  3.80it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 62.50it/s]


In [37]:
y_pred = getPredictions(data_embeddings, label_embeddings)

In [38]:
print(eval_performance(data.category, y_pred, new_labels))

               precision    recall  f1-score   support

       Sports       1.00      1.00      1.00         3
      Finance       1.00      1.00      1.00         3
      Fiction       1.00      1.00      1.00         3
         Food       1.00      1.00      1.00         3
     Business       0.60      1.00      0.75         3
      Science       1.00      1.00      1.00         3
       Google       1.00      1.00      1.00         3
        Apple       1.00      1.00      1.00         3
      Fashion       1.00      1.00      1.00         3
    Lifestyle       1.00      0.67      0.80         3
         Uber       1.00      1.00      1.00         3
Entertainment       0.00      0.00      0.00         3
       Gaming       1.00      1.00      1.00         3
     Politics       1.00      1.00      1.00         3
       Travel       1.00      1.00      1.00         3
      Culture       1.00      0.67      0.80         3
    Education       1.00      1.00      1.00         3
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [43]:
large_data = pd.read_csv('large_news_dataset_modified.csv')

In [44]:
large_data_embeddings = generateDataEmbeddings(large_data, model, ["title", "summary"])

Batches: 100%|██████████| 416/416 [02:34<00:00,  2.69it/s]


In [68]:
large_data_y_pred = getPredictions(large_data_embeddings, label_embeddings)

In [69]:
pred_label = [labels[pred] for pred in large_data_y_pred]

'Lifestyle'

In [95]:
index = 4
print(large_data.iloc[index])
print(pred_label[index])

title        Diverse areas face car insurance 'ethnicity' bill
summary      BBC Verify finds higher quotes in parts of Eng...
image_url    https://ichef.bbci.co.uk/ace/standard/240/cpsp...
published                        Mon, 26 Feb 2024 00:29:05 GMT
url               https://www.bbc.co.uk/news/business-68349396
images                                             IMAGE_5.jpg
category                                         Entertainment
Name: 4, dtype: object
Entertainment


In [None]:
for i in range(len(large_data)):
    large_data[i]["category"] = pred_label[i]

In [86]:
large_data["category"] = pred_label

In [92]:
large_data.iloc[4].summary

'BBC Verify finds higher quotes in parts of England even after crime and accident rates are included.'

In [74]:
test = large_data.iloc[10]

In [75]:
test["category"] = pred_label[10]

In [76]:
test

title        Ryanair warns of 10% fare rise as new planes d...
summary      Airline boss Michael O'Leary says prices could...
image_url    https://ichef.bbci.co.uk/ace/standard/240/cpsp...
published                        Mon, 26 Feb 2024 07:08:16 GMT
url               https://www.bbc.co.uk/news/business-68398421
images                                            IMAGE_11.jpg
category                                               Finance
Name: 10, dtype: object