In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from transformers import AutoTokenizer, TFAutoModel

In [3]:
import pandas as pd
import numpy as np
df = pd.read_csv('times.csv')
df

Unnamed: 0,Title,Summary,Date,Link
0,Do we need a health check-up before signing fo...,As more incidences of heart attacks in gyms an...,"Dec 10, 2022",https://timesofindia.indiatimes.com/life-style...
1,Should you fast before having a blood test?,We undergo several blood tests in a year. Sinc...,"Dec 9, 2022",https://timesofindia.indiatimes.com/life-style...
2,As Japanese encephalitis cases reported in cou...,Japanese encephalitis is an infection caused b...,"Dec 9, 2022",https://timesofindia.indiatimes.com/life-style...
3,Coronavirus: Nearly half of COVID patients glo...,More than 45% of the study participants had at...,"Dec 9, 2022",https://timesofindia.indiatimes.com/life-style...
4,Non-alcoholic fatty liver disease: Waking up b...,According to the journal of Nature and Science...,"Dec 8, 2022",https://timesofindia.indiatimes.com/life-style...
5,"Coronavirus: Prevalence of COVID XBB, BA.2.75 ...","""The number of new weekly deaths decreased by ...","Dec 8, 2022",https://timesofindia.indiatimes.com/life-style...
6,Coronavirus: Depressive symptoms of gut seen c...,Interference of any foreign pathogen in this c...,"Dec 7, 2022",https://timesofindia.indiatimes.com/life-style...
7,Coronavirus: From getting a bald patch to losi...,But how is Coronavirus linked to hair fall or ...,"Dec 7, 2022",https://timesofindia.indiatimes.com/life-style...
8,"""Don't neglect a chest pain as gastritis"": Doc...",From assuming it as a minor gastric issue to i...,"Dec 7, 2022",https://timesofindia.indiatimes.com/life-style...
9,Coronavirus: Top 5 COVID symptoms that show up...,Sore throat is one of the most common symptoms...,"Dec 6, 2022",https://timesofindia.indiatimes.com/life-style...


In [4]:
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = TFAutoModel.from_pretrained(model_ckpt, from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFMPNetModel: ['embeddings.position_ids']
- This IS expected if you are initializing TFMPNetModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFMPNetModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFMPNetModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMPNetModel for predictions without further training.


In [5]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
!pip install faiss-cpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
from datasets import Dataset

comments_dataset = Dataset.from_pandas(df)
comments_dataset

Dataset({
    features: ['Title', 'Summary', 'Date', 'Link'],
    num_rows: 10
})

In [8]:
def concatenate_text(examples):
    return {
        "text": examples["Title"]
        + " \n "
        + examples["Summary"]
        
    }


comments_dataset = comments_dataset.map(concatenate_text)

  0%|          | 0/10 [00:00<?, ?ex/s]

In [9]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]
    

In [10]:
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="tf"
    )
    encoded_input = {k: v for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [11]:
embedding = get_embeddings(comments_dataset["text"][0])
embedding.shape

TensorShape([1, 768])

In [12]:
embeddings_dataset = comments_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["text"]).numpy()[0]}
)



  0%|          | 0/10 [00:00<?, ?ex/s]

In [13]:
embeddings_dataset.add_faiss_index(column='embeddings')

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['Title', 'Summary', 'Date', 'Link', 'text', 'embeddings'],
    num_rows: 10
})

In [14]:
embeddings_dataset.shape

(10, 6)

In [15]:
# embeddings_dataset['embeddings']

In [16]:
# # Create a CPU index
# index = faiss.IndexFlatL2(768)

# # Add the vectors to the index
# embeddings = index.add(embeddings_df)

# embeddings_dataset.add_faiss_index(column="embeddings")

In [17]:
question = "Is there any update on CORONA vaccine?"
question_embedding = get_embeddings([question]).numpy()
question_embedding.shape

(1, 768)

In [18]:
question_embedding

array([[-1.19242772e-01, -4.13819075e-01, -3.50076437e-01,
        -2.42480680e-01,  1.08316854e-01, -2.24042654e-01,
         7.80198723e-02,  1.95240706e-01, -1.30238160e-01,
         2.12676048e-01,  1.87163383e-01,  5.90314269e-02,
         3.86618637e-02, -1.17866725e-01, -2.12678075e-01,
        -1.67647392e-01,  2.04911321e-01, -1.79678932e-01,
         3.41955274e-01, -1.71086729e-01, -8.45142305e-02,
         2.97977149e-01, -5.34341216e-01,  9.17647928e-02,
         1.07703447e-01, -7.13119805e-02, -6.97793141e-02,
        -2.49723583e-01,  2.37348098e-02, -3.13850939e-01,
         2.93211669e-01,  1.48672372e-01,  2.02350020e-01,
         1.80395961e-01, -9.23579501e-05, -2.01253712e-01,
         2.67673373e-01,  1.91574574e-01, -5.99386655e-02,
         9.77628380e-02,  2.02653140e-01, -1.57335401e-01,
        -7.28461146e-02,  4.41589803e-02, -1.13632157e-01,
        -2.09254041e-01,  2.33538881e-01,  7.53035694e-02,
         2.13179320e-01, -1.37086809e-02,  3.99999291e-0

In [19]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [20]:
import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [25]:
print(samples_df.columns)


Index(['Title', 'Summary', 'Date', 'Link', 'text', 'embeddings', 'scores'], dtype='object')


In [27]:
for _, row in samples_df.iterrows():
    
    print(f"SCORE: {row.scores}")
    print(f"TITLE: {row.Title}")
    print(f"URL: {row.Link}")
    print("=" * 50)
    

SCORE: 53.02634811401367
TITLE: As Japanese encephalitis cases reported in country, key symptoms to know
URL: https://timesofindia.indiatimes.com/life-style/health-fitness/health-news/as-japanese-encephalitis-cases-reported-in-country-key-symptoms-to-know/photostory/96106220.cms
SCORE: 45.043243408203125
TITLE: Coronavirus: From getting a bald patch to losing hair volume; experts explain hair fall and thinning during the COVID pandemic
URL: https://timesofindia.indiatimes.com/life-style/health-fitness/health-news/coronavirus-from-getting-a-bald-patch-to-losing-hair-volume-experts-explain-hair-fall-and-thinning-during-the-covid-pandemic/articleshow/96052616.cms
SCORE: 42.03228759765625
TITLE: Coronavirus: Top 5 COVID symptoms that show up in fully vaccinated people
URL: https://timesofindia.indiatimes.com/life-style/health-fitness/health-news/coronavirus-top-5-covid-symptoms-that-show-up-in-fully-vaccinated-people/photostory/96024049.cms
SCORE: 39.02573776245117
TITLE: Coronavirus: Near