In [6]:
import numpy as np 
import pandas as pd
import re
from sklearn.model_selection import train_test_split

In [8]:
df = pd.read_csv('news_dataset.csv', encoding='latin-1')
data = df[['id', 'article']]
data.head()

Unnamed: 0,id,article
0,17307,PARIS ? When the Islamic State was about to...
1,17292,Angels are everywhere in the Mu?iz family?s ap...
2,17298,Finally. The Second Avenue subway opened in Ne...
3,17311,WASHINGTON ? It?s or time for Republica...
4,17339,"For Megyn Kelly, the shift from Fox News to NB..."


In [9]:
data['article'][0]

'PARIS  ?   When the Islamic State was about to be driven out of the ancient city of Palmyra in March, Yves Ubelmann got a call from Syria?s director of antiquities to come over in a hurry. An architect by training, Mr. Ubelmann, 36, had worked in Syria before the country was engulfed by war. But now there was special urgency for the kind of work his youthful team of architects, mathematicians and designers did from their cramped offices in Paris: producing digital copies of threatened historical sites. Palmyra, parts of it already destroyed by the Islamists who deemed these monuments idolatrous, was still rigged with explosives. So he and Houmam Saad, his Syrian colleague, spent four days flying a drone with a robot camera over the crumbled arches and temples. ?Drones with four or six rotors can hover really close and register structural details, every crack and hole, and we can take very precise measurements,? said Mr. Ubelmann, who founded the company Iconem. ?This is the stuff arch

In [10]:
# Create a function to clean data
def clean_text(text):
    
    text = re.sub(r'(\s\?)',' ',text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"\b\?\b", "\'", text)
    text = re.sub(r"(,\?)",",", text)
    text = re.sub(r"\?+", "?", text)
    text = text.strip()

    return text

In [11]:
def chunk_text(data_index, data_text, chunk_size, chunk_overlap):

    list_chunk_text = []

    for position in range(len(data_index)):

        words = clean_text(data_text[position]).split()

        start = 0
        while start < len(words):
            end = start + chunk_size
            segment = ' '.join(words[start:end])
            list_chunk_text.append((data_index[position], segment))
            start += (chunk_size - chunk_overlap)

    return pd.DataFrame(list_chunk_text, columns=['id', 'article'])

In [12]:
data_chunk = chunk_text(data['id'], data['article'], 500, 50)

In [13]:
data_chunk.head()

Unnamed: 0,id,article
0,17307,PARIS When the Islamic State was about to be d...
1,17307,to mobilize public opinion in the face of the ...
2,17307,the guards at Mari reported that looters had c...
3,17292,Angels are everywhere in the Mu'iz family's ap...
4,17292,and his lower jaw and cut a hole through his e...


In [15]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

In [17]:
def get_tokenized_and_embedded_text(data, tokenizer, model):
    id_list = []

    for position in range(len(data)):
        # Tokenize
        encoding = tokenizer.encode_plus(data['article'][position],
                                       add_special_tokens=True,
                                       padding='longest',
                                       truncation=True,
                                       return_tensors='pt',
                                       )
        inputs = encoding['input_ids']
        
        # Get BERT model embeddings
        with torch.no_grad():
            outputs = model(inputs, attention_mask=encoding['attention_mask'])
            
        # Get embeddings from BERT output
        embeddings = outputs.last_hidden_state
        
        # Aggregate token-level embeddings (e.g., mean pooling)
        aggregated_embedding = torch.mean(embeddings, dim=1)

        # Add to list
        id_list.append((data['id'][position], aggregated_embedding))

    return id_list

In [18]:
id_list = get_tokenized_and_embedded_text(data_chunk, tokenizer, model)

In [19]:
id_df = pd.DataFrame(id_list, columns=['id', 'embedding'])

In [20]:
document_embeddings = [embed for embed in id_df['embedding']]
document_embeddings = torch.cat(document_embeddings)

In [21]:
query = 'Who is the vice chairman of Samsung?'
query = clean_text(query)

# Tokenize
encoding = tokenizer.encode_plus(query, add_special_tokens=True, padding='longest', truncation=True, return_tensors='pt')

# Get BERT model embeddings
with torch.no_grad():
    output = model(encoding['input_ids'], attention_mask=encoding['attention_mask'])

# Get embedding
query_embedding = output.last_hidden_state.mean(dim=1)

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity between the user query and all documents
similarities = cosine_similarity(query_embedding, document_embeddings)

# Find the index of the most similar document
id_df['cosine similarity'] = similarities.reshape(-1, 1)

In [23]:
id_df.head()

Unnamed: 0,id,embedding,cosine similarity
0,17307,"[[tensor(-0.0724), tensor(0.2000), tensor(0.21...",0.570158
1,17307,"[[tensor(-0.1836), tensor(0.1234), tensor(0.08...",0.574872
2,17307,"[[tensor(-0.1739), tensor(0.0417), tensor(0.06...",0.567738
3,17292,"[[tensor(-0.0999), tensor(0.0080), tensor(0.29...",0.56226
4,17292,"[[tensor(-0.1463), tensor(0.0739), tensor(0.32...",0.574617


In [24]:
most_similar_document = id_df.sort_values(by='cosine similarity', ascending=False).reset_index(drop=True)
most_similar_document

Unnamed: 0,id,embedding,cosine similarity
0,17692,"[[tensor(-0.0022), tensor(-0.2842), tensor(0.6...",0.684490
1,18113,"[[tensor(0.0653), tensor(-0.1018), tensor(0.32...",0.680131
2,17851,"[[tensor(-0.0975), tensor(-0.0283), tensor(0.2...",0.667798
3,18213,"[[tensor(-0.3222), tensor(-0.2396), tensor(0.2...",0.659565
4,17629,"[[tensor(-0.2943), tensor(-0.0678), tensor(0.1...",0.659119
...,...,...,...
2976,17506,"[[tensor(-0.3392), tensor(-0.4987), tensor(-0....",0.477313
2977,18186,"[[tensor(-0.2281), tensor(-0.1184), tensor(-0....",0.460685
2978,17313,"[[tensor(-0.2281), tensor(-0.1184), tensor(-0....",0.460685
2979,18185,"[[tensor(-0.2281), tensor(-0.1184), tensor(-0....",0.460685


In [25]:
most_similar_document['id'].tolist().index(17574)

12