In [127]:
import numpy as np 
import pandas as pd
import re
from sklearn.model_selection import train_test_split

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import spacy
# Load the spaCy English model
nlp = spacy.load('en_core_web_sm')

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
from transformers import BertTokenizer
from transformers import BertModel
from transformers import BertForQuestionAnswering
import torch

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained BERT model for question answering
model = BertModel.from_pretrained('bert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv('news_dataset.csv', encoding='latin-1')
df.head()

Unnamed: 0,id,author,date,year,month,topic,article
0,17307,Marlise Simons,1/01/2017,2017,1,architecture,PARIS ? When the Islamic State was about to...
1,17292,Andy Newman,31/12/2016,2016,12,art,Angels are everywhere in the Mu?iz family?s ap...
2,17298,Emma G. Fitzsimmons,2/01/2017,2017,1,business,Finally. The Second Avenue subway opened in Ne...
3,17311,Carl Hulse,3/01/2017,2017,1,business,WASHINGTON ? It?s or time for Republica...
4,17339,Jim Rutenberg,5/01/2017,2017,1,business,"For Megyn Kelly, the shift from Fox News to NB..."


In [4]:
def chunk_text(data_index, data_text, chunk_size, chunk_overlap):

    list_chunk_text = []

    for position in range(len(data_index)):
        # text = re.sub(r'\s+', ' ', data_text[position])

        words = data_text[position].split()

        start = 0
        while start < len(words):
            end = start + chunk_size
            segment = ' '.join(words[start:end])
            list_chunk_text.append((data_index[position], segment))
            start += (chunk_size - chunk_overlap)

    return list_chunk_text

In [48]:
data = df[['id', 'article']]
data['article'][0]

'PARIS  ?   When the Islamic State was about to be driven out of the ancient city of Palmyra in March, Yves Ubelmann got a call from Syria?s director of antiquities to come over in a hurry. An architect by training, Mr. Ubelmann, 36, had worked in Syria before the country was engulfed by war. But now there was special urgency for the kind of work his youthful team of architects, mathematicians and designers did from their cramped offices in Paris: producing digital copies of threatened historical sites. Palmyra, parts of it already destroyed by the Islamists who deemed these monuments idolatrous, was still rigged with explosives. So he and Houmam Saad, his Syrian colleague, spent four days flying a drone with a robot camera over the crumbled arches and temples. ?Drones with four or six rotors can hover really close and register structural details, every crack and hole, and we can take very precise measurements,? said Mr. Ubelmann, who founded the company Iconem. ?This is the stuff arch

In [49]:
data_chunk = chunk_text(data['id'], data['article'], 300, 50)
data_chunk[:5]

[(17307,
  'PARIS ? When the Islamic State was about to be driven out of the ancient city of Palmyra in March, Yves Ubelmann got a call from Syria?s director of antiquities to come over in a hurry. An architect by training, Mr. Ubelmann, 36, had worked in Syria before the country was engulfed by war. But now there was special urgency for the kind of work his youthful team of architects, mathematicians and designers did from their cramped offices in Paris: producing digital copies of threatened historical sites. Palmyra, parts of it already destroyed by the Islamists who deemed these monuments idolatrous, was still rigged with explosives. So he and Houmam Saad, his Syrian colleague, spent four days flying a drone with a robot camera over the crumbled arches and temples. ?Drones with four or six rotors can hover really close and register structural details, every crack and hole, and we can take very precise measurements,? said Mr. Ubelmann, who founded the company Iconem. ?This is the st

In [97]:
def get_tokenized_and_embedded_text(list_id_text, tokenizer, model):
    id_dict = {}

    for position in range(len(list_id_text)):
        # Tokenize
        encoding = tokenizer.encode_plus(list_id_text[position][1],
                                       add_special_tokens=True,
                                       padding='longest',
                                       return_tensors='pt',
                                       )
        inputs = encoding['input_ids']
        
        # Get BERT model embeddings
        with torch.no_grad():
            outputs = model(inputs, attention_mask=encoding['attention_mask'])
            
        # Get embeddings from BERT output
        embeddings = outputs.last_hidden_state
        
        # Aggregate token-level embeddings (e.g., mean pooling)
        aggregated_embedding = torch.mean(embeddings, dim=1)

        # Add to dictionary
        if list_id_text[position][0] not in id_dict:
            id_dict[list_id_text[position][0]] = [(inputs, aggregated_embedding)]
        else:
            id_dict[list_id_text[position][0]].append((inputs, aggregated_embedding))

    return id_dict

In [51]:
id_dict = get_tokenized_and_embedded_text(data_chunk, tokenizer, model)

In [77]:
type(id_dict)

dict

In [55]:
id_dict[17307][0][0].shape

torch.Size([1, 382])

In [117]:
query = 'Who is the vice chairman of Samsung?'

# Tokenize
encoding = tokenizer.encode_plus(query, padding=True, return_tensors='pt')

# Get BERT model embeddings
with torch.no_grad():
    output = model(encoding['input_ids'], attention_mask=encoding['attention_mask'])

# Get embedding
query_embedding = output.last_hidden_state.mean(dim=1)

In [86]:
def cosine_similarity_with_id(query, context):
    list_similarity = []
    for id_ in context:
        for value in context[id_]:
            similarity = cosine_similarity(query, value[1])
            list_similarity.append((id_, similarity))

    return sorted(list_similarity, key=lambda x: x[1], reverse=True)

In [85]:
cosine_similarity(query_embedding, id_dict[17307][0][1])

array([[0.58106863]], dtype=float32)

In [108]:
cosine_similarity_with_id(query_embedding, id_dict)[:30]

[(17728, array([[0.7069239]], dtype=float32)),
 (18213, array([[0.6918032]], dtype=float32)),
 (17368, array([[0.68622184]], dtype=float32)),
 (17629, array([[0.68490994]], dtype=float32)),
 (18259, array([[0.68436134]], dtype=float32)),
 (17464, array([[0.684266]], dtype=float32)),
 (17941, array([[0.6784443]], dtype=float32)),
 (18243, array([[0.677635]], dtype=float32)),
 (17571, array([[0.67690897]], dtype=float32)),
 (17556, array([[0.6762433]], dtype=float32)),
 (17734, array([[0.67554784]], dtype=float32)),
 (17851, array([[0.67484605]], dtype=float32)),
 (18177, array([[0.67329407]], dtype=float32)),
 (17851, array([[0.6727227]], dtype=float32)),
 (17741, array([[0.6721135]], dtype=float32)),
 (17731, array([[0.6717809]], dtype=float32)),
 (18018, array([[0.6713518]], dtype=float32)),
 (17695, array([[0.6704113]], dtype=float32)),
 (18114, array([[0.66989475]], dtype=float32)),
 (17563, array([[0.6687263]], dtype=float32)),
 (18068, array([[0.6679863]], dtype=float32)),
 (18213

Embed full text

In [99]:
def get_tokenized_and_embedded(data, tokenizer, model):
    id_dict = {}

    for position in range(len(data)):
        # Tokenize
        encoding = tokenizer.encode_plus(data['article'][position],
                                         add_special_tokens=True,
                                         truncation=True,
                                         padding='longest',
                                         return_tensors='pt',
                                         )
        inputs = encoding['input_ids']
        
        # Get BERT model embeddings
        with torch.no_grad():
            outputs = model(inputs, attention_mask=encoding['attention_mask'])
            
        # Get embeddings from BERT output
        embeddings = outputs.last_hidden_state
        
        # Aggregate token-level embeddings (e.g., mean pooling)
        aggregated_embedding = torch.mean(embeddings, dim=1)

        # Add to dictionary
        id_dict[data['id'][position]] = aggregated_embedding

    return id_dict

In [110]:
def get_cosine_similarity(query, context):
    list_similarity = []
    for id_, text_embedd in context.items():
        similarity = cosine_similarity(query, text_embedd)
        list_similarity.append((id_, similarity))

    return sorted(list_similarity, key=lambda x: x[1], reverse=True)

In [111]:
id_list = get_tokenized_and_embedded(data, tokenizer, model)

In [112]:
id_list[17307]

tensor([[-1.0826e-01,  1.5632e-01,  2.5611e-01, -2.2191e-02,  2.6427e-01,
         -1.1113e-01, -8.9729e-02,  5.7650e-01, -1.2993e-01, -1.3186e-01,
         -1.7976e-02, -2.4482e-01, -3.1864e-01,  3.7573e-01, -4.4931e-02,
          5.5026e-01,  2.3220e-01, -8.4037e-02, -2.1442e-01,  5.3522e-01,
          2.8642e-01, -9.2983e-02,  1.6004e-01,  6.8769e-01,  2.9719e-01,
         -4.0378e-03, -7.9695e-02,  1.6915e-01, -3.6810e-01, -7.0673e-02,
          4.1963e-01, -3.5793e-02, -2.4077e-01, -4.1164e-01, -9.1535e-03,
          3.7298e-02, -4.0273e-03, -2.0717e-01, -2.8055e-02,  2.8056e-01,
         -5.0087e-01, -3.7250e-01, -1.9589e-01,  8.7793e-02, -2.9733e-01,
         -1.0532e-01,  4.3566e-01,  2.1728e-01,  5.7760e-02, -1.8717e-01,
         -3.6283e-01,  2.0059e-01,  3.6989e-02, -7.5959e-02,  3.6083e-01,
          6.4852e-01, -2.6122e-01, -3.1875e-01, -4.6161e-01, -4.0975e-01,
          1.3257e-01, -9.1175e-02,  9.1371e-02, -4.3389e-01,  1.0564e-01,
          2.5243e-01, -7.5565e-02,  1.

In [118]:
get_cosine_similarity(query_embedding, id_list)[:30]

[(17980, array([[0.66054296]], dtype=float32)),
 (18213, array([[0.655092]], dtype=float32)),
 (17619, array([[0.6510023]], dtype=float32)),
 (18303, array([[0.6490078]], dtype=float32)),
 (17312, array([[0.64875615]], dtype=float32)),
 (17638, array([[0.64840347]], dtype=float32)),
 (17414, array([[0.64800763]], dtype=float32)),
 (17497, array([[0.64688313]], dtype=float32)),
 (17728, array([[0.6466852]], dtype=float32)),
 (17940, array([[0.6465748]], dtype=float32)),
 (17764, array([[0.6464273]], dtype=float32)),
 (18068, array([[0.64614534]], dtype=float32)),
 (17970, array([[0.645794]], dtype=float32)),
 (17629, array([[0.64481854]], dtype=float32)),
 (17734, array([[0.6445072]], dtype=float32)),
 (17888, array([[0.6443051]], dtype=float32)),
 (17574, array([[0.6439115]], dtype=float32)),
 (18391, array([[0.6435727]], dtype=float32)),
 (18276, array([[0.64287126]], dtype=float32)),
 (18420, array([[0.64269054]], dtype=float32)),
 (17354, array([[0.64189625]], dtype=float32)),
 (178

### Preprocess before BERT

In [128]:
# Create a function to clean data
def clean_text(text):
    # Remove non-ASCII characters
    text = ''.join([char for char in text if ord(char) < 128])

    # Lowercase
    text = text.lower()

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in text.split() if word not in stop_words]
    text = ' '.join(words)

    # Remove numbers
    text = re.sub(r'[0-9]', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove single character
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Lemmatize
    text = ' '.join(tokens)
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc]

    return ' '.join(tokens)

In [129]:
data.loc[:,'article_cleaned'] = data['article'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[:,'article_cleaned'] = data['article'].apply(clean_text)


In [132]:
data_cleaned = data[['id', 'article_cleaned']]
data_cleaned.rename(columns={'article_cleaned': 'article'}, inplace=True)
data_cleaned.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned.rename(columns={'article_cleaned': 'article'}, inplace=True)


Unnamed: 0,id,article
0,17307,paris islamic state drive ancient city palmyra...
1,17292,angel everywhere muiz familys apartment bronx ...
2,17298,finally second avenue subway open new york cit...
3,17311,washington its time republicans tumultuous dec...
4,17339,megyn kelly shift fox news nbc host daily dayt...


In [133]:
id_list_cleaned = get_tokenized_and_embedded(data_cleaned, tokenizer, model)

In [137]:
query = clean_text(query)

# Tokenize
encoding = tokenizer.encode_plus(query, padding=True, return_tensors='pt')

# Get BERT model embeddings
with torch.no_grad():
    output = model(encoding['input_ids'], attention_mask=encoding['attention_mask'])

# Get embedding
query_embedding = output.last_hidden_state.mean(dim=1)


In [141]:
get_cosine_similarity(query_embedding, id_list_cleaned)[:30]

[(17929, array([[0.4088353]], dtype=float32)),
 (17976, array([[0.39024156]], dtype=float32)),
 (18067, array([[0.38874054]], dtype=float32)),
 (17779, array([[0.38761872]], dtype=float32)),
 (18293, array([[0.38514116]], dtype=float32)),
 (18364, array([[0.3767106]], dtype=float32)),
 (18273, array([[0.3765701]], dtype=float32)),
 (17807, array([[0.37559986]], dtype=float32)),
 (18213, array([[0.37556866]], dtype=float32)),
 (18240, array([[0.3749548]], dtype=float32)),
 (17515, array([[0.3736099]], dtype=float32)),
 (18262, array([[0.371885]], dtype=float32)),
 (18410, array([[0.37058473]], dtype=float32)),
 (18413, array([[0.36887658]], dtype=float32)),
 (18297, array([[0.36682183]], dtype=float32)),
 (17335, array([[0.36463523]], dtype=float32)),
 (17371, array([[0.36447257]], dtype=float32)),
 (17398, array([[0.36398777]], dtype=float32)),
 (18128, array([[0.36247224]], dtype=float32)),
 (17354, array([[0.3621183]], dtype=float32)),
 (18401, array([[0.36137396]], dtype=float32)),
