In [1]:
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


In [None]:
def predict_term_weights( query):
    """Predict term weights for the given query using BERT embeddings."""
    # Tokenize and encode the query for BERT
    inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
    # print(inputs)
    # Get embeddings from BERT model
    with torch.no_grad():
        outputs = model(**inputs)
        # Use the last hidden state
        last_hidden_states = outputs.last_hidden_state
    # print(len(last_hidden_states[0][0]))
    # print((last_hidden_states[0][0]))
    # Simplified weight prediction logic: Use the norm of the embeddings as weights
    # This is a placeholder and should be replaced with a proper mechanism
    weights = torch.norm(last_hidden_states, dim=-1).squeeze().tolist()
    # print(weights)

    # Associate weights with tokens. This assumes no special tokens (CLS, SEP) for simplicity.
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())
    return dict(zip(tokens, weights))
print()

In [19]:
from rank_bm25 import BM25Okapi

corpus = [
    "Hello there good man!",
    "Hello there good man!",
    "It is quite windy in London",
    "hello how is the weather today?"
]

tokenized_corpus = [doc.split(" ") for doc in corpus]

bm25 = BM25Okapi(tokenized_corpus)

In [12]:
query = "hello man"
tokenized_query = query.split(" ")

doc_scores = bm25.get_scores(tokenized_query)

In [13]:
bm25.get_top_n(tokenized_query, corpus, n=1)

['hello how is the weather today?']

In [28]:
import pandas as pd
import numpy as np
import os 
import re
import operator
import nltk 
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import operator

In [29]:
news = pd.read_json('https://raw.githubusercontent.com/zayedrais/DocumentSearchEngine/master/data/newsgroups.json')
news

Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space
...,...,...,...
11309,From: jim.zisfein@factory.com (Jim Zisfein) \n...,13,sci.med
11310,From: ebodin@pearl.tufts.edu\nSubject: Screen ...,4,comp.sys.mac.hardware
11311,From: westes@netcom.com (Will Estes)\nSubject:...,3,comp.sys.ibm.pc.hardware
11312,From: steve@hcrlgw (Steven Collins)\nSubject: ...,1,comp.graphics


In [30]:
for i,txt in enumerate(news['content']):
    subject = re.findall('Subject:(.*\n)',txt)
    if (len(subject) !=0):
        news.loc[i,'Subject'] =str(i)+' '+subject[0]
    else:
        news.loc[i,'Subject'] ='NA'
df_news =news[['Subject','content']]
df_news

Unnamed: 0,Subject,content
0,0 WHAT car is this!?\n,From: lerxst@wam.umd.edu (where's my thing)\nS...
1,1 SI Clock Poll - Final Call\n,From: guykuo@carson.u.washington.edu (Guy Kuo)...
2,2 PB questions...\n,From: twillis@ec.ecn.purdue.edu (Thomas E Will...
3,3 Re: Weitek P9000 ?\n,From: jgreen@amber (Joe Green)\nSubject: Re: W...
4,4 Re: Shuttle Launch Question\n,From: jcm@head-cfa.harvard.edu (Jonathan McDow...
...,...,...
11309,11309 Re: Migraines and scans\n,From: jim.zisfein@factory.com (Jim Zisfein) \n...
11310,11310 Screen Death: Mac Plus/512\n,From: ebodin@pearl.tufts.edu\nSubject: Screen ...
11311,11311 Mounting CPU Cooler in vertical case\n,From: westes@netcom.com (Will Estes)\nSubject:...
11312,11312 Re: Sphere from 4 points?\n,From: steve@hcrlgw (Steven Collins)\nSubject: ...


In [55]:
df_news.loc[:, 'content'] = df_news.loc[:, 'content'].replace(to_replace='^(from|lines):(.*)\n', value='', regex=True,)
df_news.loc[:, 'content'] = df_news.loc[:, 'content'].replace(to_replace='[!"#$%&\'()*+,/:;<=>?@[\\]^_`{|}~]', value=' ', regex=True)
df_news.loc[:, 'content'] = df_news.loc[:, 'content'].replace(to_replace='-', value=' ', regex=True)
df_news.loc[:, 'content'] = df_news.loc[:, 'content'].replace(to_replace='\s+', value=' ', regex=True)
df_news.loc[:, 'content'] = df_news.loc[:, 'content'].apply(lambda x: x.strip())


In [56]:

# Make sure all content is in lowercase
df_news.loc[:, 'content'] = df_news.loc[:, 'content'].apply(lambda x: x.lower())

# Tokenize the content in the 'content' column
df_news.loc[:, 'Word tokenize'] = df_news.loc[:, 'content'].apply(word_tokenize)

In [57]:
df_news


Unnamed: 0,Subject,content,Word tokenize
0,0 WHAT car is this!?\n,from lerxst wam.umd.edu where s my thing subje...,"[from, lerxst, wam.umd.edu, where, s, my, thin..."
1,1 SI Clock Poll - Final Call\n,from guykuo carson.u.washington.edu guy kuo su...,"[from, guykuo, carson.u.washington.edu, guy, k..."
2,2 PB questions...\n,from twillis ec.ecn.purdue.edu thomas e willis...,"[from, twillis, ec.ecn.purdue.edu, thomas, e, ..."
3,3 Re: Weitek P9000 ?\n,from jgreen amber joe green subject re weitek ...,"[from, jgreen, amber, joe, green, subject, re,..."
4,4 Re: Shuttle Launch Question\n,from jcm head cfa.harvard.edu jonathan mcdowel...,"[from, jcm, head, cfa.harvard.edu, jonathan, m..."
...,...,...,...
11309,11309 Re: Migraines and scans\n,from jim.zisfein factory.com jim zisfein subje...,"[from, jim.zisfein, factory.com, jim, zisfein,..."
11310,11310 Screen Death: Mac Plus/512\n,from ebodin pearl.tufts.edu subject screen dea...,"[from, ebodin, pearl.tufts.edu, subject, scree..."
11311,11311 Mounting CPU Cooler in vertical case\n,from westes netcom.com will estes subject moun...,"[from, westes, netcom.com, will, estes, subjec..."
11312,11312 Re: Sphere from 4 points?\n,from steve hcrlgw steven collins subject re sp...,"[from, steve, hcrlgw, steven, collins, subject..."


In [58]:

from sklearn.feature_extraction.text import TfidfVectorizer
import operator
## Create Vocabulary
vocabulary = set()
for doc in df_news.content:
    vocabulary.update(doc.split(','))
vocabulary = list(vocabulary)
# Intializating the tfIdf model
tfidf = TfidfVectorizer(vocabulary=vocabulary)
# Fit the TfIdf model
tfidf.fit(df_news.content)
# Transform the TfIdf model
tfidf_tran=tfidf.transform(df_news.content)

In [59]:
def gen_vector_T(tokens):
    Q = np.zeros((len(vocabulary)))    
    x= tfidf.transform(tokens)
    #print(tokens[0].split(','))
    for token in tokens[0].split(','):
        #print(token)
        try:
            ind = vocabulary.index(token)
            Q[ind]  = x[0, tfidf.vocabulary_[token]]
        except:
            pass
    return Q

In [60]:
def wordLemmatizer(data):
    tag_map = defaultdict(lambda : wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV
    file_clean_k =pd.DataFrame()
    for index,entry in enumerate(data):
        
        # Declaring Empty List to store the words that follow the rules for this step
        Final_words = []
        # Initializing WordNetLemmatizer()
        word_Lemmatized = WordNetLemmatizer()
        # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
        for word, tag in pos_tag(entry):
            # Below condition is to check for Stop words and consider only alphabets
            if len(word)>1 and word not in stopwords.words('english') and word.isalpha():
                word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
                Final_words.append(word_Final)
            # The final processed set of words for each iteration will be stored in 'text_final'
                file_clean_k.loc[index,'Keyword_final'] = str(Final_words)
                file_clean_k.loc[index,'Keyword_final'] = str(Final_words)
                file_clean_k=file_clean_k.replace(to_replace ="\[.", value = '', regex = True)
                file_clean_k=file_clean_k.replace(to_replace ="'", value = '', regex = True)
                file_clean_k=file_clean_k.replace(to_replace =" ", value = '', regex = True)
                file_clean_k=file_clean_k.replace(to_replace ='\]', value = '', regex = True)
    return file_clean_k


In [71]:
def cosine_sim(a, b):
    cos_sim = np.dot(a, b)/(float(np.linalg.norm(a)*np.linalg.norm(b)))
    # print("asa")

    
    return cos_sim
def cosine_similarity_T(k, query):
    preprocessed_query  = re.sub("\W+", " ", query).strip()
    tokens = word_tokenize(str(preprocessed_query))
    q_df = pd.DataFrame(columns=['q_clean'])
    q_df.loc[0,'q_clean'] =tokens
    q_df['q_clean'] =wordLemmatizer(q_df.q_clean)
    d_cosines = []
    
    query_vector = gen_vector_T(q_df['q_clean'])
    for d in tfidf_tran.A:
        d_cosines.append(cosine_sim(query_vector, d))
                    
    out = np.array(d_cosines).argsort()[-k:][::-1]
    #print("")
    d_cosines.sort()
    a = pd.DataFrame()
    for i,index in enumerate(out):
        a.loc[i,'index'] = str(index)
        a.loc[i,'Subject'] = df_news['Subject'][index]
    for j,simScore in enumerate(d_cosines[-k:][::-1]):
        a.loc[j,'Score'] = simScore
    return a

In [72]:
cosine_similarity_T(10,"computer science")

  cos_sim = np.dot(a, b)/(float(np.linalg.norm(a)*np.linalg.norm(b)))


Unnamed: 0,index,Subject,Score
0,11313,11313 stolen CBR900RR\n,
1,3712,3712 Re: Drinking and Riding\n,
2,3776,3776 RE: was:Go Hezbollah!\n,
3,3775,3775 Re: Analog switches/Balan\n,
4,3774,3774 Re: A question that has bee bothering me.\n,
5,3773,"3773 Re: After 2000 years, can we say that Ch...",
6,3772,3772 --- CR-ROM Drive Recommendation? ---\n,
7,3771,3771 Re: Windows Help\n,
8,3770,3770 Re: AF/ATS: Red Army Fraction (RAF) comm...,
9,3769,3769 Re: With a surge in the last two weeks...\n,


In [46]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/praveenlawyantra/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [4]:
import torch
from transformers import BertModel, BertTokenizer
from rank_bm25 import BM25Okapi
import torch




In [5]:
s

AttributeError: 'NoneType' object has no attribute 'get_scores'

In [34]:
from rank_bm25 import BM25Okapi
from transformers import BertTokenizer, BertModel
import torch
from collections import defaultdict

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# document1 = "Machine learning teaches machine how to learn"
# document2 = "i am apple"
# document3 = "mango i like learning"
# document4 = "learning is shit"
# docs = [document1, document2, document3, document4]


sentences = [
    "Despite the heavy rain that morning, the team decided to proceed with the outdoor charity event, setting up tents and arranging the venue with unwavering determination, hoping their efforts would raise significant funds for the local animal shelter.",
    "As the sun dipped below the horizon, casting a golden hue across the sky, Maria stood at the edge of the cliff, reflecting on the decisions that had led her to this moment of solitude, feeling a mix of apprehension and excitement about the future.",
    "The ancient library, with its towering shelves laden with dusty tomes and manuscripts, whispered secrets of a bygone era to those who dared to explore its depths, offering insights into worlds long vanished and knowledge that had been sought by scholars for centuries.",
    "In the heart of the bustling city, amidst the cacophony of honking cars, chattering pedestrians, and the occasional blare of sirens, there existed a small, serene park where one could escape the relentless pace of urban life and find a moment of tranquility.",
    "The groundbreaking ceremony, attended by local dignitaries, esteemed guests, and community members, marked the commencement of the construction of the new community center, envisioned as a beacon of hope and unity for the neighborhood's diverse population.",
    "Eleanor, an accomplished violinist, took to the stage with a grace born of years of practice, her fingers dancing over the strings as she poured her soul into the performance, captivating the audience with a melody that spoke of love, loss, and redemption.",
    "Amidst the chaos of the battlefield, the young soldier found a moment of clarity, realizing the true cost of war not in terms of territory gained or lost, but in the human lives irrevocably changed by the horrors they had witnessed and the sacrifices they had made.",
    "The chef, a master of his craft, meticulously combined the freshest local ingredients with exotic spices brought from distant lands, creating a dish that was not only a feast for the palate but also a celebration of the diverse cultures that had influenced his culinary journey.",
    "As the debate raged on, the professor, armed with decades of research and an unwavering belief in the importance of preserving ancient languages, defended their value not merely as academic curiosities but as vital links to our collective human heritage and windows into the minds of our ancestors.",
    "The novelist, after years of grappling with writer's block, found inspiration in the least expected of places—a small, unremarkable café where the mosaic of human interactions and the simplicity of everyday moments sparked the idea for her next best-selling novel, weaving together themes of love, resilience, and the beauty of the mundane."
]

# sentences

def tokenize_documents(documents):
    tokenized_docs = [tokenizer.tokenize(doc) for doc in documents]
    bm25 = BM25Okapi(tokenized_docs)
    return bm25

def predict_term_weights(query):
    inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state
    # Placeholder for calculating weights. You need to define this part.
    weights = torch.mean(last_hidden_states, dim=1).squeeze().numpy()  # Simplified example
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())
    return dict(zip(tokens, weights))

def search(docs,query):
    bm25 = tokenize_documents(docs)
    term_weights = predict_term_weights(query)
    weighted_query = [(term, weight) for term, weight in term_weights.items()]
    
    scores = defaultdict(float)
    for term, weight in weighted_query:
        term_scores = bm25.get_scores([term])
        for doc_id, score in enumerate(term_scores):
            scores[doc_id] += score * weight
    sorted_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return [docs[doc_id] for doc_id, _ in sorted_docs]

# Example usage



In [35]:
result = search(sentences,"manuscripts")
print(result)
for i in result:
    print()
    print(i)

['The ancient library, with its towering shelves laden with dusty tomes and manuscripts, whispered secrets of a bygone era to those who dared to explore its depths, offering insights into worlds long vanished and knowledge that had been sought by scholars for centuries.', 'Despite the heavy rain that morning, the team decided to proceed with the outdoor charity event, setting up tents and arranging the venue with unwavering determination, hoping their efforts would raise significant funds for the local animal shelter.', 'As the sun dipped below the horizon, casting a golden hue across the sky, Maria stood at the edge of the cliff, reflecting on the decisions that had led her to this moment of solitude, feeling a mix of apprehension and excitement about the future.', 'In the heart of the bustling city, amidst the cacophony of honking cars, chattering pedestrians, and the occasional blare of sirens, there existed a small, serene park where one could escape the relentless pace of urban li