In [2]:
import pandas as pd
import gensim as gs
import numpy as np
import pickle
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import Stemmer
from sentence_transformers import SentenceTransformer
from tqdm.autonotebook import tqdm

from gensim.corpora import Dictionary
from gensim.models import TfidfModel, OkapiBM25Model
from gensim.similarities import SparseMatrixSimilarity

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Stamina\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Read dataset, original to be used in BERT - processed to be used in BM25
# Both frames are later looked up by index label (`.loc[...]`) and their `.index`
# is zipped with per-document scores, so they are presumably row-aligned with each
# other and with the BM25 index / document embeddings — TODO confirm.
# NOTE(review): read_pickle unpickles arbitrary objects; only load files you trust.

original_df = pd.read_pickle("processed_dataset/dataset_qdpairs_raw.pkl")
processed_df = pd.read_pickle("processed_dataset/dataset_qdpairs_processed.pkl")

In [4]:
# Load up BM25 pickles
# NOTE(review): pickle.load executes code embedded in the file; only load
# artifacts that were produced locally / come from a trusted source.

def _load_pickle(path):
    """Unpickle the object stored at `path`, closing the file handle afterwards."""
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

corpus = _load_pickle('gensim_bm25_pickles/corpus.pkl')
dictionary = _load_pickle('gensim_bm25_pickles/dictionary.pkl')
bm25_model = _load_pickle('gensim_bm25_pickles/bm25_model.pkl')
bm25_corpus = _load_pickle('gensim_bm25_pickles/bm25_corpus.pkl')
bm25_index = _load_pickle('gensim_bm25_pickles/bm25_index.pkl')

In [5]:
# Load up BERTurk model and pre-computed document embeddings

model = SentenceTransformer('emrecan/bert-base-turkish-cased-mean-nli-stsb-tr')
# Set the model's input length limit to 512 (longer inputs are presumably truncated
# by sentence-transformers — verify against the library docs).
model.max_seq_length = 512
# NOTE(review): the device is hard-coded; on a machine without CUDA this call is
# expected to fail — consider falling back to 'cpu'.
model.to('cuda')

# Pre-computed document embeddings. score_BERT() takes their dot product with a
# normalized query embedding, so they were presumably produced by this same model
# with normalized vectors, in the row order of original_df — TODO confirm.
doc_embeddings = np.load('doc_embeddings_npy/doc_embeddings.npy')

In [6]:
# Display the sparse matrix backing the BM25 similarity index (its shape is shown in the cell output).
bm25_index.index

<275817x282207 sparse matrix of type '<class 'numpy.float32'>'
	with 92437456 stored elements in Compressed Sparse Row format>

In [18]:
from sklearn.preprocessing import MinMaxScaler
# Define the function for preprocessing query for BM25

def preprocess_str(str_to_process):
    """Turn a raw query string into the stemmed token list used for BM25 lookup.

    Steps, in order: strip non-alphanumeric characters, strip digits, collapse
    repeated whitespace, lowercase, drop Turkish stopwords, split on whitespace,
    stem every token.

    Args:
        str_to_process: the raw query text.

    Returns:
        The output of the Turkish stemmer's ``stemWords`` for the remaining tokens.
    """
    # This is a turkish stemmer, doesn't work perfect but it is consistent at least
    turkish_stemmer = Stemmer.Stemmer('turkish')
    preprocessing = gs.parsing.preprocessing

    # Remove non-chars, one filter at a time
    cleaned = preprocessing.strip_non_alphanum(str_to_process)
    cleaned = preprocessing.strip_numeric(cleaned)
    cleaned = preprocessing.strip_multiple_whitespaces(cleaned)

    # NOTE(review): str.lower() is locale-independent, so Turkish dotted/dotless I
    # is not folded by Turkish rules. The indexed documents were presumably
    # lowercased the same way, so keep this in sync with the index — TODO confirm.
    lowered = cleaned.lower()

    # Drop stopwords, then tokenize on whitespace
    kept_text = preprocessing.remove_stopwords(s=lowered, stopwords=stopwords.words("turkish"))
    tokens = kept_text.split()

    return turkish_stemmer.stemWords(tokens)

# Define functions for scoring each document given query

def _minmax_column(scored_pairs):
    """Min-max scale the similarity half of ``(similarity, doc_index)`` pairs.

    Returns the ``(n, 1)`` array produced by ``MinMaxScaler.fit_transform``.
    """
    similarities = np.array([similarity for similarity, _ in scored_pairs])
    # A fresh scaler per call, so nothing fitted on one score list is reused for another.
    return MinMaxScaler().fit_transform(similarities.reshape(-1, 1))


def score(query, is_index=False, bm25_weight=1.0, bert_weight=0.0):
    """Rank every document for a query by a weighted sum of BM25 and BERT scores.

    Each scorer's similarities are min-max normalized independently and then
    combined as ``bm25_weight * bm25 + bert_weight * bert``.

    Args:
        query: the raw query text, or (when ``is_index`` is true) an index label
            whose stored ``query`` value is read from ``processed_df`` (for BM25)
            and ``original_df`` (for BERT).
        is_index: treat ``query`` as an index label instead of text. The value
            read from ``processed_df`` is passed to BM25 without preprocessing.
        bm25_weight: weight of the normalized BM25 scores. Defaults to 1.0.
        bert_weight: weight of the normalized BERT scores. Defaults to 0.0, which
            reproduces the previous hard-coded BM25-only fusion.

    Returns:
        A list of ``(combined_score, doc_index)`` tuples sorted in descending
        order, with ``doc_index`` taken from ``processed_df.index``.
    """
    if is_index:
        # Explicit column lookup instead of attribute access: `query` is also the
        # name of a DataFrame method, so `.loc[label].query` only yields the cell
        # value while `.loc[label]` happens to return a Series.
        query_bm25 = processed_df.loc[query, "query"]
        query_bert = original_df.loc[query, "query"]
    else:
        query_bm25 = query_bert = query

    # A scorer whose weight is zero contributes nothing, so it is not run at all
    # (with the default weights this avoids encoding the query with BERT).
    weighted_terms = []
    if bm25_weight:
        weighted_terms.append(bm25_weight * _minmax_column(score_BM25(query_bm25, not is_index)))
    if bert_weight:
        weighted_terms.append(bert_weight * _minmax_column(score_BERT(query_bert)))

    if weighted_terms:
        combined_scores = sum(weighted_terms[1:], weighted_terms[0])
    else:
        combined_scores = np.zeros((len(processed_df.index), 1))

    # Pair the combined scores with the document indices, best match first.
    # Assumes both scorers return one score per row of processed_df, in index order.
    return sorted(zip(combined_scores.flatten(), processed_df.index), reverse=True)
    

def score_BM25(query, preprocess = True):
    """Score every indexed document against a query with the BM25 index.

    Args:
        query: raw query text, or an already tokenized query when
            ``preprocess`` is false.
        preprocess: run ``preprocess_str`` on ``query`` before the lookup.

    Returns:
        A list of ``(similarity, doc_index)`` tuples in the order of
        ``processed_df.index`` (not sorted by score).
    """
    tokens = query
    if preprocess:
        tokens = preprocess_str(query)

    # Enforce binary weighting of queries
    query_weighter = TfidfModel(dictionary=dictionary, smartirs='bnn')
    weighted_query = query_weighter[dictionary.doc2bow(tokens)]

    # Pair each similarity with its document label; ranking is left to the caller.
    return list(zip(bm25_index[weighted_query], processed_df.index))

def score_BERT(query):
    """Score every document against a query using the sentence-embedding model.

    The query is encoded with normalized embeddings and compared to the
    pre-computed ``doc_embeddings`` by dot product.

    Args:
        query: the raw query text passed to ``model.encode``.

    Returns:
        A list of ``(similarity, doc_index)`` tuples in the order of
        ``original_df.index`` (not sorted by score).
    """
    encoded_query = model.encode(query, normalize_embeddings=True)

    # One dot product per row of doc_embeddings.
    dot_products = np.dot(doc_embeddings, encoded_query.T)

    # Pair each similarity with its document label; ranking is left to the caller.
    return list(zip(dot_products, original_df.index))