In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import joblib
import os

### Partition the datasets

Here We still need to cut the English corpus into ten parts, because we will encounter memory crashes during processing, but we can prevent errors caused by splitting the data set by calculating the global idf and vectorizer.

In [None]:
# partition into 10 parts
def split_corpus(corpus, n_splits=10):
    chunk_size = len(corpus) // n_splits
    for i in range(n_splits):
        start = i * chunk_size
        end = (i + 1) * chunk_size if i < n_splits - 1 else len(corpus)
        corpus_chunk = corpus.iloc[start:end]
        corpus_chunk.to_csv(f'Data/english_parts/corpus_part_{i}_new.csv', index=False)
        print(f'Saved corpus part {i} with {len(corpus_chunk)} documents.')


# 读取corpus
corpus = pd.read_csv('Data/preprocess_corpus/bm25corpus_en.csv')
split_corpus(corpus, n_splits=10)


### BM25 Computation

In [2]:
import numpy as np
import pandas as pd
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix


# calculate the average document length of the corpus
def _avg_doc_length(corpus):
    documents = corpus["text_token"]
    return np.mean([len(doc) for doc in documents])


# calculate the global idf and vectorizer
def calculate_global_idf(corpus, max_features=10000):
    vectorizer = CountVectorizer(max_features=max_features)
    doc_term_matrix = vectorizer.fit_transform(corpus["text_token"])
    df = np.count_nonzero(doc_term_matrix.toarray(), axis=0)
    N = len(corpus)
    idf = np.log(1 + (N - df + 0.5) / (df + 0.5))
    return idf, vectorizer

# calculate the BM25 score for a corpus part and save the result
def save_bm25_corpus_part(part_file, output_file, idf, vectorizer, avg_doc_length, k1=1.25, b=0.75):
    corpus_part = pd.read_csv(part_file)
    doc_term_matrix = vectorizer.transform(corpus_part['text_token'])
    
    F = doc_term_matrix.toarray()
    doc_lengths = F.sum(axis=1)
    
    # calculate BM25
    numerator = F * (k1 + 1)
    denominator = F + k1 * (1 - b + b * (doc_lengths[:, None] / avg_doc_length))
    F_adjusted = numerator / denominator
    idf_times_F_adjusted = csr_matrix(idf * F_adjusted)  # change to sparse matrix

    # save the result
    doc_ids = corpus_part['docid'].tolist()
    joblib.dump((idf_times_F_adjusted, doc_ids), output_file)
    print(f'Saved BM25 part to {output_file}')


### Get joblib for corpus file

It's time-consuming, so you can use joblib we provide directly :)

Here we use joblib to efficiently store large sparse matrices.

In this way, we can get 10 joblibs for the 10 corpus parts

In [3]:
# run
# precaution: make sure the corpus is already tokenized
corpus_files = [f'Data/english_corpus/corpus_part_{i}.csv' for i in range(10)]
corpus = pd.concat([pd.read_csv(file) for file in corpus_files])  # corpus_files : list of file names
idf, vectorizer = calculate_global_idf(corpus)
avg_doc_length = _avg_doc_length(corpus)

# 对每个切片应用 BM25 计算
for part_file in corpus_files:
    output_file = f"english_joblib/{part_file.split('.')[0]}_bm25.joblib"
    save_bm25_corpus_part(part_file, output_file, idf, vectorizer, avg_doc_length)


FileNotFoundError: [Errno 2] No such file or directory: 'Data/english_corpus/corpus_part_0.csv'

### Load joblib

suppose you have got the joblibs and vectorizer

In [None]:
import numpy as np
import pandas as pd
import joblib
import string



# get the top n documents for each query
def _bm25_query_part(query, part_file,vectorizer, top_n=10):
    # load the BM25 part
    idf_times_F_adjusted, doc_ids = joblib.load(part_file)

    idf_times_F_adjusted = idf_times_F_adjusted.toarray()
    query_term_matrix = vectorizer.transform(query['query_token'])
    # calculate BM25 scores
    BM25_scores = query_term_matrix @ idf_times_F_adjusted.T
    
    # get the top n documents for each query
    pos_docs = []
    for query_idx in range(query_term_matrix.shape[0]):
        scores = BM25_scores[query_idx]
        top_doc_indices = np.argsort(scores)[-top_n:][::-1]
        pos_docs.extend([(doc_ids[idx], scores[idx]) for idx in top_doc_indices])
    
    return pos_docs

# get the top n documents for each query
def bm25_query(query_file, vectorizer, language, n_splits=10, top_n=10):
    # load the queries
    queries = pd.read_csv(query_file)

    # get the BM25 results for each query
    all_results = {query_id: [] for query_id in queries['query_id']}
    for i in range(n_splits):
        part_file = f'corpus/corpus_part_{i}_bm25.joblib'
        part_results = _bm25_query_part(queries, part_file, vectorizer, top_n=top_n)
        
        # add the results to the final results
        for query_idx, query_id in enumerate(queries['query_id']):
            all_results[query_id].extend(part_results[query_idx * top_n:(query_idx + 1) * top_n])

    # sort the results
    final_results = {}
    for query_id, docs in all_results.items():
        sorted_docs = sorted(docs, key=lambda x: x[1], reverse=True)[:top_n]
        final_results[query_id] = [doc_id for doc_id, score in sorted_docs]

    return final_results


In [None]:
# run
language = "en"  
# load the vectorizer
vectorizer = joblib.load(f'english_joblib/vectorizer_en.joblib')
query_file = ('Data/preprocess_query/bm25_en_query.csv')
pos_docs = bm25_query(query_file, vectorizer, language)

### Evaluation

In [None]:
# Evaluate the results
query = pd.read_csv('Data/dev.csv')
# get the positive documents for each query
query_dict = {}
for i in range(len(query)):
    query_dict[query['query_id'][i]] = query['positive_docs'][i]


acc = 0
# if the retrieved documents are in the positive documents
for key in pos_docs.keys():
    if query_dict[key] in pos_docs[key]:
        acc += 1

# calculate the accuracy
length = len(query[query['lang'] == 'en'])
print("Accuracy: ", acc/length)