In [1]:
import os
import pickle
import copy
import numpy as np
import pandas as pd

from tqdm import tqdm
from collections import Counter, defaultdict
from transformers import AutoTokenizer


  from .autonotebook import tqdm as notebook_tqdm


Load data

In [None]:
tokenizer = AutoTokenizer.from_pretrained('t5-base', use_fast=True)

In [None]:
# Get oldid2doc
newid2doc, oldid2doc = dict(), dict()
fname = 'NQ_doc_content.tsv' # full
with open(fname, 'r') as f:
    for line in f.readlines():
        docid, _, _, content, _, _, _ = line.split("\t")
        oldid2doc[docid] = content.lower()

print(f'Number of documents: {len(oldid2doc)}')

Number of documents: 109739


In [6]:
Counter(tokenizer.tokenize(oldid2doc['0'])).most_common(10)

[('▁', 656),
 ('.', 161),
 (',', 134),
 ('▁the', 92),
 ('▁email', 78),
 ('▁to', 76),
 ('▁of', 67),
 ("'", 67),
 ('▁and', 60),
 ('a', 58)]

Count TF and IDF

In [None]:
token2df, tokenid2df = {}, {}  # Count DF without truncation

docid2len = {}
rows, cols, vals = [], [], []
max_len = 156 ### Truncating

for i, docid in tqdm(enumerate(oldid2doc.keys()), desc='Making sparse matrix'):
    tokens = tokenizer.tokenize(oldid2doc[docid])[:max_len]
    token_counter = Counter(tokens)
    
    docid2len[docid] = len(tokens)
    
    for token, count in token_counter.items():
        # Get token id
        tokenid = tokenizer.convert_tokens_to_ids(token)
        
        # # Count df
        # if token2df.get(token) is not None:
        #     token2df[token] += 1
        #     tokenid2df[tokenid] += 1
        # else:
        #     token2df[token] = 1
        #     tokenid2df[tokenid] = 1
        
        # Make sparse matrix (TF)
        rows.append(i)
        cols.append(tokenid)
        vals.append(count)

from scipy.sparse import csc_matrix

doc_matrix_csc = csc_matrix((vals, (rows, cols)), shape=(max(rows)+1, len(tokenizer)))
with open('doc_matrix_csc_full_truncate.pkl', 'wb') as f:
    pickle.dump(doc_matrix_csc, f, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('tokenid2df_full.pkl', 'wb') as f:
    pickle.dump(tokenid2df, f, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('token2df_full.pkl', 'wb') as f:
    pickle.dump(token2df, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# with open('doc_matrix_csc_full_truncate.pkl', 'rb') as f:
#     doc_matrix_csc = pickle.load(f)

# with open('tokenid2df_full_uncased.pkl', 'rb') as f:
#     tokenid2df = pickle.load(f)

# with open('token2df_full_uncased.pkl', 'rb') as f:
#     token2df = pickle.load(f)

In [6]:
# with open('doc_matrix_csc_full_uncased.pkl', 'rb') as f:
#     doc_matrix_csc = pickle.load(f)


In [9]:
doc_matrix_csc

<109739x32100 sparse matrix of type '<class 'numpy.int64'>'
	with 8720455 stored elements in Compressed Sparse Column format>

In [10]:
token2df

{'▁email': 1756,
 '▁marketing': 7128,
 '▁is': 108572,
 '▁the': 109665,
 '▁act': 30906,
 '▁of': 109491,
 '▁sending': 5105,
 '▁': 109739,
 'a': 109681,
 '▁commercial': 21854,
 '▁message': 20704,
 ',': 109737,
 '▁typically': 18014,
 '▁to': 108964,
 '▁group': 46344,
 '▁people': 60627,
 '▁using': 92850,
 '.': 109737,
 '▁in': 109339,
 '▁its': 76605,
 '▁broad': 16449,
 'est': 36302,
 '▁sense': 16959,
 '▁every': 34764,
 '▁sent': 21654,
 '▁potential': 16869,
 '▁or': 91442,
 '▁current': 34099,
 '▁customer': 3402,
 '▁could': 45724,
 '▁be': 100195,
 '▁considered': 33804,
 '▁it': 102006,
 '▁usually': 28919,
 '▁involves': 7745,
 '▁send': 12394,
 '▁advertisements': 1967,
 '▁request': 8569,
 '▁business': 31177,
 '▁solicit': 1057,
 '▁sales': 14631,
 '▁donations': 1643,
 '▁and': 109129,
 '▁meant': 11793,
 '▁build': 14157,
 '▁loyalty': 3401,
 '▁trust': 11857,
 '▁brand': 15804,
 '▁awareness': 4625,
 '▁emails': 526,
 '▁can': 83072,
 '▁purchased': 7463,
 '▁lead': 31470,
 '▁list': 62867,
 '▁database': 12580,

In [11]:
doc_matrix_arr = doc_matrix_csc.toarray()

In [12]:
len(tokenid2df), len(token2df)

(21855, 162032)

In [13]:
tokenid2df_copy = copy.deepcopy(tokenid2df)

# Get dl
dl = doc_matrix_csc.tocsr().sum(axis=1).A1
avgdl = np.mean(dl)
print(f"Average document length: {avgdl}")

# Get idf vector
tokenid2df_copy = copy.deepcopy(tokenid2df)
idf_vector = np.zeros((len(tokenizer))) # [|V|,]
print(idf_vector.shape)

for i, tokenid in tqdm(enumerate(tokenid2df_copy.keys())):
    # smooth idf
    idf_val = np.log(1 + (len(oldid2doc) - tokenid2df_copy[tokenid] + 0.5) / (tokenid2df_copy[tokenid] + 0.5))
    
    # idf_val = np.log(len(oldid2doc)/(tokenid2df_copy[tokenid]+1)) + 1
    idf_vector[tokenid] = idf_val
    
    # Debug
    if i < 10:
        print(tokenizer.convert_ids_to_tokens(tokenid), tokenid, idf_val)
        
# with open('idf_vector_full.pkl', 'wb') as f:
#     pickle.dump(idf_vector, f, protocol=pickle.HIGHEST_PROTOCOL)

Average document length: 155.9924001494455
(32100,)


21855it [00:00, 387868.50it/s]

▁email 791 4.134790738907085
▁marketing 1070 2.7340130980189508
▁is 19 0.010695779033405732
▁the 8 0.0006791078880264866
▁act 1810 1.267147413861793
▁of 13 0.0022670109580706014
▁sending 5657 3.067795541617374
▁ 3 4.556234181320757e-06
a 9 0.0005332203214916177
▁commercial 1328 1.6137070816371988





Get the most scored tokens from each document using TF, IDF


In [None]:
docid2topk = {}
docid2topkids = {}
topk = 50

# TODO Remove stop words
for i, docid in tqdm(enumerate(oldid2doc.keys()), desc='Getting topk tokens'):
    term_score = doc_matrix_arr[i] * idf_vector
    top_tokens = tokenizer.convert_ids_to_tokens(term_score.argsort()[-topk:][::-1])
    
    
    top_tokens_clean = []
    for t in top_tokens:
        if not any(char.isalpha() for char in t):
            continue
        if t in tokenizer.all_special_tokens:
            continue
        top_tokens_clean.append(t)
    docid2topk[docid] = top_tokens_clean

    top_tokenids_clean = [tokenizer.convert_tokens_to_ids(token) for token in top_tokens_clean]
    docid2topkids[docid] = top_tokenids_clean

Getting topk tokens: 109739it [07:09, 255.60it/s]


In [15]:
docid2topk['2'][:10], docid2topkids['2'][:10]

(['▁fertiliz',
  'ulation',
  '▁egg',
  '▁human',
  '▁sequence',
  '▁initi',
  'ation',
  '▁vitr',
  '▁finishes',
  '▁fertil'],
 [20617, 7830, 6182, 936, 5932, 19350, 257, 11614, 13084, 20859])

In [16]:
docid2topk_tfidf, docid2topkids_tfidf = copy.deepcopy(docid2topk), copy.deepcopy(docid2topkids)

Get the most scored tokens from each document using BM25

In [None]:
docid2topk = {}
docid2topkids = {}
topk = 50

k1, b = 100, 0.75

for i, docid in tqdm(enumerate(oldid2doc.keys()), desc='Getting topk tokens'):
    bm25_term_score = doc_matrix_arr[i] * (idf_vector * (k1 + 1)) / (doc_matrix_arr[i] + k1 * (1 - b + b * dl[i] / avgdl))    
    top_tokens = tokenizer.convert_ids_to_tokens(bm25_term_score.argsort()[-topk:][::-1])
    
    top_tokens_clean = []
    for t in top_tokens:
        if not any(char.isalpha() for char in t):
            continue
        if t in tokenizer.all_special_tokens:
            continue
        top_tokens_clean.append(t)
    docid2topk[docid] = top_tokens_clean

    top_tokenids_clean = [tokenizer.convert_tokens_to_ids(token) for token in top_tokens_clean]
    docid2topkids[docid] = top_tokenids_clean

Getting topk tokens: 109739it [09:15, 197.65it/s]


Check collision

In [18]:
docid2newid_token = defaultdict(list)
docid2newid = defaultdict(list)

docid2topkids = {k: [str(v) for v in docid2topkids[k]] for k in docid2topkids.keys()}

for topk in range(7, 13):
    newids = []
    for i, docid in enumerate(docid2topk.keys()):
        newid_token = '-'.join(docid2topk[docid][:topk])
        docid2newid_token[docid].append(newid_token)
        
        newid = '-'.join(docid2topkids[docid][:topk])
        docid2newid[docid].append(newid)
        
        newids.append(newid_token)
    newids_cnt = Counter(newids)
    print(f"Max num of collision and index of collision: {max(newids_cnt.values())}, {newids_cnt.most_common(1)[0][0]}")
    print(f"Topk {topk} | Number of unique newids: {len(set(newids))} ({(len(set(newids)) / len(newids)) * 100:.1f}%)")

Max num of collision and index of collision: 26, ▁leader-▁president-tempo-▁minority-▁majority-▁hatch-▁congressional
Topk 7 | Number of unique newids: 109148 (99.5%)
Max num of collision and index of collision: 26, ▁leader-▁president-tempo-▁minority-▁majority-▁hatch-▁congressional-▁speaker
Topk 8 | Number of unique newids: 109214 (99.5%)
Max num of collision and index of collision: 22, ▁leader-▁president-tempo-▁minority-▁majority-▁hatch-▁congressional-▁speaker-chuck
Topk 9 | Number of unique newids: 109272 (99.6%)
Max num of collision and index of collision: 22, ▁leader-▁president-tempo-▁minority-▁majority-▁hatch-▁congressional-▁speaker-chuck-▁districts
Topk 10 | Number of unique newids: 109304 (99.6%)
Max num of collision and index of collision: 18, ▁leader-▁president-tempo-▁minority-▁majority-▁hatch-▁congressional-▁speaker-chuck-▁districts-▁representatives
Topk 11 | Number of unique newids: 109339 (99.6%)
Max num of collision and index of collision: 17, yana-hak-chip-aka-ha-riya-atta-

Add index to duplicated ids

In [21]:
docid2newid_unique = dict()

for topk in range(7, 8):

    tmp_newids = []
    
    # Check if duplicated
    id_cnt_all = defaultdict(int)
    for k, v in docid2topkids.items():
        tmp_id = '-'.join(v[:topk])
        id_cnt_all[tmp_id] += 1
        

    # Assign new id
    id_cnt_now = defaultdict(int)
    for k, v in docid2topkids.items():
        tmp_id = '-'.join(v[:topk])

        if k not in docid2newid_unique:
            docid2newid_unique[k] = []
        
        # No duplicate ids
        if id_cnt_all[tmp_id] == 1:
            newid = tmp_id
            tmp_newids.append(tmp_id)
        
        # duplicate ids
        else:
            newid = tmp_id + f"-{id_cnt_now[tmp_id]}"
            tmp_newids.append(newid)

        docid2newid_unique[k].append(newid)
        id_cnt_now[tmp_id] += 1

    # Check uniqueness
    print(f"Topk {topk} | Number of unique newids: {len(set(tmp_newids))} ({(len(set(tmp_newids)) / len(tmp_newids)) * 100:.1f}%)")

Topk 7 | Number of unique newids: 109739 (100.0%)


Write files

In [23]:
src_dir = "../NQ_dataset"
dst_dir = "../NQ_dataset_bm25_truncate_2/"

if not os.path.exists(dst_dir):
    os.mkdir(dst_dir)

fnames = ['nq_train_doc_newid.tsv', 'nq_dev_doc_newid.tsv', 'NQ_512_qg.tsv', 'nq_title_abs.tsv', 'NQ_doc_aug.tsv', 'NQ_ids2.tsv']

for filename in fnames:
    fname = os.path.join(src_dir, filename)
    newfname = os.path.join(dst_dir, filename)

    print(f">> {fname} processing ...")

    df = pd.read_csv(fname, encoding='utf-8', sep='\t', dtype={'query': str, 'queryid': str, 'oldid': str})
    print(f"> shape: {df.shape}")

    selected_cols = ['query', 'queryid', 'oldid'] if filename != 'NQ_ids2.tsv' else ['oldid']
    
    df = df[selected_cols]
    id_class = 't5_bm25_truncate_'
    for topk in range(7, 8):
        docid2newid_unique_topk = {k: v[topk-7] for k, v in docid2newid_unique.items()}
        df[id_class + str(topk)] = df['oldid'].map(docid2newid_unique_topk)
        
    
    # Count if NaN exists
    cnt = df.isnull().sum().sum()
    print(f">> NaN count: {cnt}")
            
    df.to_csv(newfname, sep='\t', index=False, encoding='utf-8')
    print(f">> {fname} saved to {newfname}")

    df.head()

>> ../NQ_dataset/nq_train_doc_newid.tsv processing ...
> shape: (307373, 21)
>> NaN count: 0
>> ../NQ_dataset/nq_train_doc_newid.tsv saved to ../NQ_dataset_bm25_truncate_2/nq_train_doc_newid.tsv
>> ../NQ_dataset/nq_dev_doc_newid.tsv processing ...
> shape: (7830, 20)
>> NaN count: 0
>> ../NQ_dataset/nq_dev_doc_newid.tsv saved to ../NQ_dataset_bm25_truncate_2/nq_dev_doc_newid.tsv
>> ../NQ_dataset/NQ_512_qg.tsv processing ...
> shape: (1774887, 20)
>> NaN count: 2
>> ../NQ_dataset/NQ_512_qg.tsv saved to ../NQ_dataset_bm25_truncate_2/NQ_512_qg.tsv
>> ../NQ_dataset/nq_title_abs.tsv processing ...
> shape: (315146, 20)
>> NaN count: 18
>> ../NQ_dataset/nq_title_abs.tsv saved to ../NQ_dataset_bm25_truncate_2/nq_title_abs.tsv
>> ../NQ_dataset/NQ_doc_aug.tsv processing ...
> shape: (1096198, 20)
>> NaN count: 1
>> ../NQ_dataset/NQ_doc_aug.tsv saved to ../NQ_dataset_bm25_truncate_2/NQ_doc_aug.tsv
>> ../NQ_dataset/NQ_ids2.tsv processing ...
> shape: (109739, 19)
>> NaN count: 0
>> ../NQ_dataset/

Write files with docid2newid_token_unique into token


In [24]:
# Add index to duplicated ids
docid2newid_token_unique = dict()

for topk in range(7, 8):

    tmp_newids = []
    
    # Check if duplicated
    id_cnt_all = defaultdict(int)
    for k, v in docid2newid_token.items():
        tmp_id = v[topk-7]
        id_cnt_all[tmp_id] += 1
        

    # Assign new id
    id_cnt_now = defaultdict(int)
    for k, v in docid2newid_token.items():
        tmp_id = v[topk-7]

        if k not in docid2newid_token_unique:
            docid2newid_token_unique[k] = []
        
        # No duplicate ids
        if id_cnt_all[tmp_id] == 1:
            newid = tmp_id
            tmp_newids.append(tmp_id)
        
        # duplicate ids
        else:
            newid = tmp_id + f"-{id_cnt_now[tmp_id]}"
            tmp_newids.append(newid)

        docid2newid_token_unique[k].append(newid)
        id_cnt_now[tmp_id] += 1

    # Check uniqueness
    print(f"Topk {topk} | Number of unique newids: {len(set(tmp_newids))} ({(len(set(tmp_newids)) / len(tmp_newids)) * 100:.1f}%)")


Topk 7 | Number of unique newids: 109739 (100.0%)


In [25]:
src_dir = "../NQ_dataset"
dst_dir = "../NQ_dataset_bm25_lexical_truncate_2/"

if not os.path.exists(dst_dir):
    os.makedirs(dst_dir)

fnames = ['nq_train_doc_newid.tsv', 'nq_dev_doc_newid.tsv', 'NQ_512_qg.tsv', 'nq_title_abs.tsv', 'NQ_doc_aug.tsv', 'NQ_ids2.tsv']

for filename in fnames:
    fname = os.path.join(src_dir, filename)
    newfname = os.path.join(dst_dir, filename)

    print(f">> {fname} processing ...")

    df = pd.read_csv(fname, encoding='utf-8', sep='\t', dtype={'query': str, 'queryid': str, 'oldid': str})
    print(f"> shape: {df.shape}")
    
    selected_cols = ['query', 'queryid', 'oldid'] if filename != 'NQ_ids2.tsv' else ['oldid']
 
    df = df[selected_cols]
    id_class = 't5_bm25_truncate_'
    for topk in range(7, 8):
        docid2newid_unique_topk = {k: v[topk-7] for k, v in docid2newid_token_unique.items()}
        df[id_class + str(topk)] = df['oldid'].map(docid2newid_unique_topk)
            
    df.to_csv(newfname, sep='\t', index=False, encoding='utf-8')
    print(f">> {fname} saved to {newfname}")

    df.head()

>> ../NQ_dataset/nq_train_doc_newid.tsv processing ...
> shape: (307373, 21)
>> ../NQ_dataset/nq_train_doc_newid.tsv saved to ../NQ_dataset_bm25_lexical_truncate_2/nq_train_doc_newid.tsv
>> ../NQ_dataset/nq_dev_doc_newid.tsv processing ...
> shape: (7830, 20)
>> ../NQ_dataset/nq_dev_doc_newid.tsv saved to ../NQ_dataset_bm25_lexical_truncate_2/nq_dev_doc_newid.tsv
>> ../NQ_dataset/NQ_512_qg.tsv processing ...
> shape: (1774887, 20)
>> ../NQ_dataset/NQ_512_qg.tsv saved to ../NQ_dataset_bm25_lexical_truncate_2/NQ_512_qg.tsv
>> ../NQ_dataset/nq_title_abs.tsv processing ...
> shape: (315146, 20)
>> ../NQ_dataset/nq_title_abs.tsv saved to ../NQ_dataset_bm25_lexical_truncate_2/nq_title_abs.tsv
>> ../NQ_dataset/NQ_doc_aug.tsv processing ...
> shape: (1096198, 20)
>> ../NQ_dataset/NQ_doc_aug.tsv saved to ../NQ_dataset_bm25_lexical_truncate_2/NQ_doc_aug.tsv
>> ../NQ_dataset/NQ_ids2.tsv processing ...
> shape: (109739, 19)
>> ../NQ_dataset/NQ_ids2.tsv saved to ../NQ_dataset_bm25_lexical_truncate_