In [1]:
import transformers
import os
import re
from collections import Counter
from transformers import BertConfig, BertForMaskedLM, BertTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset
# create a matrix
from scipy.sparse import lil_matrix
import numpy as np
from foobar import *

In [2]:
def build_cooccur(vocab, corpus_pre, win = 2):
    
    cooccur =  lil_matrix((len(vocab), len(vocab)), dtype=np.float64)
    
    for line in corpus_pre:
        tokens = [t for t in line.strip().split() if t in vocab]
        len_tokens = len(tokens)
        #print(tokens)
        
        for i, tok in enumerate(tokens):
            #print(i)
            #print(tok)
            
            start = max(0, i - win)
            end = min(len_tokens, i + win + 1)
            for j in range(start, end):
                if j != i:
                    distance = np.absolute(j-i)
                    increment = 1.0 / float(distance)
                    #increment = 1.0
                    cooccur[vocab[tok][0], vocab[tokens[j]][0]] += increment
    return cooccur

In [6]:
# Load your corpus
with open('he_wiki_cleaned.txt', 'r', encoding='utf-8') as f:
    corpus_file = [line.strip() for line in f if line.strip()]

In [4]:
vocabthr1000_dic = np.load("vocabthr1000_dic.npy", allow_pickle=True)
vocabthr1000_dic = dict(enumerate(vocabthr1000_dic.flatten(), 1))[1]
len(vocabthr1000_dic)

64856

In [8]:
# When threshold is 10000, there are in total 15135 unique words

vocabthr10000 = [word for word, freq in vocabthr1000_dic.items() if freq[1] >= 10000]
len(vocabthr10000)

15135

In [14]:
def check_word_pairs_in_corpus(word_pairs, corpus_vocab):
    count = 0
    included_pairs = []
    
    for pair, score in word_pairs:
        word1, word2 = pair
        # Check if both words in the pair are present in the corpus
        if word1 in corpus_vocab and word2 in corpus_vocab:
            count += 1
            included_pairs.append(pair)
    
    return count, included_pairs

In [15]:
# When threshold is 10000, there are in total 294 word pairs for Rare dataset

import random

vocab_temp = vocabthr10000
vocab_pro = [1, 0.2, 0.1, 0.05]
test_files = ['wordsim353.txt', 'men_dataset.txt', 'mturk.txt', 'rarewords.txt', 'simlex999.txt']
for j in range(len(vocab_pro)):
    print(vocab_pro[j])
    random.seed(4)
    print(np.round(len(vocab_temp)*vocab_pro[j]).astype(int))

    vocab_temp_sample = random.sample(vocab_temp, np.round(len(vocab_temp)*vocab_pro[j]).astype(int))
    for i in range(len(test_files)):
        path_input = "D:\\20240901paper3\\similarities\\"+test_files[i]
        test_word_pairs_set = read_test_word_pairs_set(path_input)
        not_included = check_word_pairs_in_corpus(test_word_pairs_set, vocab_temp_sample)
        print(not_included[0])

1
15135
294
2159
256
294
810
0.2
3027
14
78
5
16
35
0.1
1514
6
22
1
4
9
0.05
757
2
5
1
2
3


In [17]:
# Save the vocabulary to a file
vocab_file_path = 'vocabthr10000.txt'
with open(vocab_file_path, 'w', encoding='utf-8') as vocab_file:
    for token in vocabthr10000:
        vocab_file.write(f"{token}\n")
        

In [18]:
# Filter the dictionary based on words in vocabthr10000.txt
vocabthr10000_dic = {word: value for word, value in vocabthr1000_dic.items() if word in vocabthr10000}
np.save("vocabthr10000_dic", vocabthr10000_dic)

In [20]:
len(vocabthr10000_dic)

15135

In [19]:
# index starts from 0
unique_values = [value[0] for value in vocabthr10000_dic.values()]
print(unique_values)
value_to_index = {val: idx for idx, val in enumerate(unique_values)}
vocabthr10000_dic_index = {key: (value_to_index[value[0]], value[1]) for key, value in vocabthr10000_dic.items()}
np.save("vocabthr10000_dic_index", vocabthr10000_dic_index)
list(vocabthr10000_dic_index.items())[-5:]

[0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 89, 90, 91, 92, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115, 116, 117, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 153, 154, 155, 156, 157, 158, 160, 161, 162, 163, 164, 165, 166, 168, 169, 170, 173, 174, 175, 176, 177, 178, 180, 181, 182, 184, 185, 186, 189, 190, 192, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 212, 213, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 232, 233, 234, 235, 236, 237, 238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 2

[('passeriformesfamily', (15130, 11148)),
 ('stylefontsize', (15131, 12375)),
 ('fuscous', (15132, 11799)),
 ('srcstandings', (15133, 43822)),
 ('startstylescss', (15134, 43822))]

In [21]:
len(vocabthr10000_dic_index)

15135

In [4]:
del vocabthr15000_dic_index

In [5]:
vocabthr10000_dic_index = np.load("vocabthr10000_dic_index.npy", allow_pickle=True)
vocabthr10000_dic_index = dict(enumerate(vocabthr10000_dic_index.flatten(), 1))[1]
len(vocabthr10000_dic_index)

15135

In [7]:
win = 2
cooccur_sparse_thr10000 = build_cooccur(vocabthr10000_dic_index, corpus_file, win)
np.save("cooccur_sparse_thr10000", cooccur_sparse_thr10000)

OSError: [Errno 22] Invalid argument: 'cooccur_sparse_thr10000.npy'

In [12]:
del corpus_file

In [13]:
np.save("D:\\20240901paper3\\20240920\\cooccur_sparse_thr10000", cooccur_sparse_thr10000)

In [10]:
cooccur_sparse_thr10000

<15135x15135 sparse matrix of type '<class 'numpy.float64'>'
	with 98936032 stored elements in List of Lists format>

In [4]:
#load basic matrix and elements that are neeeded for later
cooccur_sparse_thr10000 = np.load("cooccur_sparse_thr10000.npy", allow_pickle=True)
cooccur_sparse_thr10000

array(<15135x15135 sparse matrix of type '<class 'numpy.float64'>'
	with 98936032 stored elements in List of Lists format>, dtype=object)

In [6]:
cooccur_sparse_thr10000q = dict(enumerate(cooccur_sparse_thr10000.flatten(), 1))[1]


In [7]:
cooccur_sparse_thr10000q

<15135x15135 sparse matrix of type '<class 'numpy.float64'>'
	with 98936032 stored elements in List of Lists format>

In [8]:
def check_symmetric(a, tol=1e-8):
    return np.all(np.abs(a-a.T) < tol)
cooccur_dense_thr10000 = cooccur_sparse_thr10000q.todense()
check_symmetric(cooccur_dense_thr10000)


rank_thr10000 = matrix_rank(cooccur_dense_thr10000)
rank_thr10000
np.save("cooccur_dense_thr10000", cooccur_dense_thr10000)

In [3]:
cooccur_dense_thr10000 = np.load("cooccur_dense_thr10000.npy")
def check_symmetric(a, tol=1e-8):
    return np.all(np.abs(a-a.T) < tol)
check_symmetric(cooccur_dense_thr10000)

True

In [9]:
rank_thr10000

15135

In [5]:
# When threshold is 15000, there are in total 11622 unique words

vocabthr15000 = [word for word, freq in vocabthr1000_dic.items() if freq[1] >= 15000]
len(vocabthr15000)

11622

In [24]:
# When threshold is 15000, there are in total 219 word pairs for Rare dataset

import random
vocab_temp = vocabthr15000
vocab_pro = [1, 0.2, 0.1, 0.05]
test_files = ['wordsim353.txt', 'men_dataset.txt', 'mturk.txt', 'rarewords.txt', 'simlex999.txt']
for j in range(len(vocab_pro)):
    print(vocab_pro[j])
    random.seed(4)
    print(np.round(len(vocab_temp)*vocab_pro[j]).astype(int))

    vocab_temp_sample = random.sample(vocab_temp, np.round(len(vocab_temp)*vocab_pro[j]).astype(int))
    for i in range(len(test_files)):
        path_input = "D:\\20240901paper3\\similarities\\"+test_files[i]
        test_word_pairs_set = read_test_word_pairs_set(path_input)
        not_included = check_word_pairs_in_corpus(test_word_pairs_set, vocab_temp_sample)
        print(not_included[0])

1
11622
270
1877
223
219
738
0.2
2324
10
53
10
7
20
0.1
1162
3
16
1
2
8
0.05
581
0
7
1
1
2
