In [16]:
# ! pip install git+https://github.com/erikavaris/tokenizer.git
# ! pip install tqdm

In [1]:
from tokenizer import tokenizer
from tqdm import tqdm
# Single tokenizer instance; later cells call T.tokenize() on every sentence.
T = tokenizer.TweetTokenizer(
    preserve_handles=False,
    preserve_hashes=False,
    preserve_case=True,
    preserve_url=False,
)

In [2]:
# Read every line of the corpus file into `sents` (trailing newlines are kept).
with open('./result.txt') as corpus_file:
    sents = list(corpus_file)
len(sents)

23543676

In [3]:
# Remove duplicate lines. dict.fromkeys keeps the first occurrence of each
# line in its original position, so the resulting order is identical on every
# run; list(set(...)) returns an arbitrary order that can change between runs.
sents = list(dict.fromkeys(sents))
len(sents)

14193880

In [4]:
# load dictionary 
import numpy as np
import io
def load_vec(emb_path, nmax=50000):
    """Load word embeddings from a text file and build vocabulary mappings.

    The first line of the file is skipped as a header. Every following line
    is expected to be ``word v1 v2 ... vd`` separated by single spaces.

    Four special tokens (``<pad>``, ``<unk>``, ``<sos>``, ``<eos>``) occupy
    ids 0-3 and get all-zero vectors whose width matches the vectors in the
    file (300 is used only when the file contains no vectors at all).

    Args:
        emb_path: path to the embedding text file.
        nmax: reading stops once the vocabulary, special tokens included,
            holds exactly this many entries.

    Returns:
        A tuple ``(embeddings, id2word, word2id)`` where ``embeddings`` is a
        2-D array with one row per id, and the two dicts map id -> word and
        word -> id.

    Raises:
        AssertionError: if a word appears twice in the file or collides with
            one of the special tokens.
    """
    special_tokens = ('<pad>', '<unk>', '<sos>', '<eos>')
    word2id = {token: idx for idx, token in enumerate(special_tokens)}
    word_vectors = []
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)  # skip the header line
        for line in f:
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            word_vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    # Size the special-token rows from the data instead of assuming 300, so
    # np.vstack works for embeddings of any dimensionality.
    dim = word_vectors[0].shape[0] if word_vectors else 300
    vectors = [np.zeros(dim) for _ in special_tokens] + word_vectors
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

In [None]:
# Embedding files for the source (vectors-en) and target (vectors-es) vocabularies.
src_path = '../MUSE/dumped/6pzywzu6yg/vectors-en.txt'
tgt_path = '..//MUSE/dumped/6pzywzu6yg/vectors-es.txt'
nmax = 1000000  # maximum number of word embeddings to load

src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)
# Only a cell's final expression is displayed, so report both vocabulary
# sizes together instead of leaving a discarded len() call mid-cell.
len(src_id2word), len(tgt_id2word)

In [58]:
# token counts in the document
def batch_process(sents):
    """Count vocabulary membership for every token in a batch of sentences.

    Each sentence is tokenized with the module-level tokenizer ``T`` and each
    token is looked up in ``src_word2id`` and ``tgt_word2id``.

    Args:
        sents: iterable of raw sentence strings.

    Returns:
        A tuple ``(en_count, es_count, common_count, unk_count, word_count)``:
        tokens found in the source vocabulary, tokens found in the target
        vocabulary, tokens found in both, tokens found in neither, and the
        total number of tokens.
    """
    unk_count = 0
    en_count = 0
    es_count = 0
    word_count = 0
    common_count = 0
    for sent in sents:
        for word in T.tokenize(sent):
            in_src = word in src_word2id
            in_tgt = word in tgt_word2id
            if in_src:
                en_count += 1
            if in_tgt:
                es_count += 1
            if in_src and in_tgt:
                common_count += 1
            # A token is unknown only when neither vocabulary contains it;
            # target-only tokens must not be counted as unknown.
            if not (in_src or in_tgt):
                unk_count += 1
            word_count += 1

    return en_count, es_count, common_count, unk_count, word_count
import multiprocessing

# Split the corpus into roughly 40 batches. max(1, ...) keeps the slice step
# positive when there are fewer than 40 sentences (range() raises on step 0).
batch_size = max(1, len(sents)//40)
batched_sents = [ sents[i:i+batch_size] for i in range(0,len(sents),batch_size) ]

# The with-block shuts the worker processes down even if map() raises.
with multiprocessing.Pool(processes=40) as pool:
    answer = pool.map(batch_process, batched_sents)

# One row per batch; columns are en, es, common, unk, total. The reshape keeps
# the column indexing below valid even when there are no batches at all.
answer = np.array(answer, dtype=np.int64).reshape(-1, 5)
print("en tokens: ", answer[:,0].sum())
print("es tokens: ", answer[:,1].sum())
print("common tokens: ", answer[:,2].sum())
print("unk tokens", answer[:,3].sum())
print("total tokens", answer[:,4].sum())

In [None]:
def each(sent, min_ratio=0.3):
    """Tokenize a sentence and keep it only if it mixes both vocabularies.

    The sentence is tokenized with the module-level tokenizer ``T``. It is
    kept when the share of tokens found in ``src_word2id`` and the share of
    tokens found in ``tgt_word2id`` are both at least ``min_ratio``.

    Args:
        sent: raw sentence string.
        min_ratio: minimum fraction of tokens that must be present in each
            vocabulary (defaults to 0.3).

    Returns:
        The list of tokens when the sentence qualifies, otherwise ``None``
        (also ``None`` when tokenization yields no tokens).
    """
    tokens = T.tokenize(sent)
    word_count = len(tokens)

    # Guard against division by zero for sentences with no tokens.
    if word_count <= 0:
        return None

    en_count = sum(1 for word in tokens if word in src_word2id)
    es_count = sum(1 for word in tokens if word in tgt_word2id)

    if en_count / word_count >= min_ratio and es_count / word_count >= min_ratio:
        return tokens
    return None

In [None]:
def batch_process(sents):
    """Run ``each`` over a batch and collect the sentences it keeps.

    NOTE: this shares its name with the token-counting function defined
    earlier in the notebook; whichever definition was executed last is the
    one in effect.

    Args:
        sents: iterable of raw sentence strings.

    Returns:
        A list with the non-``None`` results of ``each``, in input order.
    """
    filtered = (each(raw) for raw in sents)
    return [tokens for tokens in filtered if tokens is not None]

In [None]:
# Split the corpus into roughly 40 batches. max(1, ...) avoids a zero slice
# step, which makes range() raise ValueError when there are fewer than 40 sentences.
batch_size = max(1, len(sents)//40)
batched_sents = [ sents[i:i+batch_size] for i in range(0,len(sents),batch_size) ]

In [None]:
import multiprocessing

# Filter every batch in a worker process. The with-block tears the pool down
# even when map() raises, so worker processes are never left behind.
with multiprocessing.Pool(processes=40) as pool:
    answer = pool.map(batch_process, batched_sents)

In [None]:
# Write each kept sentence as one line of space-separated tokens.
with open("code-mixed.txt", "w") as f:
    for batch in answer:
        for sent in batch:
            f.write(' '.join(sent) + "\n")