Pipeline for extracting features from a single text

In [1]:
import numpy as np
from tqdm import tqdm, trange
import glob

# src

In [2]:
def get_word_space(model, filename):
    """Build the embedding matrix of the distinct in-vocabulary words of a text file.

    The file is read as UTF-8 and split on whitespace. Every distinct token
    for which ``w in model`` holds contributes exactly one row, ``model[w]``.

    Rows follow the order in which each word first appears in the file, so the
    result is reproducible across runs (iterating a ``set`` of strings is not,
    because string hashing is randomized per process).

    Args:
        model: Object supporting ``w in model`` and ``model[w]``.
        filename: Path of the text file to read.

    Returns:
        The array produced by ``np.vstack`` over the word vectors, one row per
        distinct in-vocabulary word.

    Raises:
        ValueError: If no token of the file is present in ``model``.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        corpus = f.read()
    # dict.fromkeys de-duplicates while keeping first-occurrence order.
    words = list(dict.fromkeys(w for w in corpus.split() if w in model))
    if not words:
        # np.vstack([]) would raise a ValueError anyway; keep the exception
        # type but say which file caused it.
        raise ValueError(f"No in-vocabulary words found in {filename!r}")
    return np.vstack([model[w] for w in words])

def get_ngram_space(model, filename, n=2):
    """Build the embedding matrix of the distinct valid n-grams of a text file.

    The file is read as UTF-8 and split on whitespace. A window of ``n``
    consecutive tokens is kept only if none of its tokens is ``'.'`` and all
    of them are in ``model``. Each distinct window (first occurrence only)
    contributes one row: the ``np.hstack`` of its token vectors, in order.

    Args:
        model: Object supporting ``w in model`` and ``model[w]``.
        filename: Path of the text file to read.
        n: Number of tokens per n-gram.

    Returns:
        ``np.array`` of the collected rows, in order of first occurrence.
        When no window qualifies this is an empty 1-D array.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        tokens = f.read().split()

    seen_keys = set()
    rows = []
    for start in range(len(tokens) - n + 1):
        window = [tokens[start + offset] for offset in range(n)]
        # Reject windows that cross a sentence-ending '.' token or contain
        # an out-of-vocabulary token.
        if any(tok == '.' or tok not in model for tok in window):
            continue
        key = " ".join(window)
        if key in seen_keys:
            continue
        rows.append(np.hstack([model[tok] for tok in window]))
        seen_keys.add(key)

    return np.array(rows)

from sklearn.metrics import pairwise_distances
def get_dist_to_centers_array(e, hole_e_centers):
    """Cosine distances from every row of ``e`` to every hole centre, plus their mean.

    Args:
        e: Array of embeddings, one per row.
        hole_e_centers: Array of hole centres, one per row.

    Returns:
        Array with one row per row of ``e``. The leading columns are the
        cosine distances to each row of ``hole_e_centers``; the last column
        is the mean of those distances for that row.
    """
    center_dists = pairwise_distances(e, hole_e_centers, metric='cosine')
    row_mean = center_dists.mean(axis=1, keepdims=True)
    return np.concatenate([center_dists, row_mean], axis=1)

from collections import Counter
def get_dist_array(e, hole_embs, apply_func=np.min):
    """Reduce the cosine distances from each row of ``e`` to each hole's points.

    For every hole in ``hole_embs`` the full cosine distance matrix between
    ``e`` and that hole's embeddings is computed and reduced over the hole's
    points with ``apply_func`` (called with ``axis=1``).

    Args:
        e: Array of embeddings, one per row.
        hole_embs: Iterable of arrays, one array of embeddings per hole.
        apply_func: Reduction applied along the hole's points (default ``np.min``).

    Returns:
        Array with one row per row of ``e``: one column per hole followed by
        a final column holding the mean across holes.
    """
    per_hole = []
    for hole in hole_embs:
        pairwise = pairwise_distances(e, hole, metric='cosine')
        per_hole.append(apply_func(pairwise, axis=1))
    stacked = np.vstack(per_hole)
    # Append the across-hole mean as an extra row before transposing.
    with_mean = np.vstack([stacked, stacked.mean(axis=0)])
    return with_mean.T
def get_most_common_closest_hole(min_dist):
    """Share of rows closest to each hole, plus the index of the most frequent one.

    Args:
        min_dist: 2-D array of distances, one row per point and one column
            per hole.

    Returns:
        1-D float array: for each column, the fraction of rows whose smallest
        value lies in that column, followed by the index of the column with
        the largest fraction.
    """
    n_points, n_holes = min_dist.shape[0], min_dist.shape[1]
    closest = min_dist.argmin(axis=1)
    # minlength keeps a zero entry for holes that are never the closest one.
    shares = np.bincount(closest, minlength=n_holes) / n_points
    return np.hstack([shares, [shares.argmax()]])

In [3]:
import pandas as pd

def process(files, model, lang='RU', data_part='Train', text_type='lit'):
    """Extract hole-distance features for every file and save one CSV per n-gram level.

    For each level in ``word``, ``bigram``, ``trigram`` the function loads
    ``holes/{LANG}/{part}s/hole_embeddings.npy``, builds the embedding space of
    every file at that level and concatenates, per file:

      * the column means of ``get_dist_to_centers_array`` (distances to hole centres),
      * the column means of the per-hole minimum distances,
      * the column means of the per-hole maximum distances,
      * the output of ``get_most_common_closest_hole`` on the minimum distances.

    The rows of one level are written to
    ``features/{data_part}_{lang}_{text_type}_{part}_features.csv`` together with
    a ``text`` column holding the file identifier (the part of the path after
    the last underscore, with its last four characters removed).

    Args:
        files: Iterable of text file paths.
        model: Embedding lookup passed through to the space builders.
        lang: Language code; upper-cased for the hole path, used as given in
            the output file name.
        data_part: Dataset split name used in the output file name.
        text_type: Text source name used in the output file name.

    Returns:
        None. Results are written to disk and the output path is printed.
    """
    for part in ['word', 'bigram', 'trigram']:
        # NOTE(security): allow_pickle=True unpickles the file and can execute
        # arbitrary code; only load hole files from a trusted source.
        hole_embeddings = np.load(f"holes/{lang.upper()}/{part}s/hole_embeddings.npy", allow_pickle=True).item()
        holes = list(hole_embeddings.values())
        hole_e_centers = np.vstack([h.mean(axis=0) for h in holes])

        # Fresh buffers per level, so each CSV contains only that level's rows.
        features = []
        text_names = []
        for f in tqdm(files, desc=f"Processing {part}s..."):
            try:
                if part == 'word':
                    space = get_word_space(model, f)
                elif part == 'bigram':
                    space = get_ngram_space(model, f, 2)
                elif part == 'trigram':
                    space = get_ngram_space(model, f, 3)

                c_mean = np.mean(get_dist_to_centers_array(space, hole_e_centers), axis=0)
                m1 = get_dist_array(space, holes, np.min)
                m2_mean = np.mean(get_dist_array(space, holes, np.max), axis=0)
                # The last column of m1 is the across-hole mean, not a hole.
                h = get_most_common_closest_hole(m1[:, :-1])
                features.append(np.hstack([c_mean, np.mean(m1, axis=0), m2_mean, h]))
                text_names.append(f.split('_')[-1][:-4])
            except Exception as exc:
                # Best effort: skip files that cannot be processed, but report
                # them instead of hiding the failure. KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                tqdm.write(f"Skipping {f} ({part}): {exc!r}")
                continue

        data = pd.DataFrame(features)
        data['text'] = text_names
        data.to_csv(f"features/{data_part}_{lang}_{text_type}_{part}_features.csv", index=False)
        print(f"Saved at features/{data_part}_{lang}_{text_type}_{part}_features.csv")

## dev

In [142]:
%%time
# Dev check: build and time the word embedding space of the first file.
# Relies on `ru_model` and `files`, which are only assigned in later cells
# (RU section), so this cell fails on a fresh top-to-bottom run.
word_space = get_word_space(ru_model, files[0])

CPU times: total: 312 ms
Wall time: 328 ms


In [126]:
%%time
# Dev check: build and time the bigram embedding space of the first file.
# Relies on `ru_model` and `files`, which are only assigned in later cells.
bigram_space = get_ngram_space(ru_model, files[0], 2)

CPU times: total: 1.44 s
Wall time: 1.47 s


In [127]:
%%time
# Dev check: build and time the trigram embedding space of the first file.
# Relies on `ru_model` and `files`, which are only assigned in later cells.
trigram_space = get_ngram_space(ru_model, files[0], 3)

CPU times: total: 1.12 s
Wall time: 1.22 s


In [36]:
# Dev check: load the stored hole embeddings for Russian words and compute
# one centre (mean vector) per hole.
part = 'word'
lang = 'RU'
# `.item()` unwraps the pickled Python object stored in the .npy file.
hole_embeddings = np.load(f"holes/{lang.upper()}/{part}s/hole_embeddings.npy", allow_pickle=True).item()
hole_e_centers = np.vstack([h.mean(axis=0) for h in hole_embeddings.values()])

In [88]:
# Average, over all words of the text, of the distances to each hole centre
# (the last entry is the average of the per-word mean distance).
c_mean = np.mean(get_dist_to_centers_array(word_space, hole_e_centers), axis=0)
c_mean.shape

(8,)

In [92]:
# m1: per-word minimum distance to each hole (kept per word for the next steps).
# m2_mean: per-word maximum distance to each hole, averaged over words.
m1 = get_dist_array(word_space, hole_embeddings.values(), np.min)
m2_mean = np.mean(get_dist_array(word_space, hole_embeddings.values(), np.max), axis=0)

In [93]:
# Shape check: m1 has one row per word; m2_mean is a single vector.
m1.shape, m2_mean.shape

((13044, 8), (8,))

In [117]:
# Drop m1's last column (the across-hole mean) so only real holes are compared.
h = get_most_common_closest_hole(m1[:,:-1])

In [122]:
# Length check of the full feature vector for one text.
# `c_mean`, `m1`, `m2_mean` and `h` come from the preceding dev cells.
np.hstack([c_mean, np.mean(m1, axis=0), m2_mean, h]).shape

(32,)

# RU

In [4]:
# Load the Russian CBOW embedding lookup; `.item()` unwraps the pickled Python
# object stored in the .npy file. allow_pickle=True can execute arbitrary code
# on load, so only use trusted files.
ru_model = np.load("semantic_space/ru_cbow_dictionary.npy", allow_pickle=True).item()

## Train

### lit

In [None]:
# Russian / Train / lit: collect the split's .txt files and extract features.
data_part = 'Train'
text_type = 'lit'
lang = 'RU'

files = sorted(glob.glob(f"../DATASET/Russian/{data_part}/{text_type}/*.txt"))
# Sanity check: number of files found and the first few paths.
print(len(files))
print(files[:3])

process(files, ru_model, lang=lang, data_part=data_part, text_type=text_type)

2000
['../DATASET/Russian/Train/lit/Train_russian_nofraglit_1.txt', '../DATASET/Russian/Train/lit/Train_russian_nofraglit_1001.txt', '../DATASET/Russian/Train/lit/Train_russian_nofraglit_1002.txt']


Processing words...: 100%|██████████| 2000/2000 [01:22<00:00, 24.34it/s]


Saved at features/Train_RU_lit_word_features.csv


Processing bigrams...:  16%|█▋        | 325/2000 [00:30<01:33, 17.94it/s]

### gpt2

In [None]:
# Russian / Train / gpt2: collect the split's .txt files and extract features.
data_part = 'Train'
text_type = 'gpt2'
lang = 'RU'

files = sorted(glob.glob(f"../DATASET/Russian/{data_part}/{text_type}/*.txt"))
# Sanity check: number of files found and the first few paths.
print(len(files))
print(files[:3])

process(files, ru_model, lang=lang, data_part=data_part, text_type=text_type)

### balaboba

In [None]:
# Russian / Train / balaboba: collect the split's .txt files and extract features.
data_part = 'Train'
text_type = 'balaboba'
lang = 'RU'

files = sorted(glob.glob(f"../DATASET/Russian/{data_part}/{text_type}/*.txt"))
# Sanity check: number of files found and the first few paths.
print(len(files))
print(files[:3])

process(files, ru_model, lang=lang, data_part=data_part, text_type=text_type)

## Test

### lit

In [None]:
# Russian / Test / lit: collect the split's .txt files and extract features.
data_part = 'Test'
text_type = 'lit'
lang = 'RU'

files = sorted(glob.glob(f"../DATASET/Russian/{data_part}/{text_type}/*.txt"))
# Sanity check: number of files found and the first few paths.
print(len(files))
print(files[:3])

process(files, ru_model, lang=lang, data_part=data_part, text_type=text_type)

### mGPT

In [None]:
# Russian / Test / mGPT: collect the split's .txt files and extract features.
data_part = 'Test'
text_type = 'mGPT'
lang = 'RU'

files = sorted(glob.glob(f"../DATASET/Russian/{data_part}/{text_type}/*.txt"))
# Sanity check: number of files found and the first few paths.
print(len(files))
print(files[:3])

process(files, ru_model, lang=lang, data_part=data_part, text_type=text_type)

### lstm

In [None]:
# Russian / Test / lstm: collect the split's .txt files and extract features.
data_part = 'Test'
text_type = 'lstm'
lang = 'RU'

files = sorted(glob.glob(f"../DATASET/Russian/{data_part}/{text_type}/*.txt"))
# Sanity check: number of files found and the first few paths.
print(len(files))
print(files[:3])

process(files, ru_model, lang=lang, data_part=data_part, text_type=text_type)

# EN

In [None]:
# Load the English CBOW embedding lookup; `.item()` unwraps the pickled Python
# object stored in the .npy file. allow_pickle=True can execute arbitrary code
# on load, so only use trusted files.
en_model = np.load("semantic_space/en_cbow_dictionary.npy", allow_pickle=True).item()

## Train

### lit

In [None]:
# English / Train / lit: collect the split's .txt files and extract features.
data_part = 'Train'
text_type = 'lit'
lang = 'EN'

files = sorted(glob.glob(f"../DATASET/English/{data_part}/{text_type}/*.txt"))
# Sanity check: number of files found and the first few paths.
print(len(files))
print(files[:3])

process(files, en_model, lang=lang, data_part=data_part, text_type=text_type)

### gpt2

In [None]:
# English / Train / gpt2: collect the split's .txt files and extract features.
data_part = 'Train'
text_type = 'gpt2'
lang = 'EN'

files = sorted(glob.glob(f"../DATASET/English/{data_part}/{text_type}/*.txt"))
# Sanity check: number of files found and the first few paths.
print(len(files))
print(files[:3])

process(files, en_model, lang=lang, data_part=data_part, text_type=text_type)

### balaboba

In [None]:
# English / Train / balaboba: collect the split's .txt files and extract features.
data_part = 'Train'
text_type = 'balaboba'
lang = 'EN'

files = sorted(glob.glob(f"../DATASET/English/{data_part}/{text_type}/*.txt"))
# Sanity check: number of files found and the first few paths.
print(len(files))
print(files[:3])

process(files, en_model, lang=lang, data_part=data_part, text_type=text_type)

## Test

### lit

In [None]:
# English / Test / lit: collect the split's .txt files and extract features.
data_part = 'Test'
text_type = 'lit'
lang = 'EN'

files = sorted(glob.glob(f"../DATASET/English/{data_part}/{text_type}/*.txt"))
# Sanity check: number of files found and the first few paths.
print(len(files))
print(files[:3])

process(files, en_model, lang=lang, data_part=data_part, text_type=text_type)

### mGPT

In [None]:
# English / Test / mGPT: collect the split's .txt files and extract features.
data_part = 'Test'
text_type = 'mGPT'
lang = 'EN'

files = sorted(glob.glob(f"../DATASET/English/{data_part}/{text_type}/*.txt"))
# Sanity check: number of files found and the first few paths.
print(len(files))
print(files[:3])

process(files, en_model, lang=lang, data_part=data_part, text_type=text_type)

### lstm

In [None]:
# English / Test / lstm: collect the split's .txt files and extract features.
data_part = 'Test'
text_type = 'lstm'
lang = 'EN'

files = sorted(glob.glob(f"../DATASET/English/{data_part}/{text_type}/*.txt"))
# Sanity check: number of files found and the first few paths.
print(len(files))
print(files[:3])

process(files, en_model, lang=lang, data_part=data_part, text_type=text_type)