In [1]:
import os
import sys

def adding_module_path():
    module_path = os.path.abspath(os.path.join('..'))

    if module_path not in sys.path:
        sys.path.append(module_path)

adding_module_path()

In [11]:
import tensorflow as tf
import numpy as np

In [12]:
test_corpus = np.array([
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
])
y = np.array([0,0,0,0])

In [81]:
path_data, path_authors = get_path_to_gutenberg_sets(10, 3, PATH_TO_DATASET_FOLDER_TEST)
train, valid, test = get_datasets(path_data, ';', None)

Loading dataset from=C:\Users\Vojta\Desktop\diploma\data_test\gutenberg\10Authors\Sentence3\train.csv
Loading dataset from=C:\Users\Vojta\Desktop\diploma\data_test\gutenberg\10Authors\Sentence3\valid.csv
Loading dataset from=C:\Users\Vojta\Desktop\diploma\data_test\gutenberg\10Authors\Sentence3\test.csv


In [15]:
dataset = tf.data.Dataset.from_tensor_slices((test_corpus, y))

In [16]:
for x in dataset:
    print(x)
    break

(<tf.Tensor: shape=(), dtype=string, numpy=b'This is the first document.'>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)


# BoW Vectorizer

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

In [34]:
BoWVectorizer = CountVectorizer

In [35]:
vectorizer = BoWVectorizer()
X = vectorizer.fit_transform(test_corpus)
vectorizer.get_feature_names_out()
X.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]], dtype=int64)

# TFIDF Vectorizer

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
TFIDFVectorizer= TfidfVectorizer

In [38]:

vectorizer = TFIDFVectorizer()
X = vectorizer.fit_transform(test_corpus)
vectorizer.get_feature_names_out()
X.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

# Transformer Vectorizer

In [22]:
from transformers import TFAutoModel
from src.types.transformer_name import TransformerName
from src.tokenizers.transformer_tokenizer import TransformerTokenizer
from src.encoder.create_encoder_from_path import create_encoder_from_path
from src.tokenizers.prepare_dataset_from_tokenizer import prepare_dataset_from_tokenizer
from src.types.transformer_pooling import TransformerPooling
from src.models.transformer.bert_pooling_layer import BertPoolingLayer
from transformers import AutoConfig


In [112]:
class TransformerVectorizer():
    def __init__(
        self, 
        transformer_type,
        transformer_pooling_type,
        path_authors=None,
        encoder=None, 
        max_len=512, 
        preprocess_pipeline=None,
    ):
        self.transformer_type = transformer_type.value
        self.transformer_pooling_type = transformer_pooling_type
        self.encoder = encoder
        self.max_len = max_len
        self.preprocess_pipeline = preprocess_pipeline
        self.path_to_authors = path_authors
        self.setup()

    def setup(self):
        self.config = AutoConfig.from_pretrained(self.transformer_type, output_hidden_states=True)
        self.transformer = TFAutoModel.from_config(self.config)
        encoder = None if self.path_to_authors is None else create_encoder_from_path(self.path_to_authors)
        self.tokenizer = TransformerTokenizer(
            self.transformer_type, 
            encoder
        )

    def fit_transform(self, dataset):
        #imagine X as list of sentences
        sentence_embedding = []
        labels = []

        for x in prepare_dataset_from_tokenizer(dataset, self.tokenizer).batch(1):
            transformer_input, label = x
            output = self.transformer(
                transformer_input, 
                output_hidden_states=True
            )
            
            output = BertPoolingLayer()(
                output, 
                self.transformer_pooling_type
            )

            output = output.numpy().reshape(-1)
            label = label.numpy()[0] 

            labels.append(label)
            sentence_embedding.append(output)
            break
        return np.array(sentence_embedding), np.array(labels)

    def create_embedding_matrix(self, X):
        #TODO: add to embedding layer
        pass

In [115]:
class BertBaseUncasedVectorizer(TransformerVectorizer):
    def __init__(
        self, 
        transformer_pooling_type,
        path_authors=None,
        encoder=None, 
        max_len=512, 
        preprocess_pipeline=None,
    ):
        super().__init__(
            TransformerName.BertBaseUncased,         
            transformer_pooling_type,
            path_authors=None,
            encoder=None, 
            max_len=512, 
            preprocess_pipeline=None
        )

In [116]:
bert = BertBaseUncasedVectorizer()
X, y = bert.fit_transform(train)

In [117]:
X.shape

(1, 768)

In [42]:
class DistilBertBaseUncasedVectorizer(TransformerVectorizer):
    def __init__(
        self, 
        transformer_pooling_type,
        path_authors=None,
        encoder=None, 
        max_len=512, 
        preprocess_pipeline=None,
    ):
        super().__init__(
            TransformerName.DistilBertBaseUncased,         
            transformer_pooling_type,
            path_authors=None,
            encoder=None, 
            max_len=512, 
            preprocess_pipeline=None
        )

In [44]:
class ElectraSmallVectorizer(TransformerVectorizer):
    def __init__(
        self, 
        transformer_pooling_type,
        path_authors=None,
        encoder=None, 
        max_len=512, 
        preprocess_pipeline=None,
    ):
        super().__init__(
            TransformerName.ElectraSmall,         
            transformer_pooling_type,
            path_authors=None,
            encoder=None, 
            max_len=512, 
            preprocess_pipeline=None
        )

In [5]:
import gensim.downloader
from src.types.downloaded_embeddings_type import DownloadedEmbeddingType
import numpy as np

In [67]:
class EmbeddingVectorizer:
    def __init__(self, embedding_type):
        self.embedding_type = embedding_type.value
        self.missed = 0
        self.counter = 0
        self.embedding_size = 0
        self.setup()

    def setup(self):
        self.vectors = gensim.downloader.load(self.embedding_type)
        self.embedding_size = len(self.vectors['king'])

    def get_from_vectors(self, key_vectors, key):
        self.counter += 1
        try:
            return key_vectors[key]
        except:
            self.missed += 1
            return np.zeros(shape=(self.embedding_size, ))

    def get_state(self):
        missed, counter, accuracy = self.missed, self.counter, 100 * (self.missed / self.counter)
        print(f"Missed={missed}, counter={counter}, accuracy={accuracy}")
        return missed, counter, accuracy

    def get_mean(self, corpus):
        return [np.mean(sent, axis=0) for sent in corpus]

    def fit_transform(self, X):
        self.missed = 0
        self.counter = 0

        corpus = []
        
        for sentence in X:
            tokens = sentence.split(" ")
            sentence_embedding = []
            for token in tokens:
                embedding_of_token = self.get_from_vectors(self.vectors, token)
                sentence_embedding.append(embedding_of_token)
            corpus.append(np.array(sentence_embedding))

        return self.get_mean(corpus)

    def create_embedding_matrix(self, X):
        #TODO: add to embedding layer
        pass

In [68]:
embedding = EmbeddingVectorizer(DownloadedEmbeddingType.Glove)

In [69]:
X = embedding.fit_transform(test_corpus)

In [70]:
np.array(X).shape

(4, 300)

In [71]:
embedding.get_state()

(4, 22, 18.181818181818183)

# Embeddings downloaded

## Glove Vectorizer

In [None]:
class GloveVectorizer(EmbeddingVectorizer):
    def __init__(self):
        super().__init__(DownloadedEmbeddingType.Glove)

In [None]:
glove_vectorizer = GloveVectorizer()

## Word2Vec Vectorizer

In [None]:
class Word2VecVectorizer(EmbeddingVectorizer):
    def __init__(self):
        super().__init__(DownloadedEmbeddingType.Word2Vec)

In [None]:
word2vec_vectorizer = Word2VecVectorizer