In [13]:
# First instal:
#conda install -c huggingface transformers
#conda install pytorch torchvision torchaudio cudatoolkit=10.2 -c pytorch
#conda create -n tf tensorflow
#conda activate tf

In [14]:
#! pip freeze

In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [2]:
tokenizer = AutoTokenizer.from_pretrained("tartuNLP/EstBERT")
model = AutoModelForMaskedLM.from_pretrained("tartuNLP/EstBERT")

In [11]:
bert_tagger = BertTagger(bert_location='tartuNLP/EstBERT')

Some weights of BertModel were not initialized from the model checkpoint at tartuNLP/EstBERT and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
import torch
from typing import MutableMapping, List
from transformers import BertTokenizer, logging, BertModel

logging.set_verbosity(30)
from estnltk.text import Text
from estnltk.taggers import Tagger
from estnltk.layer.layer import Layer
import numpy as np


class BertTagger(Tagger):
    """Tags BERT embeddings."""

    def __init__(self, bert_location: str, sentences_layer: str = 'sentences',
                 token_level: bool = False,
                 output_layer: str = 'bert_embeddings', bert_layers: List[int] = None, method='concatenate'):

        if bert_layers is None:
            bert_layers = [-4, -3, -2, -1]
        else:
            for layer in bert_layers:
                if abs(layer) > 12:
                    msg = "BERT base model only has 12 layers of transformer encoder, chose layers from (-12..-1). It " \
                          "is reasonable to choose layers from the last layers, for example [-4, -3, -2, -1]: last 4 " \
                          "layers. "
                    raise Exception(msg)
        self.conf_param = ('bert_location', 'bert_model', 'tokenizer', 'method', 'token_level', 'bert_layers')
        if bert_location is None:
            msg = "Directory containing BERT model must be specified."
            raise Exception(msg)
        else:
            self.bert_location = bert_location
        if method not in ('concatenate', 'add', 'all'):
            msg = "Method can be 'concatenate', 'add' or 'all'."
            raise Exception(msg)
        self.method = method
        self.output_layer = output_layer
        self.input_layers = [sentences_layer]

        self.bert_model = BertModel.from_pretrained(bert_location, output_hidden_states=True)

        self.tokenizer = BertTokenizer.from_pretrained(self.bert_location)

        self.output_attributes = ['token', 'bert_embedding']

        self.token_level = token_level
        self.bert_layers = bert_layers

    def _make_layer(self, text: Text, layers: MutableMapping[str, Layer], status: dict) -> Layer:
        sentences_layer = layers[self.input_layers[0]]
        embeddings_layer = Layer(name=self.output_layer, text_object=text, attributes=self.output_attributes,
                                 ambiguous=True)

        start, i = 0, 0
        word_spans = []

        for k, sentence in enumerate(sentences_layer):

            for word in sentence:
                word_spans.append((word.start, word.end, word.text))
            sent_text = sentence.enclosing_text

            embeddings = get_embeddings(sent_text, self.bert_model, self.tokenizer, self.method, self.bert_layers)[
                         1:-1]  # first one in start token, and last one is sep token
            tokens = self.tokenizer.tokenize(sent_text)
            assert len(tokens) == len(embeddings)
            if k != 0:  # move the start manually when next sentence starts
                start = word_spans[i][0]

            if self.token_level:  # annotates tokens
                for j, packed in enumerate(zip(embeddings, tokens)):
                    token_emb, token_init = packed[0], packed[1]

                    if not token_init.startswith("#") and j != 0:  # move to next word
                        if start == word_spans[i][1]:  # BERT's wordpiece tokenizer can tokenize differently
                            i += 1  # next word starts
                            word_span = word_spans[i]
                            start = word_span[0]  # the start id of this word
                    if self.method == 'all':
                        embedding = []
                        for tok_emb in token_emb:
                            emb = []
                            for e in tok_emb:
                                emb.append(float(e))
                            embedding.append(emb)
                    else:
                        embedding = [float(t) for t in token_emb]

                    attributes = {'token': token_init, 'bert_embedding': embedding}
                    token = token_init.strip()
                    embeddings_layer.add_annotation((start, start + len(token.replace('#', ''))), **attributes)
                    start += len(token.replace('#', ''))  # adding token length to the current pointer

                i += 1  # move the pointer manually

            else:  # annotates full words, adding the token level embedding together
                collected_tokens = []
                collected_embeddings = []
                for j, packed in enumerate(zip(embeddings, tokens)):
                    token_emb, token_init = packed[0], packed[1]

                    if i == 0 and j == 0:
                        start = 0
                        collected_embeddings.append(token_emb)
                        collected_tokens.append(token_init)
                        start += len(token_init)

                    if i != 0 and j == 0:
                        collected_tokens = [token_init]
                        collected_embeddings = [token_emb]
                        start = word_spans[i][0]

                        start += len(token_init)

                    if not token_init.startswith("#") and j != 0:  # move to next word
                        if start == word_spans[i][1]:  # BERT's wordpiece tokenizer can tokenize differently

                            if collected_embeddings:
                                if self.method == 'all':
                                    embedding = []
                                    for tok_embs in collected_embeddings:
                                        token_embs = []
                                        for embs in tok_embs:
                                            token_embs_emb = []
                                            for emb in embs:
                                                token_embs_emb.append(float(emb))
                                            token_embs.append(token_embs_emb)
                                        embedding.append(token_embs)
                                else:
                                    embedding = [float(t) for t in np.sum(collected_embeddings, 0)]

                                attributes = {'token': collected_tokens, 'bert_embedding': embedding}
                                embeddings_layer.add_annotation((word_spans[i][0], word_spans[i][1]),
                                                                **attributes)

                            i += 1  # next word starts
                            start = word_spans[i][0]  # the start id of this word
                            collected_embeddings = [token_emb]
                            collected_tokens = [token_init]
                            start += len(token_init)
                        else:
                            start += len(token_init)
                            collected_tokens.append(token_init)
                            collected_embeddings.append(token_emb)
                    elif token_init.startswith("#"):
                        collected_tokens.append(token_init)
                        collected_embeddings.append(token_emb)
                        start += len(token_init.replace("#", ''))

                if collected_tokens:
                    if self.method == 'all':
                        embedding = []
                        for tok_embs in collected_embeddings:
                            token_embs = []
                            for embs in tok_embs:
                                token_embs_emb = []
                                for emb in embs:
                                    token_embs_emb.append(float(emb))
                                token_embs.append(token_embs_emb)
                            embedding.append(token_embs)
                    else:
                        embedding = [float(t) for t in np.sum(collected_embeddings, 0)]

                    attributes = {'token': collected_tokens, 'bert_embedding': embedding}
                    embeddings_layer.add_annotation((word_spans[i][0], word_spans[i][1]),
                                                    **attributes)

                i += 1  # move the pointer manually

        return embeddings_layer


def get_embeddings(sentence: str, model, tokenizer, method, bert_layers):
    input_data = tokenizer.encode_plus(sentence)
    input_ids = input_data.get('input_ids')
    token_vecs_cat = []
    if len(input_ids) > 512:  # maximum sequence length can be 512
        msg = "Input sentence is too big (%s), splitting the sentence." % len(input_ids)
        print(msg)
        collected_input_ids = []
        while True:
            collected_input_ids.append(input_ids[:512])
            input_ids = input_ids[512:]
            if len(input_ids) <= 512:
                collected_input_ids.append(input_ids)
                break
    else:
        collected_input_ids = [input_ids]

    for i, input_ids in enumerate(collected_input_ids):

        segments_ids = [1] * len(input_ids)
        tokens_tensor = torch.tensor([input_ids])
        segments_tensors = torch.tensor([segments_ids])

        with torch.no_grad():
            outputs = model(tokens_tensor, segments_tensors)
            hidden_states = outputs[2]
        token_embeddings = torch.stack(hidden_states, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1, 0, 2)

        for token in token_embeddings:
            if method == 'concatenate':  # concatenate the vectors
                layers = [token[i] for i in bert_layers]
                cat_vec = torch.cat(layers, dim=0)
                token_vecs_cat.append(np.asarray(cat_vec))

            if method == 'add':  # elementwise addition
                layers = [np.asarray(token[i]) for i in bert_layers]
                sum_vec = np.sum(layers, 0)
                token_vecs_cat.append(np.asarray(sum_vec))

            if method == 'all':  # return all
                layers = [np.asarray(token[i]) for i in bert_layers]
                token_vecs_cat.append(layers)

    return token_vecs_cat

In [5]:
import pandas as pd
data = pd.read_csv("Pohitabel.csv")
data.head()

Unnamed: 0,Kohtuasja_ID,Kohtuasja_nr,Lõigu sisu,Pealkiri1,Tag
0,0,C‑416/20 PPU,EUROOPA KOHTU OTSUS (neljas koda),Koda,C19Centre
1,0,C‑416/20 PPU,17. detsember 2020,Kuupaev,C19Centre
2,0,C‑416/20 PPU,Eelotsusetaotlus – Eelotsuse kiirmenetlus – Po...,Viited,C71Indicateur
3,0,C‑416/20 PPU,TR,Poolte_nimed,C02AlineaAltA
4,0,C‑416/20 PPU,"Generalstaatsanwaltschaft Hamburg,",Poolte_nimed,C02AlineaAltA


In [6]:
column_list=data['Lõigu sisu'].to_list()
column_list[0:100]

['EUROOPA KOHTU OTSUS (neljas koda)',
 '17. detsember 2020',
 'Eelotsusetaotlus – Eelotsuse kiirmenetlus – Politseikoostöö ja õigusalane koostöö kriminaalasjades – Raamotsus 2002/584/JSK – Euroopa vahistamismäärus – Artikli 4a lõige 1 – Liikmesriikidevaheline üleandmiskord – Täitmise tingimused – Täitmata jätmise vabatahtlikud alused – Erandid – Täitmise kohustuslikkus – Tagaselja mõistetud karistus – Kahtlustatava või süüdistatava põgenemine – Direktiiv (EL) 2016/343 – Artiklid 8 ja 9 – Õigus viibida kohtulikul arutelul – Nõuded tagaselja süüdimõistmise puhul – Kontrollimine süüdimõistetu üleandmisel',
 'TR',
 'Generalstaatsanwaltschaft Hamburg,',
 'saksa',
 '–        Generalstaatsanwaltschaft Hamburg, esindaja: J. Fröhlich,',
 '–        Saksamaa valitsus, esindajad: J. Möller, M. Hellmann ja F. Halabi,',
 '–        Rumeenia valitsus, esindajad: E. Gane, L.‑E. Batagoi ja A. Wellman,',
 '–        Poola valitsus, esindajad: B. Majczyna ja J. Sawicka,',
 '–        Euroopa Komisjon, esind

In [7]:
column_list_low=[str(x).lower() for x in column_list]   
column_list_low[0:100]

['euroopa kohtu otsus (neljas koda)',
 '17. detsember 2020',
 'eelotsusetaotlus – eelotsuse kiirmenetlus – politseikoostöö ja õigusalane koostöö kriminaalasjades – raamotsus 2002/584/jsk – euroopa vahistamismäärus – artikli 4a lõige 1 – liikmesriikidevaheline üleandmiskord – täitmise tingimused – täitmata jätmise vabatahtlikud alused – erandid – täitmise kohustuslikkus – tagaselja mõistetud karistus – kahtlustatava või süüdistatava põgenemine – direktiiv (el) 2016/343 – artiklid 8 ja 9 – õigus viibida kohtulikul arutelul – nõuded tagaselja süüdimõistmise puhul – kontrollimine süüdimõistetu üleandmisel',
 'tr',
 'generalstaatsanwaltschaft hamburg,',
 'saksa',
 '–        generalstaatsanwaltschaft hamburg, esindaja: j. fröhlich,',
 '–        saksamaa valitsus, esindajad: j. möller, m. hellmann ja f. halabi,',
 '–        rumeenia valitsus, esindajad: e. gane, l.‑e. batagoi ja a. wellman,',
 '–        poola valitsus, esindajad: b. majczyna ja j. sawicka,',
 '–        euroopa komisjon, esind

In [41]:
#text = Text(column_list_low[0])
#text.analyse('segmentation')

text
euroopa kohtu otsus (neljas koda)

layer name,attributes,parent,enveloping,ambiguous,span count
paragraphs,,,sentences,False,1
sentences,,,words,False,1
words,normalized_form,,,True,7


In [None]:
#text = Text("Aga mulle tundub, et kogu maailm ootab muusikamaailmalt midagi erutavalt uut minimalismi kõrvale. Tere tulemast")
#text.analyse('segmentation')

In [42]:
#bert_tagger.tag(text)

text
euroopa kohtu otsus (neljas koda)

layer name,attributes,parent,enveloping,ambiguous,span count
paragraphs,,,sentences,False,1
sentences,,,words,False,1
words,normalized_form,,,True,7
bert_embeddings,"token, bert_embedding",,,True,7


In [63]:
#frame=text.bert_embeddings
#frame

layer name,attributes,parent,enveloping,ambiguous,span count
bert_embeddings,"token, bert_embedding",,,True,7

text,token,bert_embedding
euroopa,['euroopa'],"[-0.5070216655731201, 0.8251616358757019, 0.08608068525791168, -0.74435389041900 ..., type: <class 'list'>, length: 3072"
kohtu,['kohtu'],"[-0.4606001377105713, 0.38542890548706055, 0.22001709043979645, -0.1013475954532 ..., type: <class 'list'>, length: 3072"
otsus,['otsus'],"[-1.7654969692230225, -0.28602200746536255, 0.6787710189819336, -0.1449652463197 ..., type: <class 'list'>, length: 3072"
(,['('],"[-0.43563389778137207, -0.18898969888687134, -0.44470086693763733, 0.30623111128 ..., type: <class 'list'>, length: 3072"
neljas,['neljas'],"[0.10023628175258636, -0.0022178590297698975, -0.5721350908279419, 0.23693111538 ..., type: <class 'list'>, length: 3072"
koda,['koda'],"[0.8299240469932556, 1.1443322896957397, -0.5073557496070862, 0.1787079572677612 ..., type: <class 'list'>, length: 3072"
),[')'],"[-1.330999493598938, 1.5082489252090454, 0.1415577083826065, 0.04302344471216202 ..., type: <class 'list'>, length: 3072"


In [71]:
#len(frame['text'])


7

In [72]:
#np.sum(frame['bert_embedding'], axis=0) # saada ühe lõigu summeeritud vektor

array([[-3.56959184,  3.38594219, -0.3977652 , ...,  0.73575374,
         4.82716704,  1.92730936]])

In [84]:
#len(column_list_low)

1057199

In [23]:
# koondab iga sõna vektorid üheks ning lisab need ühte listi
row_nr=[]
vektor=[]
i=0
while i < (len(column_list_low)-1050000):
    text = Text(column_list_low[i])
    text.analyse('segmentation')
    bert_tagger.tag(text)
    frame = text.bert_embeddings
    listike = np.sum(frame['bert_embedding'], axis=0)
    row_nr.append(i)
    vektor.append(listike[0])
    i += 1
print(row_nr)
print(vektor)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


TypeError: unsupported operand type(s) for +: 'ImmutableList' and 'ImmutableList'

In [24]:
#testandmed
test_text = Text("Käive käib käibe juurde")
test_text.analyse('segmentation')
bert_tagger.tag(test_text)
frame = test_text.bert_embeddings
proovivektoriks1=np.sum(frame["bert_embedding"], axis=0)

In [25]:
def vordlus (vektor, row_nr, proovivektor, column_list_low):
    sarnased_maatriks=cosine_similarity(np.stack(vektor, axis=0), Y=proovivektor.reshape(1, -1), dense_output=True)
    sorteeritud_indeksid=np.argsort(np.squeeze(sarnased_maatriks))
    koige_sarnasem_idx=row_nr[sorteeritud_indeksid[len(sorteeritud_indeksid) - 1]]
    return column_list_low[koige_sarnasem_idx]

vordlus(vektor, row_nr, proovivektoriks1, column_list_low)

'jcm europe (uk) ltd'