# Clustering with embeddings


In [24]:
from flair.data import Corpus, Sentence
from flair.datasets import CSVClassificationCorpus
from flair.embeddings import TransformerDocumentEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

import matplotlib.pyplot as plt
from nltk import sent_tokenize
import pandas as pd

from sklearn.model_selection import train_test_split

In [6]:
df = pd.read_csv("../corpora/german_modern_poems_epochpoet.csv")

In [10]:
df.head(1)

Unnamed: 0,pid,filename,poet,title,year,poem,poemlength,epoch_year,epoch_poet
0,44927,"Heym, Georg_Die Züge_1899","Heym, Georg",Die Züge,1899,"Rauchwolken, rosa, wie ein Frühlingstag, Die s...",108,Jahrhundertwende,Expressionismus


In [8]:
train, val = train_test_split(df, test_size=0.4, shuffle=True)
val, test = train_test_split(val, test_size=0.5, shuffle=True)
train.to_csv("../corpora/train.csv", index = False)
val.to_csv("../corpora/dev.csv", index = False)
test.to_csv("../corpora/test.csv", index = False)

In [12]:
# this is the folder in which train, test and dev files reside
data_folder = '../corpora/'

# column format indicating which columns hold the text and label(s)
column_name_map = {6: "poem", 8: "epoch_year", 9: "epoch_poet"}

# load corpus containing training, test and dev data and if CSV has a header, you can skip it
corpus: Corpus = CSVClassificationCorpus(data_folder,
                                         column_name_map,
                                         skip_header=True,
                                         delimiter=',') 

2020-05-28 15:23:46,525 Reading data from ../corpora
2020-05-28 15:23:46,530 Train: ../corpora/train.csv
2020-05-28 15:23:46,539 Dev: ../corpora/dev.csv
2020-05-28 15:23:46,545 Test: ../corpora/test.csv


In [13]:
label_dict = corpus.make_label_dictionary()

2020-05-28 15:24:14,870 Computing label dictionary. Progress:


0it [00:00, ?it/s]

2020-05-28 15:24:15,135 []





In [21]:
document_embeddings = TransformerDocumentEmbeddings('bert-base-german-cased', fine_tune=True)

In [26]:
sentence = Sentence('Kartoffel')

# embed words in sentence
document_embeddings.embed(sentence)

[Sentence: "Kartoffel"   [− Tokens: 1]]

In [22]:
# 5. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

# 6. initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)

In [23]:
trainer.train('resources/taggers/trec',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              max_epochs=150)

2020-05-28 15:48:53,691 ----------------------------------------------------------------------------------------------------
2020-05-28 15:48:53,704 Model: "TextClassifier(
  (document_embeddings): TransformerDocumentEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30000, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
               

2020-05-28 15:48:53,706 ----------------------------------------------------------------------------------------------------
2020-05-28 15:48:53,712 Corpus: "Corpus: 0 train + 0 dev + 0 test sentences"
2020-05-28 15:48:53,721 ----------------------------------------------------------------------------------------------------
2020-05-28 15:48:53,731 Parameters:
2020-05-28 15:48:53,765  - learning_rate: "0.1"
2020-05-28 15:48:53,782  - mini_batch_size: "32"
2020-05-28 15:48:53,787  - patience: "5"
2020-05-28 15:48:53,789  - anneal_factor: "0.5"
2020-05-28 15:48:53,805  - max_epochs: "150"
2020-05-28 15:48:53,807  - shuffle: "True"
2020-05-28 15:48:53,809  - train_with_dev: "False"
2020-05-28 15:48:53,814  - batch_growth_annealing: "False"
2020-05-28 15:48:53,821 ----------------------------------------------------------------------------------------------------
2020-05-28 15:48:53,823 Model training base path: "resources/taggers/trec"
2020-05-28 15:48:53,830 -----------------------------

ValueError: num_samples should be a positive integer value, but got num_samples=0