In [32]:
import torch
import pandas as pd
import numpy as np
from src.transformers import BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from sklearn.metrics import classification_report
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [33]:
andmed = pd.read_csv("Rubric_data/estonianvalence.csv", encoding = "utf8", on_bad_lines='skip', header = None, 
                     names = ["rubric","url", "order", "sentiment", "text"])

In [34]:
andmed

Unnamed: 0,rubric,url,order,sentiment,text
0,ARVAMUS,http://arvamus.postimees.ee/1001520/anvar-samost-tahan-politseile-kindel-olla/,1,negatiivne,"Enam kui kümme aastat tagasi tegutses huumorisaates «Wremja» inspektor Kukeke, kes kogu aeg vingus väikese palga pärast ja vaatas, mida saaks töö juurest koju tassida. Stsenaristid Andrus Kivirähk ja Mart Juur olid Kukekese isikusse kokku valanud kõik, mis 1990. aastate Eesti politseinikke halvast küljest iseloomustas."
1,ARVAMUS,http://arvamus.postimees.ee/1001520/anvar-samost-tahan-politseile-kindel-olla/,2,vastuoluline,"Neid ridu kirjutades tundub isegi ebaviisakas seda karikatuurset kuju meenutada. Juba saate eetrisse mineku ajal oli tegemist vaatega ajalukku. Politsei on vabanenud üleminekuajal parema puudumisel palgatud juhuseotsijatest. Samuti jõudis ühena esimestest politseijuhtideni arusaamine, et avalik kaeblemine palkade üle ei tule ühelegi organisatsioonile kasuks – kannatab maine ja raha juurde ei tule."
2,ARVAMUS,http://arvamus.postimees.ee/1001520/anvar-samost-tahan-politseile-kindel-olla/,3,positiivne,"Isiklikult kohtasin natukegi Kukekese moodi politseinikku viimati kaheksa aasta eest Lätis. Eranditult kõik viimase kümnendi kokkupuuted politseiametnikega on kinnitanud: vaatamata raskustele on Eesti riik suutnud korrakaitsjateks värvata inimesi, kes on arukad, kohusetundlikud, lugupidamist sisendavas füüsilises vormis ja hea väljendusoskusega."
3,ARVAMUS,http://arvamus.postimees.ee/1001520/anvar-samost-tahan-politseile-kindel-olla/,4,vastuoluline,"Olen näinud ka, kuidas patrull korrarikkujat taltsutab, ning suur osa sellest seisnes enesekindlas olekus ning vastuvaidlemist välistavalt, kuid rahulikult antud korraldustes. Aprillirahutuste ajal veendusime, et Eesti politsei suudab käituda ründajatega väga karmilt. Vaevalt et mitmeks tunniks näoli porisele asfaldile pandud märatsejat lohutas, et kohus hiljem teda riigijuhtide lubatud viisil aastateks vangi ei pannud."
4,ARVAMUS,http://arvamus.postimees.ee/1001520/anvar-samost-tahan-politseile-kindel-olla/,5,negatiivne,"Kummaline on nüüd äkki lugeda politsei ja siseministeeriumi ametnike kurtmist kohtu poolt politseinike ründajatele (lihtsalt öeldes – peksjatele) määratud karistuste üle. Tsiteerin ERRile intervjuu andnud politseinikku: «See ei ole normaalne, et sellised jõmikad tulevad, tümitavad ja nii ongi noh. Mingi tagajärg peab saabuma neile, [kes politseiametniku vastu kätt tõstavad].»"
...,...,...,...,...,...
3843,VÄLISMAA,http://www.postimees.ee/998158/madridis-protestisid-tuhanded-inimesed-karpemeetmete-vastu/,3,negatiivne,"Hispaania peaminister Mariano Rajoy on Euroopa Liidu üha kasvava surve all, et ta vähendaks sel aastal eelarvepuudujääki 6,3 protsendini sisemajanduse koguproduktist (SKP), järgmisel aastal 4,5-ni ning 2014. aastal 2,8 protsendini. Kolme aastaga kavatseb valitsus hoida kokku 150 miljardit eurot."
3844,VÄLISMAA,http://www.postimees.ee/998158/madridis-protestisid-tuhanded-inimesed-karpemeetmete-vastu/,4,negatiivne,"Hispaania keskpank on hoiatanud, et riik ei pruugi sel aastal eelarvepuudujääki kavandatud mahus vähendada ning võib libiseda järgmisel aastal sügavasse kriisi."
3845,VÄLISMAA,http://www.postimees.ee/998270/ivanisvili-teeb-teatavaks-ministrikandidaadid/,1,neutraalne,"Gruusia parlamendivalimised võitnud koalitsiooni Gruusia Unistus liider, tulevane peaminister Bidzina Ivanišvili teeb esmaspäeval pressikonverentsil teatavaks ministrikandidaadid, ütles ta pühapäeval Facebookis."
3846,VÄLISMAA,http://www.postimees.ee/998270/ivanisvili-teeb-teatavaks-ministrikandidaadid/,2,neutraalne,Gruusia meedia on juba spekuleerinud tulevase valitsuse koosseisu üle.


In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
train, test = train_test_split(andmed, test_size=0.2)
train, val = train_test_split(train, test_size=0.125)

tokenizer = BertTokenizer(vocab_file = "vocab_final.txt", vocab_file_form = "vocab_form.txt", max_length = 128,
                         padding = "max_length", truncation = True, return_tensors = "pt", mask_token="ˇMASKˇ")

In [5]:
%%time
train_encodings = tokenizer(list(train[:200].text), max_length = 128, padding = "max_length", 
                            truncation = True, return_tensors = "pt")
train_dataset = Dataset.from_dict(train_encodings)

val_encodings = tokenizer(list(val[:200].text), max_length = 128, padding = "max_length", 
                            truncation = True, return_tensors = "pt")
val_dataset = Dataset.from_dict(val_encodings)

test_encodings = tokenizer(list(test[:200].text), max_length = 128, padding = "max_length", 
                            truncation = True, return_tensors = "pt")
test_dataset = Dataset.from_dict(test_encodings)

CPU times: total: 18.8 s
Wall time: 19 s


In [6]:
model = BertForMaskedLM.from_pretrained("train_results/checkpoint-100000")
model.to(device)

batch_size = 16

def compute_metrics(p):
    predictions, labels = p
    predictions_lemma = np.argmax(predictions[0], axis = 2).ravel()
    labels_lemma = [y[0] for x in labels for y in x]
    predictions_vorm = np.argmax(predictions[1], axis = 2).ravel()
    labels_vorm = [y[1] for x in labels for y in x]

    final_pred_lemma = [(p,l) for p, l in zip(predictions_lemma, labels_lemma) if l > 4 and l != -100]
    acc_lemma = sum(np.array([x[0] for x in final_pred_lemma]) == np.array([x[1] for x in final_pred_lemma]))/len(final_pred_lemma)

    final_pred_vorm = [(p,l) for p, l in zip(predictions_vorm, labels_vorm) if l > 4 and l != -100]
    acc_vorm = sum(np.array([x[0] for x in final_pred_vorm]) == np.array([x[1] for x in final_pred_vorm]))/len(final_pred_vorm)

    return({'Accuracy_lemma' : acc_lemma, 'Accuracy_vorm' : acc_vorm, 'n_val' : len(final_pred_lemma)})
    
data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=0.15
)

args = TrainingArguments(
    "MLM_results",
    evaluation_strategy="epoch",
    logging_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1
)


trainer = Trainer(
    model,
    args,
    data_collator = data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset, 
    compute_metrics = compute_metrics
)
trainer.evaluate()
trainer.train()


***** Running Evaluation *****
  Num examples = 200
  Batch size = 16


***** Running training *****
  Num examples = 200
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 13
  Number of trainable parameters = 160595620


Epoch,Training Loss,Validation Loss,Accuracy Lemma,Accuracy Vorm,N Val
1,12.3864,9.721108,0.194472,0.242843,1013


***** Running Evaluation *****
  Num examples = 200
  Batch size = 16
Saving model checkpoint to MLM_results\checkpoint-13
Configuration saved in MLM_results\checkpoint-13\config.json
Model weights saved in MLM_results\checkpoint-13\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=13, training_loss=12.3864499605619, metrics={'train_runtime': 263.4975, 'train_samples_per_second': 0.759, 'train_steps_per_second': 0.049, 'total_flos': 38150348390400.0, 'train_loss': 12.3864499605619, 'epoch': 1.0})

In [7]:
# Ennustuste tegemine
predictions, labels, _ = trainer.predict(test_dataset)

predictions_lemma = np.argmax(predictions[0], axis = 2).ravel()
labels_lemma = [y[0] for x in labels for y in x]
predictions_vorm = np.argmax(predictions[1], axis = 2).ravel()
labels_vorm = [y[1] for x in labels for y in x]

final_pred_lemma = [(p,l) for p, l in zip(predictions_lemma, labels_lemma) if l != -100]
acc_lemma = sum(np.array([x[0] for x in final_pred_lemma]) == np.array([x[1] for x in final_pred_lemma]))/len(final_pred_lemma)

final_pred_vorm = [(p,l) for p, l in zip(predictions_vorm, labels_vorm) if l != -100]
acc_vorm = sum(np.array([x[0] for x in final_pred_vorm]) == np.array([x[1] for x in final_pred_vorm]))/len(final_pred_vorm)

print(f"Accuracy lemma : {acc_lemma}")
print(f"Accuracy vorm : {acc_vorm}")
print(f"n test : {len(final_pred_lemma)}")

***** Running Prediction *****
  Num examples = 200
  Batch size = 16


Accuracy lemma : 0.2044790652385589
Accuracy vorm : 0.24050632911392406
n test : 1027


In [5]:
### ESTBERT ###
tokenizer = AutoTokenizer.from_pretrained("tartuNLP/EstBERT", max_length = 128,
                         padding = "max_length", truncation = True, return_tensors = "pt")
train_encodings = tokenizer(list(train[:200].text), max_length = 128, padding = "max_length", 
                            truncation = True, return_tensors = "pt")
train_dataset = Dataset.from_dict(train_encodings)

val_encodings = tokenizer(list(val[:200].text), max_length = 128, padding = "max_length", 
                            truncation = True, return_tensors = "pt")
val_dataset = Dataset.from_dict(val_encodings)

test_encodings = tokenizer(list(test[:200].text), max_length = 128, padding = "max_length", 
                            truncation = True, return_tensors = "pt")
test_dataset = Dataset.from_dict(test_encodings)

In [29]:
from transformers import DataCollatorForLanguageModeling
model = AutoModelForMaskedLM.from_pretrained("tartuNLP/EstBERT")
model.to(device)

batch_size = 16

def compute_metrics(p):
    predictions, labels = p
    predictions_MLM = np.argmax(predictions, axis = 2).ravel()
    labels_MLM = [y for x in labels for y in x]
    final_pred = [(p,l) for p, l in zip(predictions_MLM, labels_MLM) if l != -100]
    acc = sum(np.array([x[0] for x in final_pred]) == np.array([x[1] for x in final_pred]))/len(final_pred)

    return({'Accuracy_MLM' : acc, 'n_val' : len(final_pred)})
    
data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=0.15
)

args = TrainingArguments(
    "MLM_results_EST",
    evaluation_strategy="epoch",
    logging_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1
)


trainer = Trainer(
    model,
    args,
    data_collator = data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset, 
    compute_metrics = compute_metrics
)
#trainer.evaluate()
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 200
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 13
  Number of trainable parameters = 124492880


Epoch,Training Loss,Validation Loss,Accuracy Mlm,N Val
1,4.4461,4.263,0.368193,1616


***** Running Evaluation *****
  Num examples = 200
  Batch size = 16
Saving model checkpoint to MLM_results_EST\checkpoint-13
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=13, training_loss=4.446066636305589, metrics={'train_runtime': 263.2578, 'train_samples_per_second': 0.76, 'train_steps_per_second': 0.049, 'total_flos': 13163232460800.0, 'train_loss': 4.446066636305589, 'epoch': 1.0})

In [30]:
predictions, labels, _ = trainer.predict(test_dataset)

predictions_MLM = np.argmax(predictions, axis = 2).ravel()
labels_MLM = [y for x in labels for y in x]
final_pred = [(p,l) for p, l in zip(predictions_MLM, labels_MLM) if l != -100]
acc = sum(np.array([x[0] for x in final_pred]) == np.array([x[1] for x in final_pred]))/len(final_pred)

print(f"Accuracy MLM : {acc}")
print(f"n test : {len(final_pred)}")

***** Running Prediction *****
  Num examples = 200
  Batch size = 16


Accuracy MLM : 0.3891525423728814
n test : 1475


In [31]:
final_pred

[(6698, 23062),
 (42, 16461),
 (1543, 31532),
 (19642, 31783),
 (46, 68),
 (49904, 42),
 (75, 11361),
 (251, 251),
 (405, 2120),
 (21869, 863),
 (135, 135),
 (40238, 40238),
 (17833, 37),
 (82, 37676),
 (6983, 20489),
 (3063, 3063),
 (3593, 1293),
 (3593, 3593),
 (1387, 39708),
 (34630, 12138),
 (5132, 5455),
 (32333, 6698),
 (49883, 49883),
 (42, 42),
 (15, 15),
 (5657, 5169),
 (251, 251),
 (13523, 3372),
 (280, 4894),
 (49883, 49883),
 (11, 11),
 (35784, 44183),
 (42, 3330),
 (49888, 91),
 (37, 157),
 (126, 126),
 (6455, 6455),
 (15, 429),
 (7477, 20887),
 (42, 3228),
 (137, 4842),
 (710, 710),
 (11, 11),
 (584, 4229),
 (19402, 1097),
 (20, 49889),
 (79, 6847),
 (97, 464),
 (1184, 1184),
 (24694, 194),
 (1036, 34090),
 (10, 49884),
 (37, 11),
 (29890, 34893),
 (37, 157),
 (42490, 42490),
 (37, 37),
 (49887, 49887),
 (5903, 2643),
 (24846, 8386),
 (21729, 21729),
 (7361, 2744),
 (11, 11),
 (90, 90),
 (308, 308),
 (832, 1966),
 (6917, 6917),
 (49883, 1733),
 (153, 153),
 (4780, 1249),
