In [2]:
import pandas as pd
from os import listdir
from pandas.errors import EmptyDataError
from fairseq.models.roberta import RobertaModel, RobertaHubInterface
from fairseq import hub_utils
import os

# AGORA

## Prepare data

In [2]:
# First prepare the data (the final output is written to another folder, because the data is too large for GitHub)

In [3]:
# Load every scraped Wyborcza article file into a single frame.
# Files are read in sorted order: os.listdir() returns names in arbitrary
# order, and the test/valid/train split further down is purely positional.
wyborcza_dir = 'data/wyborcza/articles'
wyborcza_articles = []
for filename in sorted(listdir(wyborcza_dir)):
    try:
        wyborcza_articles.append(pd.read_csv(os.path.join(wyborcza_dir, filename), header=None))
    except EmptyDataError:
        pass  # empty file
wyborcza_articles = pd.concat(wyborcza_articles)
wyborcza_articles.columns = ['url', 'title', 'short', 'long', 'img', 'com']
# The pattern is a regular expression (one or two characters followed by a
# newline). regex=True is spelled out because pandas >= 2.0 treats the pattern
# as a literal string by default, which would silently turn this into a no-op.
wyborcza_articles['short'] = wyborcza_articles['short'].str.replace(r'(.|..)\n', '', regex=True)
# Remove rows whose 'long' text contains the marker 'W odpowiedzi do @'
# ("In reply to @"). Rows with a missing 'long' survive this filter (na=False)
# and are removed by dropna() below.
wyborcza_articles = wyborcza_articles[
    ~wyborcza_articles['long'].str.contains('W odpowiedzi do @', na=False)
]
wyborcza_articles = wyborcza_articles[['title', 'short', 'long']].dropna()
# Keep only the first occurrence of every title.
wyborcza_articles = wyborcza_articles[~wyborcza_articles['title'].duplicated()]
# Drop rows whose 'short' text is the literal string '0'.
wyborcza_articles = wyborcza_articles[wyborcza_articles['short'] != '0']

In [4]:
# Load every scraped Gazeta article file into a single frame.
# Files are read in sorted order: os.listdir() returns names in arbitrary
# order, and the test/valid/train split further down is purely positional.
gazeta_dir = 'data/gazeta/articles'
gazeta_articles = []
for filename in sorted(listdir(gazeta_dir)):
    try:
        gazeta_articles.append(pd.read_csv(os.path.join(gazeta_dir, filename), header=None))
    except EmptyDataError:
        pass  # empty file
gazeta_articles = pd.concat(gazeta_articles)
gazeta_articles.columns = ['url', 'title', 'short', 'long', 'img', 'com']
gazeta_articles = gazeta_articles[['title', 'short', 'long']]
# Keep only the first occurrence of every title.
gazeta_articles = gazeta_articles[~gazeta_articles['title'].duplicated()]

## Process data

In [5]:
# Before running the commands below, create a directory 'my_roberta' as a sibling of analiza_mediow_pl,
# containing the 'my_data' and 'my_models' subdirectories

In [8]:
# all data
# Flatten the gazeta + wyborcza frames into one list of texts (title, short,
# long of each article, row by row). Each text gets a leading space and has
# '. ' / ', ' padded so the punctuation mark stands apart from the word before it.
# Missing cells are skipped: converting them with astype(str) would add
# literal "nan" lines to the training corpus.
agora = [
    " " + str(text).replace('. ', ' . ').replace(', ', ' , ')
    for text in pd.concat([gazeta_articles, wyborcza_articles]).values.flatten()
    if pd.notna(text)
]

In [9]:
# Show the first two prepared corpus lines.
agora[:2]

[' Lider Konfederacji "słuchał z zamkniętymi oczami" . Korwin-Mikke: Ja nie spałem , wrzeszczałem z pięć razy',
 ' We wtorek w mediach pojawiły się zdjęcia z inauguracyjnego posiedzenia Sejmu IX kadencji . Uwagę przykuło zwłaszcza jedno - to , na którym Janusz Korwin-Mikke wygląda tak , jakby spał . Poseł Konfederacji Wolność i Niepodległość przekonuje , że wcale nie uciął sobie drzemki.']

In [10]:
# Cut points for the positional split: the first 3% of the lines form the test
# set, the lines between the 3% and 6% marks form the validation set.
test_end, valid_end = (int(fraction * len(agora)) for fraction in (0.03, 0.06))

In [11]:
# Three consecutive, non-overlapping slices of the corpus: test, valid, train.
agora_test, agora_valid, agora_train = (
    agora[:test_end],
    agora[test_end:valid_end],
    agora[valid_end:],
)

In [12]:
# Write the whole corpus, one text per line. The encoding is pinned to UTF-8 so
# the Polish characters are written the same way regardless of the platform's
# default locale encoding.
with open('/my_roberta/my_data/agora/agora.all.raw', 'w', encoding='utf-8') as f:
    f.write('\n'.join(agora))

In [14]:
# Write each split, one text per line, next to agora.all.raw in my_data/agora/.
# That is the directory the spm_encode and fairseq-preprocess steps read
# agora.<split>.raw / agora.<split>.bpe from; the files used to be written one
# level up, where those steps do not look.
# UTF-8 is pinned so Polish characters do not depend on the platform default.
for split_name, split_lines in (
    ('test', agora_test),
    ('valid', agora_valid),
    ('train', agora_train),
):
    split_path = '/my_roberta/my_data/agora/agora.' + split_name + '.raw'
    with open(split_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(split_lines))

## Train

In [None]:
#             !!! Run the commands below from the my_roberta directory !!! #

In [None]:
# encode (copy paste to terminal, my_roberta dir)
# Runs the examples.roberta.multiprocessing_bpe_encoder module once per split
# (train, valid, test): reads my_data/agora.<split>.raw and writes
# my_data/agora.<split>.bpe, using the encoder.json / vocab.bpe files from
# ../analiza_mediow_pl/roberta_meta.
# NOTE(review): this step uses my_data/agora.<split>.*, while the binarize step
# that follows reads my_data/agora/agora.<split>.bpe — the paths do not line
# up; confirm the intended directory layout.
for SPLIT in train valid test; do \
    python -m examples.roberta.multiprocessing_bpe_encoder \
        --encoder-json '../analiza_mediow_pl/roberta_meta/encoder.json' \
        --vocab-bpe '../analiza_mediow_pl/roberta_meta/vocab.bpe' \
        --inputs my_data/agora.${SPLIT}.raw \
        --outputs my_data/agora.${SPLIT}.bpe \
        --keep-empty \
        --workers 60; \
done

In [None]:
# binarize (copy paste to terminal, my_roberta dir)
# Runs fairseq-preprocess on the encoded train/valid/test files with the
# dictionary at /roberta/dict.txt and writes the result to data-bin/agora.
fairseq-preprocess \
    --only-source \
    --srcdict '/roberta/dict.txt' \
    --trainpref my_data/agora/agora.train.bpe \
    --validpref my_data/agora/agora.valid.bpe \
    --testpref my_data/agora/agora.test.bpe \
    --destdir data-bin/agora \
    --workers 60

In [None]:
# train (copy paste to terminal, my_roberta dir)
# Masked-LM training (--task masked_lm, --arch roberta_base) on data-bin/agora,
# restoring from /roberta/checkpoint_best.pt (--restore-file) and saving
# checkpoints to my_models/agora.
fairseq-train --fp16 'data-bin/agora' \
    --task masked_lm --criterion masked_lm \
    --arch roberta_base --sample-break-mode complete --tokens-per-sample 512 \
    --optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-6 --clip-norm 0.0 \
    --lr-scheduler polynomial_decay --lr 0.0005 --warmup-updates 50 \
    --total-num-update 500000 \
    --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
    --max-sentences 8 --update-freq 32 \
    --max-update 500000 --log-format simple --log-interval 1 \
    --restore-file '/roberta/checkpoint_best.pt' --skip-invalid-size-inputs-valid-test \
    --save-dir my_models/agora --min-lr 0

In [None]:
# Cannot load model parameters from checkpoint /roberta/checkpoint_best.pt; please ensure that the architectures match.

## Check 

In [None]:
# download from https://github.com/sdadas/polish-nlp-resources/releases/download/roberta/roberta.zip
# extract and place it above this repo

In [37]:
# base pl model trained on wikipedia
model_path = "/roberta"
# Gather the loader arguments first, then pass them to fairseq in one call.
base_model_kwargs = {
    "model_name_or_path": model_path,
    "checkpoint_file": "checkpoint_best.pt",
    "data_name_or_path": model_path,
    "bpe": "sentencepiece",
    "sentencepiece_vocab": os.path.join(model_path, "sentencepiece.model"),
    "load_checkpoint_heads": True,
    "archive_map": RobertaModel.hub_models(),
    "cpu": True,
}
loaded = hub_utils.from_pretrained(**base_model_kwargs)
# Wrap the first loaded model in the hub interface used for fill_mask().
roberta = RobertaHubInterface(loaded['args'], loaded['task'], loaded['models'][0])

In [55]:
# Ask the base model for its single most probable filling of <mask>.
roberta.fill_mask('Bolesław chrobry urodził się w <mask>.', topk = 1)

[('Bolesław chrobry urodził się w Krakowie.',
  0.16482116281986237,
  ' Krakowie')]

In [3]:
# my
model_path = "/roberta"
# Gather the loader arguments for the agora checkpoint, then load in one call.
agora_model_kwargs = {
    "model_name_or_path": "/my_roberta/my_models/agora",
    "checkpoint_file": "checkpoint169.pt",
    "data_name_or_path": "/my_roberta/data-bin/agora",
    "bpe": "sentencepiece",
    "sentencepiece_vocab": '/my_roberta/my_data/agora/agora.spm.model.model',
    "load_checkpoint_heads": True,
    "archive_map": RobertaModel.hub_models(),
    "cpu": True,
}
loaded = hub_utils.from_pretrained(**agora_model_kwargs)
# Wrap the first loaded model in the hub interface used for fill_mask().
agora_roberta = RobertaHubInterface(loaded['args'], loaded['task'], loaded['models'][0])

72

In [36]:
# check volatility
def get_idx(res, word):
    """Return the index label of the first row of ``res`` whose 'word' is ``' ' + word``.

    Raises IndexError when no row matches.
    """
    hits = (res['word'] == ' ' + word).values
    return res.index.values[hits][0]


def get_p(res, word):
    """Return the 'p' value of the first row of ``res`` whose 'word' is ``' ' + word``.

    Raises IndexError when no row matches.
    """
    hits = (res['word'] == ' ' + word).values
    return res['p'].values[hits][0]

# Run the same masked sentence through checkpoints 150..168 and record, for the
# probe words 'dobrym' and 'złym', their row position in the top-100
# predictions and their probability.
data_list = []
sentence = 'Wiele można powiedzieć o Donaldzie Tusku, na pewno jest on jednak <mask> politykiem.'
for i in range(150, 169):
    loaded = hub_utils.from_pretrained(
        model_name_or_path="/my_roberta/my_models/agora",
        checkpoint_file="checkpoint"+str(i)+".pt",
        data_name_or_path="/my_roberta/data-bin/agora",
        bpe="sentencepiece",
        sentencepiece_vocab='/my_roberta/my_data/agora/agora.spm.model.model',
        load_checkpoint_heads=True,
        archive_map=RobertaModel.hub_models(),
        cpu=True
    )
    agora_roberta = RobertaHubInterface(loaded['args'], loaded['task'], loaded['models'][0])
    # fill_mask yields (filled sentence, probability, predicted token) tuples.
    res = pd.DataFrame(
        agora_roberta.fill_mask(sentence, topk = 100),
        columns = ['sentence', 'p', 'word']
    )
    try:
        data_list.append([
            get_idx(res, 'dobrym'),
            get_idx(res, 'złym'),
            get_p(res, 'dobrym'),
            get_p(res, 'złym'),
        ])
    except IndexError:
        # A probe word is missing from the top-100 predictions, so this
        # checkpoint contributes no row. Say so instead of skipping silently,
        # otherwise the rows of the result table cannot be mapped back to
        # checkpoints.
        print('checkpoint' + str(i) + '.pt skipped: probe word not in top-100 predictions')

In [39]:
# One row per scored checkpoint; 'score' is the share of the 'good' probability
# in the combined good + bad probability mass.
final_res = pd.DataFrame(
    data_list,
    columns=['good_idx', 'bad_idx', 'good_p', 'bad_p'],
).assign(
    score=lambda frame: frame['good_p'] / (frame['good_p'] + frame['bad_p'])
)

In [40]:
# Display the per-checkpoint table.
final_res

Unnamed: 0,good_idx,bad_idx,good_p,bad_p,score
0,1,68,0.063601,0.000893,0.986159
1,3,61,0.071187,0.001517,0.979138
2,9,96,0.022706,0.000678,0.970992
3,2,77,0.057736,0.001137,0.980693
4,10,77,0.017909,0.001108,0.941719
5,2,78,0.069389,0.000919,0.986924


In [42]:
# Standard deviation of the score across the scored checkpoints.
final_res['score'].std()

0.01695127480926457

## Second version

In [None]:
# https://github.com/google/sentencepiece
# https://github.com/pytorch/fairseq/issues/1186

In [None]:
# make spm model
# Trains a SentencePiece model with a 50000-entry vocabulary on the full agora
# corpus. Because the prefix itself ends in ".model", the model file used by
# the later steps is my_data/agora/agora.spm.model.model.
spm_train \
    --input=my_data/agora/agora.all.raw \
    --model_prefix=my_data/agora/agora.spm.model \
    --vocab_size=50000

In [None]:
# encode
# For each split, pipe my_data/agora/agora.<split>.raw through spm_encode
# (piece output format) and store the result as my_data/agora/agora.<split>.bpe.
for SPLIT in train valid test; do \
    cat my_data/agora/agora.${SPLIT}.raw | \
    spm_encode --model=my_data/agora/agora.spm.model.model --output_format=piece > \
    my_data/agora/agora.${SPLIT}.bpe
done

In [None]:
# binarize 
# Runs fairseq-preprocess on the SentencePiece-encoded splits and writes the
# result to data-bin/agora. No --srcdict is passed here.
# NOTE(review): presumably the dictionary is then built from the training
# split — confirm against the fairseq-preprocess documentation.
fairseq-preprocess \
    --only-source \
    --trainpref my_data/agora/agora.train.bpe \
    --validpref my_data/agora/agora.valid.bpe \
    --testpref my_data/agora/agora.test.bpe \
    --destdir data-bin/agora \
    --workers 60

In [None]:
# train
# Masked-LM training (--task masked_lm, --arch roberta_base with
# --encoder-layers 3) on data-bin/agora. No --restore-file is given;
# checkpoints are saved to my_models/agora.
fairseq-train --fp16 'data-bin/agora' \
    --task masked_lm --criterion masked_lm --encoder-layers 3\
    --arch roberta_base --sample-break-mode complete --tokens-per-sample 512 \
    --optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-6 --clip-norm 0.0 \
    --lr-scheduler polynomial_decay --lr 0.0001 --warmup-updates 0 \
    --total-num-update 500000 \
    --dropout 0.01 --attention-dropout 0.01 --weight-decay 0.01 \
    --max-sentences 8 --update-freq 64 \
    --max-update 500000 --log-format simple --log-interval 1 \
    --skip-invalid-size-inputs-valid-test \
    --save-dir my_models/agora --min-lr 0

# TVP

In [19]:
# Load every scraped TVP article file into a single frame.
# Files are read in sorted order: os.listdir() returns names in arbitrary
# order, and the test/valid/train split further down is purely positional.
tvp_dir = 'data/tvp/articles'
tvp_articles = []
for filename in sorted(listdir(tvp_dir)):
    try:
        tvp_articles.append(pd.read_csv(os.path.join(tvp_dir, filename), header=None))
    except EmptyDataError:
        pass  # empty file
tvp_articles = pd.concat(tvp_articles)
# The TVP files carry five columns.
tvp_articles.columns = ['title', 'short', 'long', 'img', 'com']
tvp_articles = tvp_articles[['title', 'short', 'long']]
# Keep only the first occurrence of every title.
tvp_articles = tvp_articles[~tvp_articles['title'].duplicated()]

In [5]:
# Before running the commands below, create a directory 'my_roberta' as a sibling of analiza_mediow_pl,
# containing the 'my_data' and 'my_models' subdirectories

In [22]:
# all data
# Flatten the TVP frame into one list of texts (title, short, long of each
# article, row by row). Each text gets a leading space and has '. ' / ', '
# padded so the punctuation mark stands apart from the word before it.
# Missing cells are skipped: converting them with astype(str) would add
# literal "nan" lines to the training corpus.
tvp = [
    " " + str(text).replace('. ', ' . ').replace(', ', ' , ')
    for text in tvp_articles.values.flatten()
    if pd.notna(text)
]

In [23]:
# Show the first two prepared corpus lines.
tvp[:2]

[' Macron straszy karami za nieprzyjmowanie migrantów',
 ' Za „poważnym karaniem” państw Unii , które odmawiają udziału w mechanizmie dystrybucji migrantów , opowiedział się Emmanuel Macron . Prezydent Francji spotkał się w Rzymie z premierem Włoch Giuseppem Contem.']

In [24]:
# Cut points for the positional split: the first 3% of the lines form the test
# set, the lines between the 3% and 6% marks form the validation set.
test_end, valid_end = (int(fraction * len(tvp)) for fraction in (0.03, 0.06))

In [25]:
# Three consecutive, non-overlapping slices of the corpus: test, valid, train.
tvp_test, tvp_valid, tvp_train = (
    tvp[:test_end],
    tvp[test_end:valid_end],
    tvp[valid_end:],
)

In [27]:
# Write the whole corpus, one text per line. The encoding is pinned to UTF-8 so
# the Polish characters are written the same way regardless of the platform's
# default locale encoding.
with open('/my_roberta/my_data/tvp/tvp.all.raw', 'w', encoding='utf-8') as f:
    f.write('\n'.join(tvp))

In [28]:
# Write each split, one text per line, into my_data/tvp/. UTF-8 is pinned so
# the Polish characters do not depend on the platform's default encoding.
for split_name, split_lines in (
    ('test', tvp_test),
    ('valid', tvp_valid),
    ('train', tvp_train),
):
    split_path = '/my_roberta/my_data/tvp/tvp.' + split_name + '.raw'
    with open(split_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(split_lines))

In [None]:
# make spm model
# Trains a SentencePiece model with a 50000-entry vocabulary on the full TVP
# corpus. Because the prefix itself ends in ".model", the model file used by
# the later steps is my_data/tvp/tvp.spm.model.model.
spm_train \
    --input=my_data/tvp/tvp.all.raw \
    --model_prefix=my_data/tvp/tvp.spm.model \
    --vocab_size=50000

In [None]:
# encode
# For each split, pipe my_data/tvp/tvp.<split>.raw through spm_encode
# (piece output format) and store the result as my_data/tvp/tvp.<split>.bpe.
for SPLIT in train valid test; do \
    cat my_data/tvp/tvp.${SPLIT}.raw | \
    spm_encode --model=my_data/tvp/tvp.spm.model.model --output_format=piece > \
    my_data/tvp/tvp.${SPLIT}.bpe
done

In [None]:
# binarize
# Runs fairseq-preprocess on the SentencePiece-encoded TVP splits and writes
# the result to data-bin/tvp. No --srcdict is passed here.
# NOTE(review): presumably the dictionary is then built from the training
# split — confirm against the fairseq-preprocess documentation.
fairseq-preprocess \
    --only-source \
    --trainpref my_data/tvp/tvp.train.bpe \
    --validpref my_data/tvp/tvp.valid.bpe \
    --testpref my_data/tvp/tvp.test.bpe \
    --destdir data-bin/tvp \
    --workers 60

In [None]:
# train
# Masked-LM training (--task masked_lm, --arch roberta_base with
# --encoder-layers 3) on data-bin/tvp. No --restore-file is given;
# checkpoints are saved to my_models/tvp.
fairseq-train --fp16 'data-bin/tvp' \
    --task masked_lm --criterion masked_lm --encoder-layers 3\
    --arch roberta_base --sample-break-mode complete --tokens-per-sample 512 \
    --optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-6 --clip-norm 0.0 \
    --lr-scheduler polynomial_decay --lr 0.0001 --warmup-updates 0 \
    --total-num-update 500000 \
    --dropout 0.01 --attention-dropout 0.01 --weight-decay 0.01 \
    --max-sentences 8 --update-freq 64 \
    --max-update 500000 --log-format simple --log-interval 1 \
    --skip-invalid-size-inputs-valid-test \
    --save-dir my_models/tvp --min-lr 0

In [46]:
model_path = "/roberta"
# Gather the loader arguments for the TVP checkpoint, then load in one call.
tvp_model_kwargs = {
    "model_name_or_path": "/my_roberta/my_models/tvp",
    "checkpoint_file": "checkpoint373.pt",
    "data_name_or_path": "/my_roberta/data-bin/tvp",
    "bpe": "sentencepiece",
    "sentencepiece_vocab": '/my_roberta/my_data/tvp/tvp.spm.model.model',
    "load_checkpoint_heads": True,
    "archive_map": RobertaModel.hub_models(),
    "cpu": True,
}
loaded = hub_utils.from_pretrained(**tvp_model_kwargs)
# Wrap the first loaded model in the hub interface used for fill_mask().
tvp_roberta = RobertaHubInterface(loaded['args'], loaded['task'], loaded['models'][0])

In [52]:
# Ask the TVP-trained model to fill the <mask> position (default number of candidates).
tvp_roberta.fill_mask('Aleksander Kwaśniewski to <mask> polityk')

[('Aleksander Kwaśniewski to były polityk', 0.7797557711601257, ' były'),
 ('Aleksander Kwaśniewski to nie polityk', 0.07706533372402191, ' nie'),
 ('Aleksander Kwaśniewski to wybitny polityk',
  0.01950007677078247,
  ' wybitny'),
 ('Aleksander Kwaśniewski to lewicowy polityk',
  0.011795088648796082,
  ' lewicowy'),
 ('Aleksander Kwaśniewski to być polityk', 0.007840156555175781, ' być')]