In [1]:
from fairseq.models.roberta import RobertaModel, RobertaHubInterface
from fairseq import hub_utils
from os import listdir
import pandas as pd

In [253]:
def load_model(brand, filename):
    loaded = hub_utils.from_pretrained(
        model_name_or_path="/my_roberta/my_models/"+brand,
        checkpoint_file=filename,
        data_name_or_path="/my_roberta/data-bin/"+brand,
        bpe="sentencepiece",
        sentencepiece_vocab='/my_roberta/my_data/{}/{}.spm.model.model'.format(brand, brand),
        load_checkpoint_heads=True,
        archive_map=RobertaModel.hub_models(),
        cpu=True
    )
    return RobertaHubInterface(loaded['args'], loaded['task'], loaded['models'][0])

In [269]:
def normalize_results(df):
    res_list = []
    for pred_type in df['type'].unique():
        tmp_results = df[df['type'] == pred_type]
        tmp_results = tmp_results\
            .groupby(['brand', 'keyword'])\
            .mean()\
            .reset_index()\
            .pivot('keyword', 'brand', 'score')
        tmp_results = (tmp_results - tmp_results.mean()) / tmp_results.std()
        tmp_results = ((tmp_results.T - tmp_results.min(axis=1)) / (tmp_results.max(axis=1) - tmp_results.min(axis=1))).T
        res_list.append(tmp_results)
    results = pd.concat(res_list)
    results.index = results.index.str.replace('są', '').str.replace('jest', '')
    return results

In [255]:
sentence_keys = [
    {
        "sentence": "{} to <mask> polityk.",
        "keywords": ("Andrzej Duda", "Jarosław Kaczyński", "Donald Tusk", "Tadeusz Rydzyk",
                     "Beata Szydło", "Mateusz Morawiecki", "Aleksander Kwaśniewski", "Donald Trump",
                     "Władimir Putin", "Janusz Korwin-Mikke",),
        "reference": {
            "good": [" "+x for x in ["ważny", "doświadczony", "dobry", "uczciwy", "silny", "skuteczny", "szczery",
                                     "utalentowany", "mądry", "kompetentny", "doskonały", "ulubiony", "zdolny",
                                     "szanowany", "ważny", "poważny", "najwazniejszy", "najlepszy", "wybitny",
                                     "wielki", "doświadczony", "wpływowy", "wybitny", "prominentny"]],
            "bad": [" "+x for x in ["niebezpieczny", "słaby", "zły", "arogancki", "nieudolny"]]
        },
        "type": "politician"
    },
    {
        "sentence": "{} bardzo <mask> krajem.",
        "keywords": ['Rosja jest', 'Stany są', 'Niemcy są', 'Ukraina jest'],
        "reference": {
            "good": [" "+x for x in ['ważnym', 'bogatym', 'dobrym', 'popularnym', 'bezpiecznym', 'otwartym', 
                                     'stabilnym', 'silnym', 'wiarygodnym', 'pięknym']],
            "bad": [" "+x for x in['złym', 'trudnym', 'słabym', 'niebezpiecznym', 'biednym']],
        },
        "type": "country"
    }
]

In [256]:
# b = 'agora'
# load_model(b, list(sorted(listdir('/my_roberta/my_models/'+b)))[-1])\
#     .fill_mask('Stany Zjednoczone są bardzo <mask> krajem.', 50)

In [271]:
df_list = []
for brand in ['agora', 'tvp', 'ringier', 'sjegodnia']:
    ten_last_models = list(sorted(listdir('/my_roberta/my_models/'+brand)))[-12:-2]
    for model_name in ten_last_models:
        model = load_model(brand, model_name)
        for sentence_key in sentence_keys:
            for key in sentence_key["keywords"]:
                predictions = model.fill_mask(sentence_key["sentence"].format(key), 300)
                try:
                    good_p = sum([x[1] for x in predictions if x[2] in sentence_key["reference"]["good"]])
                    bad_p = sum([x[1] for x in predictions if x[2] in sentence_key["reference"]["bad"]])
                    score = good_p / (good_p+bad_p)
                    df_list.append([
                        brand,
                        sentence_key["type"],
                        key,
                        score,
                        good_p,
                        good_p - bad_p
                    ])
                except IndexError:
                    pass

In [273]:
tmp_results = pd.DataFrame(df_list, columns = ['brand', 'type', 'keyword', 'tmp_score', 'tmp_g_p', 'tmp_gb_p'])

In [280]:
df = tmp_results[['brand', 'type', 'keyword', 'tmp_score']]
df.columns = ['brand', 'type', 'keyword', 'score']

In [281]:
normalize_results(df)

brand,agora,ringier,sjegodnia,tvp
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aleksander Kwaśniewski,0.262066,0.0,1.0,0.022187
Andrzej Duda,0.0,1.0,0.862885,0.773075
Beata Szydło,1.0,0.744023,0.0,0.16706
Donald Trump,0.330585,0.090814,0.0,1.0
Donald Tusk,0.0,0.366304,0.782242,1.0
Janusz Korwin-Mikke,1.0,0.726575,0.0,0.06855
Jarosław Kaczyński,0.0,0.229071,0.139643,1.0
Mateusz Morawiecki,0.753929,0.0,0.695265,1.0
Tadeusz Rydzyk,0.0,0.068274,1.0,0.688986
Władimir Putin,0.704603,0.830007,1.0,0.0
