In [1]:
import os
import pickle
import torch
from gpt_classes.GPTModel import GPTModel
from gpt_classes.GPTDataset import GPTDataset
from classes.Preprocessor import Preprocessor
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
import numpy as np
from dicts import *

In [2]:
base_dir = 'data/'
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference on columns.
df = pd.read_csv(
    os.path.join(base_dir, 'df.csv'),
    low_memory=False,
)
# Quick look at two random rows.
df.sample(2)

Unnamed: 0,document.id,source,stage,source_text,lemm_text,city,region,date,data,assessor,...,represent_ethicity_meaning,is_ethicity_superior_raw,is_ethicity_superior_meaning,is_ethicity_aggressor_raw,is_ethicity_aggressor_meaning,is_ethicity_dangerous_raw,is_ethicity_dangerous_meaning,comment,old_id,text_sentiment
65990,938463132,iqbuzz,3,"Едут в купе армянин, грузин и русский. Вдруг г...",ехать купе армянин грузин русский грузин пукну...,Нарьян-Мар,Ненецкий АО,2015-06-22,2017-03-15 13:48:01,mintbreeze,...,,,,,,,,,938463132,
4513,442314563,iqbuzz,2,"Уважаемые Украинцы - настоящие Украинцы, не пр...",уважаемый украинец настоящий украинец предават...,Москва,Москва,2014-04-13,2016-09-30 23:35:57,an_men,...,no,3.0,irrel,3.0,irrel,1.0,no,,басурман_62,


In [3]:
# Directory whose files are loaded as model state dicts further down.
models_directory = 'models'

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model = GPTModel(device).to(device)
print(device)

  return self.fget.__get__(instance, owner)()


cuda


In [4]:
# Keep regular files only, in a deterministic order: os.listdir() returns
# entries in arbitrary order and also lists sub-directories (for example a
# Jupyter `.ipynb_checkpoints` folder), which cannot be loaded as checkpoints.
files = sorted(
    entry
    for entry in os.listdir(models_directory)
    if os.path.isfile(os.path.join(models_directory, entry))
)

In [5]:
# Distinct document ids; the train / validation / test split is made over these.
ids = pd.unique(df['document.id'])
ids.shape

(99,)

In [6]:
RANDOM_STATE = 42

# Split on document ids rather than on rows, so all rows of one document land
# in the same subset. First hold out 20% of the ids for test, then divide the
# remainder 75% / 25% into train / validation.
process_ids, test_ids = train_test_split(
    ids, test_size=0.2, random_state=RANDOM_STATE
)
train_ids, validate_ids = train_test_split(
    process_ids, train_size=0.75, random_state=RANDOM_STATE
)

train, test, validate = (
    df.loc[df['document.id'].isin(subset_ids)]
    for subset_ids in (train_ids, test_ids, validate_ids)
)
train.shape, test.shape, validate.shape  # percents are ≈ (60%, 20%, 20%)

((59, 62), (20, 62), (21, 62))

In [8]:
# Shared preprocessor handed to the dataset when texts are generated.
preprocessor = Preprocessor(
    df=df,
    args=args,
    var_vocab=var_vocab,
    topic_to_russian=topic_to_russian,
)

In [9]:
def gen(model, batch_size=16):
    """Generate a text for every item of the ``test`` split with ``model``.

    Args:
        model: Model exposing ``eval()`` and ``my_generate(cut_data)``; the
            latter is expected to return one generated text per batch item
            (assumption -- confirm against GPTModel).
        batch_size: Number of items per DataLoader batch.

    Returns:
        A pair ``(all_targets, all_generated)``: the ids yielded by the loader
        (used downstream as document ids) and the generated texts, in the same
        order.
    """
    torch.cuda.empty_cache()
    all_targets = []
    all_generated = []

    test_dataset = GPTDataset(test.copy(), args=args, preprocessor=preprocessor, sp=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Inference only: switch off dropout and friends before generating.
    model.eval()
    with torch.no_grad():
        # The full prompt (first element) is not needed here. The previous
        # version ran an extra forward pass on it and discarded the result,
        # which only cost time and memory.
        for _data, cut_data, ids_ in tqdm(test_loader):
            all_targets.extend(id_.item() for id_ in ids_)
            all_generated.extend(model.my_generate(cut_data))

    return all_targets, all_generated

In [10]:
# Minimum number of entries that must share a value for it to be accepted.
people_num = 2

def define(x):
    """Return the unambiguous majority value of the Series ``x``.

    The most frequent value (NaN counted as a value) is returned only when it
    occurs at least ``people_num`` times and no other value occurs equally
    often.

    Args:
        x: pandas Series of answers.

    Returns:
        The winning value, or ``None`` when ``x`` is empty, the top count is
        below ``people_num``, or the top count is tied.
    """
    counts = x.value_counts(dropna=False)
    # An empty selection (e.g. filtering on a NaN group) has no mode at all;
    # indexing counts.iloc[0] would raise IndexError.
    if counts.empty:
        return None
    mode = counts.iloc[0]  # value_counts sorts descending, so this is the top count
    if mode >= people_num and np.sum(counts == mode) == 1:
        return counts.index[0]
    return None

def get_info(data, var, ethnicity=None):
    """Look up the label of the agreed value of ``var`` within ``data``.

    Args:
        data: DataFrame with the rows to consider.
        var: Column name; also the key into ``var_vocab``.
        ethnicity: When truthy, only rows whose ``seed_eth_group`` equals it
            are used; otherwise one row per assessor is used.

    Returns:
        ``var_vocab[var]['labels'][value]`` for the value picked by
        ``define``, or ``None`` when that value is falsy or has no label.
    """
    if ethnicity:
        same_group = data['seed_eth_group'] == ethnicity
        answers = data.loc[same_group, var]
    else:
        answers = data.drop_duplicates(subset='assessor')[var]

    value = define(answers)
    labels = var_vocab[var]['labels']
    if not value or value not in labels:
        return None
    return labels[value]

def fit(id_, text):
    """Count the agreed labels of document ``id_`` that appear in ``text``.

    For every variable in ``args`` the label returned by ``get_info`` is looked
    up: once per ethnic group for variables flagged ``aspect_level`` in
    ``var_vocab``, once per document otherwise. A label scores one point when
    it occurs in ``text`` and its negated form (``'не ' + label``) does not
    occur anywhere in ``text``.

    Args:
        id_: Value of ``document.id`` to select rows of ``df``.
        text: Generated text to search.

    Returns:
        The number of matched labels, or ``None`` when ``df`` has no rows for
        ``id_``.
    """
    data = df.loc[df['document.id'] == id_]
    if data.empty:
        return None

    def _mentioned(info):
        # Missing labels never score; present ones must appear un-negated.
        return bool(info) and info in text and 'не ' + info not in text

    # Drop missing groups: filtering rows on NaN selects nothing, and an empty
    # selection used to crash inside define().
    eths = data['seed_eth_group'].dropna().unique()

    cnt = 0
    for var in args:
        if var_vocab[var]['aspect_level']:
            cnt += sum(_mentioned(get_info(data, var, eth)) for eth in eths)
        else:
            cnt += _mentioned(get_info(data, var))
    return cnt

In [11]:
def get_score(ids_, generated):
    """Sum the ``fit`` scores over paired ids and generated texts.

    Args:
        ids_: Iterable of document ids.
        generated: Iterable of generated texts, aligned with ``ids_``.

    Returns:
        Total number of matched labels. Documents for which ``fit`` returns
        ``None`` contribute nothing.
    """
    total = 0
    for id_, text in zip(ids_, generated):
        score = fit(id_, text)
        # fit() yields None for ids without rows; adding None would raise TypeError.
        if score is not None:
            total += score
    return total

In [12]:
# Maps checkpoint file name -> score; filled by the evaluation loop.
results = {}

In [13]:
# Score every checkpoint: load its weights into the shared model, generate
# texts for the test split and count the matched labels.
for filename in files:
    print(f'loading {filename} model')
    path = os.path.join(models_directory, filename)
    # map_location moves tensors saved on another device (e.g. CUDA) onto the
    # current one, so the checkpoints also load on a CPU-only machine.
    model.load_state_dict(torch.load(path, map_location=device))
    targets, generated = gen(model)
    cnt = get_score(targets, generated)
    results[filename] = cnt

loading epoch_0_num_3_loss_0.6914989824026403 model


  0%|          | 0/2 [00:00<?, ?it/s]

loading epoch_3_num_16_loss_2.3076344839284118 model


  0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
# Report one line per evaluated checkpoint, in insertion order.
for name in results:
    cnt = results[name]
    print(f'{name} model score is {cnt}')

epoch_0_num_3_loss_0.6914989824026403 model score is 0
epoch_3_num_16_loss_2.3076344839284118 model score is 0


In [15]:
# Persist the scores dict so it can be reloaded without re-running the models.
with open('gpt_scores.pkl', mode='wb') as f:
    pickle.dump(results, file=f)