In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
from fastai.text.all import *
from utils import *

In [None]:
# Sequence length and batch size; both are reused further down when the
# DataLoaders and the classifier are built.
seq_len, batch_size = 400, 128

In [None]:
# Load the raw training set. get_dataset is not defined in this notebook —
# presumably it comes from the star-import of `utils`; confirm.
dataset_train = get_dataset()

In [None]:
# Parse the raw dataset and drop every entry for which parse_docs produced None.
parsed_train = [doc for doc in parse_docs(dataset_train) if doc is not None]

In [None]:
def one_hot_encode_factor(factor):
    """Expand one factor record into its individual label strings.

    Args:
        factor: indexable record; ``factor[0]`` is the common prefix and
            ``factor[1]`` is an iterable of suffixes.

    Returns:
        list with ``factor[0] + suffix`` for every suffix, in iteration order.
    """
    return [factor[0] + suffix for suffix in factor[1]]

In [None]:
def npa_to_items(npa):
    """Turn one parsed document record into ``(name, text, factors)`` rows.

    Args:
        npa: indexable record. ``npa[1]`` is passed to ``docx.Document`` and
            is also used (via ``str``) as the row name. ``npa[2]`` is an
            iterable of factor records; each record ``f`` is expanded into
            label strings by ``one_hot_encode_factor`` and filed under the
            point identifier ``f[2]``.

    Returns:
        list of ``(str(npa[1]), point_text, labels)`` tuples, one per point of
        every mapping yielded by ``parse_npa``. ``point_text`` is the point's
        text pieces joined with newlines. ``labels`` is the comma-joined label
        list of that point, or ``''`` when the point has no factor.
    """
    factors_tree = {}
    for f in npa[2]:
        # Several factor records may refer to the same point. Accumulate their
        # labels instead of overwriting, otherwise all but the last record of
        # a point would be silently dropped from the training target.
        factors_tree.setdefault(f[2], []).extend(one_hot_encode_factor(f))

    items = []
    for doc in parse_npa(docx.Document(npa[1])):
        for point, point_text in doc.items():
            # Point identifiers are looked up without surrounding spaces/dots.
            # NOTE(review): the keys taken from f[2] are not normalised the
            # same way — confirm they already come without trailing dots.
            point_factors = factors_tree.get(point.strip(' .'), [])
            items.append((str(npa[1]), '\n'.join(point_text), ','.join(point_factors)))
    return items

In [None]:
# Convert every parsed document into rows using 32 pool workers; tqdm wraps
# the result iterator to show progress over len(parsed_train) documents.
with Pool(32) as pool:
    all_items_orig = list(tqdm(pool.imap(npa_to_items, parsed_train), total=len(parsed_train)))


In [None]:
# Flatten the per-document row lists into one flat list of rows.
all_items = [row for doc_rows in all_items_orig for row in doc_rows]

In [None]:
# One row per document point: source name, point text and the comma-separated
# factor labels.
column_names = ['name', 'text', 'factors']
df_train = pd.DataFrame(all_items, columns=column_names)

# A variant that tagged rows with `is_valid` and concatenated a separate
# df_test is currently disabled, so the working frame is the training frame.
df = df_train

In [None]:
def filter_df(df, no_corr_coef=1.):
    """Down-sample rows without labels relative to the rows that have labels.

    Args:
        df: DataFrame with a string column ``factors``. A row counts as
            labelled when that string is non-empty and as unlabelled when it
            is the empty string.
        no_corr_coef: requested ratio of unlabelled to labelled rows in the
            result.

    Returns:
        New DataFrame with a fresh index: all labelled rows followed by a
        random sample (``random_state=42``) of unlabelled rows. The sample
        size is ``int(len(labelled) * no_corr_coef)``, capped at the number of
        unlabelled rows available.
    """
    label_len = df['factors'].str.len()
    corr = df[label_len > 0]
    no_corr_pool = df[label_len == 0]

    # DataFrame.sample (without replacement) raises ValueError when asked for
    # more rows than exist, so never request more than the pool holds.
    n_no_corr = min(int(len(corr) * no_corr_coef), len(no_corr_pool))
    no_corr = no_corr_pool.sample(n_no_corr, random_state=42)
    return pd.concat([corr, no_corr], ignore_index=True)

In [None]:
# Read the pickled vocabulary that is passed to the TextBlock below.
# NOTE: pickle.load can execute arbitrary code — only load a trusted vocab.pkl.
with open('./models/vocab.pkl', 'rb') as fh:
    vocab = pickle.load(fh)

In [None]:
# Multi-label text pipeline: x is the 'text' column (tokenised with the
# pre-built vocab), y is the 'factors' column split on commas.
dblock = DataBlock(
    blocks=(TextBlock.from_df('text', seq_len=seq_len, vocab=vocab), MultiCategoryBlock),
    get_x=ColReader('text'),
    get_y=ColReader('factors', label_delim=','),
    # Seed the 80/20 split so the validation set — and therefore every metric
    # reported below — is reproducible across kernel restarts, in line with
    # the fixed random_state used by filter_df.
    splitter=RandomSplitter(0.2, seed=42),
)

# Train on all labelled rows plus half as many unlabelled rows.
dls = dblock.dataloaders(filter_df(df, 0.5), bs=batch_size)

In [None]:
# Multi-label metrics reported after every epoch.
clf_metrics = (accuracy_multi, F1ScoreMulti(), PrecisionMulti(), RecallMulti())

# AWD_LSTM classifier created with pretrained=False and switched to fp16.
learn = text_classifier_learner(
    dls,
    AWD_LSTM,
    drop_mult=0.5,
    metrics=clf_metrics,
    seq_len=seq_len,
    pretrained=False,
).to_fp16()

# Make every layer group trainable from the start.
learn.unfreeze()

In [None]:
# Load the encoder weights saved under the name 'finetuned' into the classifier
# (presumably produced by an earlier language-model fine-tuning run — confirm).
learn = learn.load_encoder('finetuned')

In [None]:
# Run the learning-rate finder; its output informs the rate passed to
# fit_one_cycle in the next cell.
learn.lr_find()

In [None]:
# One-cycle training: 50 epochs with a peak learning rate of 0.004.
learn.fit_one_cycle(n_epoch=50, lr_max=0.004)

In [None]:
# Collect model outputs; the result is unpacked below as (predictions, targets).
# NOTE(review): called without arguments — presumably this evaluates the
# validation split; confirm.
preds = learn.get_preds()

In [None]:
# Label vocabulary of the target block; one result bucket per label.
v = learn.dls.vocab[1]
factor_results = {label: {'gt': [], 'prob': [], 'pred': []} for label in v}

# A label counts as predicted when its score is strictly above this value.
threshold = 0.5

# counts[label] = number of samples for which the label was predicted.
counts = {}
pds, gts = preds
for sample_probs, sample_targets in zip(pds, gts):
    for prob, target, label in zip(sample_probs, sample_targets, v):
        is_positive = prob > threshold
        bucket = factor_results[label]
        bucket['gt'].append(target.item())
        bucket['prob'].append(prob.item())
        bucket['pred'].append(int(is_positive))
        if is_positive:
            counts[label] = counts.get(label, 0) + 1
print(sorted(counts.items()))

In [None]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, roc_auc_score

# Per-label metric table built from the buckets collected in factor_results:
# thresholded predictions feed F1/accuracy/recall/precision, raw scores feed
# ROC AUC.
report = []
for factor_name, factor_result in factor_results.items():
    y_true = factor_result['gt']
    y_pred = factor_result['pred']
    y_prob = factor_result['prob']

    # ROC AUC is undefined when the ground truth holds a single class (e.g. a
    # rare label absent from the evaluated split); roc_auc_score can raise in
    # that case, which would abort the whole report. Report NaN instead.
    if len(set(y_true)) > 1:
        roc_auc = roc_auc_score(y_true, y_prob)
    else:
        roc_auc = float('nan')

    report.append((
        factor_name,
        f1_score(y_true, y_pred),
        accuracy_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        roc_auc,
    ))

pd.DataFrame(report, columns=['Фактор', 'F1', 'Accuracy', 'Recall', 'Precision', 'ROCAUC'])

In [None]:
# Export the trained learner to './models/model_50'
# (the "50" presumably refers to the 50 training epochs — confirm).
learn.export('./models/model_50')