imports

In [1]:
import ast
import seaborn as sns
from plotly.offline import download_plotlyjs, init_notebook_mode, plot
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from tg.grammar_ru.common import Loc
from tg.grammar_ru.corpus import CorpusReader, CorpusBuilder, BucketCorpusBalancer
from tg.grammar_ru.corpus.corpus_reader import read_data
import os
from pathlib import Path
from dotenv import load_dotenv
load_dotenv(Loc.root_path / 'environment.env')
from tg.grammar_ru.components.yandex_storage.s3_yandex_helpers import S3YandexHandler
from tg.grammar_ru.components.yandex_delivery.training_logs import S3TrainingLogsLoader, TrainingLogsViewer

from yo_fluq_ds import Queryable, Query, fluq
import plotly.express as px
from tg.grammar_ru.common import Separator

from typing import List, Union
import numpy as np
import torch
import math
import pandas as pd
from sklearn.metrics import confusion_matrix
from tg.common import DataBundle
from tg.common.ml.batched_training import IndexedDataBundle
from tg.grammar_ru.components.plain_context_builder import PlainContextBuilder

pd.set_option('display.max_rows', 500)



def get_tasks(bucket, tasks_list_s3_path):
    """Download a task-list file from S3 and return its parsed contents.

    The file is stored under ``Loc.temp_path`` using the last path component
    of ``tasks_list_s3_path`` as its name; its text is then parsed as a
    Python literal with ``ast.literal_eval``.
    """
    file_name = tasks_list_s3_path.split('/')[-1]
    local_copy = Loc.temp_path / file_name
    S3YandexHandler.download_file(bucket, tasks_list_s3_path, local_copy)
    with open(local_copy, 'r') as task_file:
        raw_text = task_file.read()
    return ast.literal_eval(raw_text)


def plot_metrics(metrics, title=""):
    """Plot the 'accuracy_display' and 'accuracy_test' curves on one figure.

    Each curve is obtained with ``TrainingLogsViewer.get_metric_by_job`` and
    labelled with its metric name; ``title`` becomes the figure title.
    """
    for metric_name in ('accuracy_display', 'accuracy_test'):
        curve = TrainingLogsViewer.get_metric_by_job(metrics, metric_name)
        plt.plot(curve, label=metric_name)
    plt.title(title)
    plt.legend()
    plt.show()

def plot_loss(metrics, title=""):
    """Plot the 'loss' curve extracted from ``metrics`` and show the figure."""
    loss_curve = TrainingLogsViewer.get_metric_by_job(metrics, 'loss')
    plt.plot(loss_curve, label='loss')
    plt.title(title)
    plt.legend()
    plt.show()


def plot_cm(cm):
    """Show ``cm`` as an annotated plotly heatmap.

    ``cm`` supplies the cell values and the cell text, its columns label the
    x axis and its index labels the y axis.
    """
    heatmap = go.Heatmap(
        z=cm,
        text=cm,
        x=cm.columns,
        y=cm.index,
        texttemplate="%{text}",
        colorscale='Blues',
    )
    go.Figure(data=heatmap).show()

def get_label(s):
    """Return the integer class id encoded in a score-column name.

    ``s`` must contain the separator ``'_label_'``; the text right after its
    first occurrence is parsed with ``int``, e.g. ``'predicted_label_7'``
    gives ``7``.
    """
    return int(s.split('_label_')[1])


def get_true_and_pred(result_df):
    """Derive true and predicted class ids from per-class score columns.

    Score columns are those whose name contains ``'predicted_label_'`` or
    ``'true_label_'``; the class id of a row is parsed from the name of the
    column holding the row maximum.

    Side effect: adds (or overwrites) the columns ``pred_label``,
    ``true_label`` and ``pred_score`` (the maximal predicted score) in
    ``result_df``.

    Returns:
        ``(y_true, y_pred)`` — two Series of integer class ids.
    """
    # The trailing underscore matters: it keeps the derived 'true_label'
    # column written below out of the selection, so the function can be
    # called again on a frame it has already processed.
    pred_col_names = [c for c in result_df.columns if 'predicted_label_' in c]
    true_col_names = [c for c in result_df.columns if 'true_label_' in c]

    pred_scores = result_df[pred_col_names]
    y_pred = pred_scores.idxmax(axis="columns").apply(get_label)
    y_true = result_df[true_col_names].idxmax(axis="columns").apply(get_label)
    top_score = pred_scores.max(axis=1)

    result_df['pred_label'] = y_pred
    result_df['true_label'] = y_true
    result_df['pred_score'] = top_score

    return y_true, y_pred

def get_worst_words_sents(result_df, src, true_label: int, pred_label: int, worst_words_cnt: int):
    """Find the most confident confusions of ``true_label`` with ``pred_label``.

    Rows of ``result_df`` with the given true/predicted pair are ranked by
    their ``predicted_label_<pred_label>`` score; the score of the
    ``worst_words_cnt``-th best row serves as a cut-off.

    Returns:
        ``(worst_words, worst_sents_df)`` — the first ``worst_words_cnt`` rows
        of ``src`` (columns ``word_id``, ``sentence_id``, ``word``) whose
        word_id passed the cut-off, and every ``src`` row belonging to the
        sentences of those words.
    """
    score_col = f'predicted_label_{pred_label}'
    is_confusion = (result_df.true_label == true_label) & (
        result_df.pred_label == pred_label)
    confusions = result_df[is_confusion]

    ranked_scores = confusions[score_col].sort_values(ascending=False)
    cutoff = ranked_scores.head(worst_words_cnt).min()
    top_confusions = confusions[confusions[score_col] >= cutoff]

    # Rows keep the order of src, so the slice below is not score-ordered.
    matching_src = src[src.word_id.isin(top_confusions.word_id)]
    worst_words = matching_src[['word_id', 'sentence_id', 'word']][:worst_words_cnt]

    sentence_ids = worst_words['sentence_id'].unique()
    worst_sents_df = src[src.sentence_id.isin(sentence_ids)]
    return worst_words, worst_sents_df

def get_best_words_sents(result_df, src, pred_label: int, words_cnt: int):
    """Find words the network classified correctly and with high confidence.

    Only rows whose true label AND predicted label both equal ``pred_label``
    are considered. They are ranked by their ``predicted_label_<pred_label>``
    score, and the score of the ``words_cnt``-th best row serves as a cut-off.

    Returns:
        ``(best_words, best_sents_df)`` — the first ``words_cnt`` rows of
        ``src`` (columns ``word_id``, ``sentence_id``, ``word``) whose word_id
        passed the cut-off, and every ``src`` row belonging to the sentences
        of those words.
    """
    score_col = f'predicted_label_{pred_label}'
    # Require the prediction itself to be right; filtering on the true label
    # alone would let mispredicted rows in whenever fewer than `words_cnt`
    # correct rows outscore them.
    is_correct = (result_df.true_label == pred_label) & (
        result_df.pred_label == pred_label)
    correct_df = result_df[is_correct]

    thrsh = correct_df[score_col].sort_values(
        ascending=False).head(words_cnt).min()
    best_scores = correct_df[correct_df[score_col] >= thrsh]

    # Rows keep the order of src, so the slice below is not score-ordered.
    best_words = (src[src.word_id.isin(best_scores.word_id)]
                  [['word_id', 'sentence_id', 'word']])[:words_cnt]
    best_sents = best_words.sentence_id.unique()
    best_sents_df = src[src.sentence_id.isin(best_sents)]
    return best_words, best_sents_df


In [2]:
# Identifiers used further down to build S3 paths and to load training logs.
project_name = 'agreementproject'
dataset_name = 'agreement_adj_mid50_0_declination'
bucket = 'agreementadjbucket'

Измененный бандл:
* Удалили slovnet
* Удалили ОЮ

### Filter bundle

In [3]:
# Endings of adjectives declined like "новый"; superlative-form endings and
# "ою" are not included here.
new = {'ая', 'ого', 'ое', 'ой', 'ом', 'ому',
       'ую', 'ые', 'ый', 'ым', 'ыми', 'ых'}

# Endings of adjectives declined like "хороший". Example words: полнейшей,
# наипрочнейшего, важнейшие, меньшим, милейший, наистраннейшее, новейших,
# малейшем, слабейшему, меньшими
good = {'ая', 'его', 'ее', 'ей', 'ем', 'ему',
        'ие', 'ий', 'им', 'ими', 'их', 'ую', 'яя', 'юю'}

# Endings of adjectives declined like "большой".
big = {'ая', 'ие', 'им', 'ими', 'их', 'ого',
       'ое', 'ой', 'ом', 'ому', 'ою', 'ую'}

POSSIBLE_ENDINGS = new | good | big

# A label id is the position of the ending in the sorted list of all endings.
endings_nums = dict(
    zip(sorted(POSSIBLE_ENDINGS), range(len(POSSIBLE_ENDINGS))))
num_by_ending = endings_nums
ending_by_num = dict(enumerate(sorted(POSSIBLE_ENDINGS)))

# Label ids that belong to each declension group.
new_declination_labels = {endings_nums[e] for e in new}
good_declination_labels = {endings_nums[e] for e in good}
big_declination_labels = {endings_nums[e] for e in big}


In [4]:
# ! python3 -m pip install eule

In [5]:
# Print how the three ending sets overlap, as computed by eule.euler.
# Requires the third-party `eule` package (see the commented-out pip command
# in the previous cell).
import eule
diagram = eule.euler({
    'new':list(new),
    'good':list(good),
    'big':list(big),
                      })
print(diagram)

ModuleNotFoundError: No module named 'eule'

В датасете только слова 1-го типа склонения («новый»). Возможны 12 окончаний.

In [None]:
# Ending -> label id, restricted to the endings of the `new` set.
new_num_by_ending = {e:num for e,num in num_by_ending.items() if e in new}
new_num_by_ending

Оставили только слова типа "Новый"

In [None]:
from tg.common import DataBundle
from tg.common.ml.batched_training import IndexedDataBundle
from tg.grammar_ru.components.plain_context_builder import PlainContextBuilder
# Local folder where the bundle filtered to declension type 0 is stored.
bundle_0_declination_path = Loc.data_cache_path/'bundles/agreement/mid50_0_declination'
# bundle_full_0_declination_path = Loc.data_cache_path/'bundles/agreement/full_mystemless_0_declination'


In [None]:
# db = DataBundle.load(Loc.data_cache_path/'bundles/agreement/mid50')
# ids_0_type=set(db.src[db.src.declension_type==0].word_id)
# db['index'] = db.index[db.index.word_id.isin(ids_0_type) & db.index.label.isin(new_declination_labels)]
# db = db.copy()
# db.save(bundle_0_declination_path)

In [None]:
# Build the filtered bundle from the mid50 bundle and save it to disk.
db = DataBundle.load(Loc.data_cache_path/'bundles/agreement/mid50')
# word_ids whose rows in src have declension_type == 0
ids_0_type=set(db.src[db.src.declension_type==0].word_id)
# Keep only index rows for those words whose label is one of the `new` endings.
db['index'] = db.index[db.index.word_id.isin(ids_0_type) & db.index.label.isin(new_declination_labels)]
# NOTE(review): presumably the copy detaches the bundle from the original frames — confirm.
db = db.copy()
db.save(bundle_0_declination_path)

Проверим отфильтрованный бандл

In [None]:
# Reload the filtered bundle from disk so the checks below inspect what was saved.
# del db
db = DataBundle.load(bundle_0_declination_path)


hist

In [None]:
# Histogram of the index labels, shown as endings instead of numeric ids.
import plotly.express as px
fig = px.histogram(db.index.label.replace(ending_by_num), histnorm=None)
fig.show()

In [None]:
# Declension types found among the words referenced by the index.
db.src[db.src.word_id.isin(db.index.word_id)].declension_type.unique()

In [None]:
# All endings that occur for words of declension type 0 ("новый").
db.index.label.replace(ending_by_num).unique()

Отправка бандла

In [None]:
# Upload target. dataset_name set here replaces the value assigned near the
# top of the notebook.
project_name = 'agreementproject'
# dataset_name = 'agreement_adj_mid50_0_declination'#tiny_0_declination
dataset_name = 'agreement_adj_toy'

bucket = 'agreementadjbucket'

In [None]:
from tg.grammar_ru.components.yandex_storage.s3_yandex_helpers import S3YandexHandler
# try:
#     S3YandexHandler.create_bucket(bucket)
# except:
#     pass 

In [None]:
# Upload the local bundle folder to the dataset location in the bucket.
# NOTE(review): with dataset_name == 'agreement_adj_toy' this uploads the
# mid50_0_declination bundle under the "toy" name — confirm that is intended.
s3path = f'datasphere/{project_name}/datasets/{dataset_name}'
S3YandexHandler.upload_folder(bucket, s3path, bundle_0_declination_path)

# Обучение

Добавим в сеть кросс-энтропию.

cross-entropy & softmax 40ep

In [None]:
# Load the logs and predictions of one training job, then plot its accuracy
# curves and the confusion matrix of raw counts.
tasks = get_tasks(bucket, 'datasphere/agreementproject/job_info/job_agreementproject_17:01:33.277101.txt')

loader = S3TrainingLogsLoader(bucket, project_name)
metrics = loader.load_metrics(tasks)

# Predictions are read from the unzipped results folder of the first task.
unzipped_folder = (Loc.root_path /
                   'temp'/'training_results' /
                   f'{tasks[0]}.unzipped')
result_df = pd.read_parquet(unzipped_folder/'output'/'result_df.parquet')
y_true, y_pred = get_true_and_pred(result_df)

plot_metrics(metrics, tasks[0])
# Take the labels from both series and pass them explicitly: confusion_matrix
# otherwise sizes itself by the union of true and predicted labels, so a class
# that is predicted but never true would not match the axis names below.
sorted_nums = sorted(set(y_true) | set(y_pred))
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums).round(2),
    columns=[f'pred {n,ending_by_num[n]}' for n in sorted_nums],
    index=[f'actual {n,ending_by_num[n]}' for n in sorted_nums]
)
plot_cm(cm)


In [None]:
# Names of the per-class score columns. The trailing underscore keeps the
# derived 'true_label' column, which get_true_and_pred writes into result_df,
# out of true_col_names.
pred_col_names = [c for c in result_df.columns if 'predicted_label_' in c]
true_col_names = [c for c in result_df.columns if 'true_label_' in c]

In [None]:
# metrics[metrics.metric=='accuracy_display']

### SoftmaxLess

В документации сказано, что CrossEntropyLoss ожидает из сети ненормированные числа. Поэтому удалили softmax

In [None]:
# Load the logs and predictions of one training job, then plot its accuracy
# curves and two confusion matrices: raw counts and row-normalised.
tasks = get_tasks(bucket,
 'datasphere/agreementproject/job_info/job_agreementproject_18:32:21.476408.txt')

loader = S3TrainingLogsLoader(bucket, project_name)
metrics = loader.load_metrics(tasks)

# Predictions are read from the unzipped results folder of the first task.
unzipped_folder = (Loc.root_path /
                   'temp'/'training_results' /
                   f'{tasks[0]}.unzipped')
result_df = pd.read_parquet(unzipped_folder/'output'/'result_df.parquet')
y_true, y_pred = get_true_and_pred(result_df)

plot_metrics(metrics, tasks[0])
# Take the labels from both series and pass them explicitly: confusion_matrix
# otherwise sizes itself by the union of true and predicted labels, so a class
# that is predicted but never true would not match the axis names below.
sorted_nums = sorted(set(y_true) | set(y_pred))
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums).round(2),
    columns=[f'pred {ending_by_num[n]}' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}' for n in sorted_nums]
)
plot_cm(cm)
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums,
                     normalize='true').round(2),
    columns=[f'pred {ending_by_num[n]}({n})' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}({n})' for n in sorted_nums]
)
plot_cm(cm)

#### Выводы

Отлично обучилась, за исключением самого редкого класса - ОМУ

In [None]:
# Load the unfiltered mid50 bundle; its src frame supplies the sentences displayed below.
db = DataBundle.load(Loc.data_cache_path/'bundles/agreement/mid50')


In [None]:
# Show the sentences with the most confident confusions of one ending with
# another; the confused words are highlighted in red.
from tg.grammar_ru.common import Separator

# Label ids index into ending_by_num; with the ending sets defined earlier in
# this notebook, 15 is 'ому' and 20 is 'ым'.
true_label = 15
pred_label = 20
worst_words, worst_sents_df = get_worst_words_sents(
    result_df, db.src, true_label=true_label, pred_label=pred_label, worst_words_cnt=4)
print(f"Predicted  {ending_by_num[pred_label]}  instead of  {ending_by_num[true_label]} " )
Separator.Viewer().tooltip("word_id").color('word_id',
                                            value_to_color={
                                                wid: 'red' for wid in worst_words.word_id}
                                            ).to_html_display(worst_sents_df)


In [None]:
# How often each top predicted score occurs, after rounding to 6 decimals.
result_df.pred_score.round(6).value_counts()

In [None]:
# Names of the per-class score columns. The trailing underscore keeps the
# derived 'true_label' column, which get_true_and_pred writes into result_df,
# out of true_col_names.
pred_col_names = [c for c in result_df.columns if 'predicted_label_' in c]
true_col_names = [c for c in result_df.columns if 'true_label_' in c]

In [None]:
# Per-class scores of the rows whose top predicted score is below 0.001.
result_df[result_df.pred_score<0.001][pred_col_names]

In [None]:
#TODO context_size =7 
# todo weight of class in CE


In [None]:
# Text report of per-class metrics for the labels stored in result_df.
from sklearn.metrics import classification_report

print(classification_report(result_df.true_label, result_df.pred_label))

### Добавим веса классов в кросс-энтропию

In [None]:
def get_class_weights(db):
    """Return per-class loss weights that up-weight rare classes.

    Classes are the distinct values of ``db.index.label`` in ascending label
    order. Each weight is inversely proportional to the number of rows of
    that class, and the weights are rescaled so that they sum to the number
    of classes.

    NOTE(review): labels that never occur in ``db.index`` get no entry, so a
    tensor position equals the label id only when the labels are contiguous
    from zero — confirm against the loss that consumes these weights.
    """
    counts = db.index.label.value_counts().sort_index()
    # Invert the frequencies: weighting by the raw counts would favour the
    # already dominant classes instead of compensating for the imbalance.
    inverse_freq = 1.0 / torch.tensor(list(counts)).float()
    return (inverse_freq / inverse_freq.sum()) * len(inverse_freq)




In [None]:
# Load the bundle filtered to declension type 0 from the local cache.
bundle_0_declination_path = Loc.data_cache_path/'bundles/agreement/mid50_0_declination'
db = DataBundle.load(bundle_0_declination_path)


In [None]:
# Number of rows in the bundle's src frame.
len(db.src)

In [None]:
# Class weights computed from the label counts of the loaded bundle.
get_class_weights(db)

CE weighted

In [None]:
# Load the logs and predictions of one training job, then plot its accuracy
# curves and two confusion matrices: raw counts and row-normalised.
tasks = get_tasks(bucket,
 'datasphere/agreementproject/job_info/job_agreementproject_07:23:35.273901.txt')

loader = S3TrainingLogsLoader(bucket, project_name)
metrics = loader.load_metrics(tasks)

# Predictions are read from the unzipped results folder of the first task.
unzipped_folder = (Loc.root_path /
                   'temp'/'training_results' /
                   f'{tasks[0]}.unzipped')
result_df = pd.read_parquet(unzipped_folder/'output'/'result_df.parquet')
y_true, y_pred = get_true_and_pred(result_df)

plot_metrics(metrics, tasks[0])
# Take the labels from both series and pass them explicitly: confusion_matrix
# otherwise sizes itself by the union of true and predicted labels, so a class
# that is predicted but never true would not match the axis names below.
sorted_nums = sorted(set(y_true) | set(y_pred))
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums).round(2),
    columns=[f'pred {ending_by_num[n]}' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}' for n in sorted_nums]
)
plot_cm(cm)
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums,
                     normalize='true').round(2),
    columns=[f'pred {ending_by_num[n]}({n})' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}({n})' for n in sorted_nums]
)
plot_cm(cm)

##### Выводы

Стало хуже.
Возможно, потому что распределение в батче (и тем более в мини-батче) отличается от распределения во всём датасете.

#

Context size = 7

In [None]:
# Load the logs and predictions of one training job, then plot its accuracy
# curves and two confusion matrices: raw counts and row-normalised.
tasks = get_tasks(bucket,
 'datasphere/agreementproject/job_info/job_agreementproject_08:02:06.256541.txt')

loader = S3TrainingLogsLoader(bucket, project_name)
metrics = loader.load_metrics(tasks)

# Predictions are read from the unzipped results folder of the first task.
unzipped_folder = (Loc.root_path /
                   'temp'/'training_results' /
                   f'{tasks[0]}.unzipped')
result_df = pd.read_parquet(unzipped_folder/'output'/'result_df.parquet')
y_true, y_pred = get_true_and_pred(result_df)

plot_metrics(metrics, tasks[0])
# Take the labels from both series and pass them explicitly: confusion_matrix
# otherwise sizes itself by the union of true and predicted labels, so a class
# that is predicted but never true would not match the axis names below.
sorted_nums = sorted(set(y_true) | set(y_pred))
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums).round(2),
    columns=[f'pred {ending_by_num[n]}' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}' for n in sorted_nums]
)
plot_cm(cm)
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums,
                     normalize='true').round(2),
    columns=[f'pred {ending_by_num[n]}({n})' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}({n})' for n in sorted_nums]
)
plot_cm(cm)

In [None]:
# Loss curve for whatever job `metrics` currently holds.
plot_loss(metrics)

In [None]:
# TrainingLogsViewer.get_metric_by_job(metrics, 'loss')


In [None]:
#TODO: когда менял размер контекста, заметил, что assembly point создаётся дважды. Создал 1 раз. Могут ли из-за этого быть такие артефакты?

CE Smless twice AP CS=7

In [None]:
# Load the logs and predictions of one training job, then plot its accuracy
# curves and two confusion matrices: raw counts and row-normalised.
tasks = get_tasks(bucket,
 'datasphere/agreementproject/job_info/job_agreementproject_09:13:13.607354.txt')

loader = S3TrainingLogsLoader(bucket, project_name)
metrics = loader.load_metrics(tasks)

# Predictions are read from the unzipped results folder of the first task.
unzipped_folder = (Loc.root_path /
                   'temp'/'training_results' /
                   f'{tasks[0]}.unzipped')
result_df = pd.read_parquet(unzipped_folder/'output'/'result_df.parquet')
y_true, y_pred = get_true_and_pred(result_df)

plot_metrics(metrics, tasks[0])
# Take the labels from both series and pass them explicitly: confusion_matrix
# otherwise sizes itself by the union of true and predicted labels, so a class
# that is predicted but never true would not match the axis names below.
sorted_nums = sorted(set(y_true) | set(y_pred))
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums).round(2),
    columns=[f'pred {ending_by_num[n]}' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}' for n in sorted_nums]
)
plot_cm(cm)
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums,
                     normalize='true').round(2),
    columns=[f'pred {ending_by_num[n]}({n})' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}({n})' for n in sorted_nums]
)
plot_cm(cm)

In [None]:
# TrainingLogsViewer.get_metric_by_job(metrics, 'loss')


1 AP CS=7

In [None]:
# Load the logs and predictions of one training job, then plot its accuracy
# curves and two confusion matrices: raw counts and row-normalised.
tasks = get_tasks(bucket,
 'datasphere/agreementproject/job_info/job_agreementproject_08:55:47.079040.txt')

loader = S3TrainingLogsLoader(bucket, project_name)
metrics = loader.load_metrics(tasks)

# Predictions are read from the unzipped results folder of the first task.
unzipped_folder = (Loc.root_path /
                   'temp'/'training_results' /
                   f'{tasks[0]}.unzipped')
result_df = pd.read_parquet(unzipped_folder/'output'/'result_df.parquet')
y_true, y_pred = get_true_and_pred(result_df)

plot_metrics(metrics, tasks[0])
# Take the labels from both series and pass them explicitly: confusion_matrix
# otherwise sizes itself by the union of true and predicted labels, so a class
# that is predicted but never true would not match the axis names below.
sorted_nums = sorted(set(y_true) | set(y_pred))
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums).round(2),
    columns=[f'pred {ending_by_num[n]}' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}' for n in sorted_nums]
)
plot_cm(cm)
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums,
                     normalize='true').round(2),
    columns=[f'pred {ending_by_num[n]}({n})' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}({n})' for n in sorted_nums]
)
plot_cm(cm)

twice AP CS=6

In [None]:
# Load the logs and predictions of one training job, then plot its accuracy
# curves and two confusion matrices: raw counts and row-normalised.
tasks = get_tasks(bucket,
 'datasphere/agreementproject/job_info/job_agreementproject_10:46:51.835344.txt')

loader = S3TrainingLogsLoader(bucket, project_name)
metrics = loader.load_metrics(tasks)

# Predictions are read from the unzipped results folder of the first task.
unzipped_folder = (Loc.root_path /
                   'temp'/'training_results' /
                   f'{tasks[0]}.unzipped')
result_df = pd.read_parquet(unzipped_folder/'output'/'result_df.parquet')
y_true, y_pred = get_true_and_pred(result_df)

plot_metrics(metrics, tasks[0])
# Take the labels from both series and pass them explicitly: confusion_matrix
# otherwise sizes itself by the union of true and predicted labels, so a class
# that is predicted but never true would not match the axis names below.
sorted_nums = sorted(set(y_true) | set(y_pred))
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums).round(2),
    columns=[f'pred {ending_by_num[n]}' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}' for n in sorted_nums]
)
plot_cm(cm)
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums,
                     normalize='true').round(2),
    columns=[f'pred {ending_by_num[n]}({n})' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}({n})' for n in sorted_nums]
)
plot_cm(cm)

Вывод: примерно воспроизвели результат

twice AP CS=5

In [None]:
# Load the logs and predictions of one training job, then plot its accuracy
# curves and two confusion matrices: raw counts and row-normalised.
tasks = get_tasks(bucket,
 'datasphere/agreementproject/job_info/job_agreementproject_11:31:29.335750.txt')

loader = S3TrainingLogsLoader(bucket, project_name)
metrics = loader.load_metrics(tasks)

# Predictions are read from the unzipped results folder of the first task.
unzipped_folder = (Loc.root_path /
                   'temp'/'training_results' /
                   f'{tasks[0]}.unzipped')
result_df = pd.read_parquet(unzipped_folder/'output'/'result_df.parquet')
y_true, y_pred = get_true_and_pred(result_df)

plot_metrics(metrics, tasks[0])
# Take the labels from both series and pass them explicitly: confusion_matrix
# otherwise sizes itself by the union of true and predicted labels, so a class
# that is predicted but never true would not match the axis names below.
sorted_nums = sorted(set(y_true) | set(y_pred))
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums).round(2),
    columns=[f'pred {ending_by_num[n]}' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}' for n in sorted_nums]
)
plot_cm(cm)
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums,
                     normalize='true').round(2),
    columns=[f'pred {ending_by_num[n]}({n})' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}({n})' for n in sorted_nums]
)
plot_cm(cm)

Context = 20

In [None]:
# Load the logs and predictions of one training job, then plot its accuracy
# curves and two confusion matrices: raw counts and row-normalised.
tasks = get_tasks(bucket,
 'datasphere/agreementproject/job_info/job_agreementproject_08:39:36.084751.txt')

loader = S3TrainingLogsLoader(bucket, project_name)
metrics = loader.load_metrics(tasks)

# Predictions are read from the unzipped results folder of the first task.
unzipped_folder = (Loc.root_path /
                   'temp'/'training_results' /
                   f'{tasks[0]}.unzipped')
result_df = pd.read_parquet(unzipped_folder/'output'/'result_df.parquet')
y_true, y_pred = get_true_and_pred(result_df)

plot_metrics(metrics, tasks[0])
# Take the labels from both series and pass them explicitly: confusion_matrix
# otherwise sizes itself by the union of true and predicted labels, so a class
# that is predicted but never true would not match the axis names below.
sorted_nums = sorted(set(y_true) | set(y_pred))
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums).round(2),
    columns=[f'pred {ending_by_num[n]}' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}' for n in sorted_nums]
)
plot_cm(cm)
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums,
                     normalize='true').round(2),
    columns=[f'pred {ending_by_num[n]}({n})' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}({n})' for n in sorted_nums]
)
plot_cm(cm)

Выводы:
Увеличение контекста не позволило избавиться от неверного предсказания "ОМУ".
Влияет дисбаланс классов.

### GOOD

Бандл для "ХОРОШИЙ": 26000 слов в индексе, 2.5 млн в src.

In [None]:
# Load the mid50_1_declination bundle from the local cache.
bundle_1_declination_path = Loc.data_cache_path/'bundles/agreement/mid50_1_declination'
db = DataBundle.load(bundle_1_declination_path)


In [None]:
# Drop the bundle reference (presumably to free memory — TODO confirm).
del db

In [None]:
# Histogram of the index labels, shown as endings instead of numeric ids.
# NOTE(review): the preceding cell runs `del db`; executed top to bottom this
# cell raises NameError unless the bundle is loaded again first.
import plotly.express as px
fig = px.histogram(db.index.label.replace(ending_by_num), histnorm=None)
fig.show()

In [None]:
# Load the logs and predictions of one training job, then plot its accuracy
# curves and two confusion matrices: raw counts and row-normalised.
tasks = get_tasks(bucket,
 'datasphere/agreementproject/job_info/job_agreementproject_09:38:25.795641.txt')

loader = S3TrainingLogsLoader(bucket, project_name)
metrics = loader.load_metrics(tasks)

# Predictions are read from the unzipped results folder of the first task.
unzipped_folder = (Loc.root_path /
                   'temp'/'training_results' /
                   f'{tasks[0]}.unzipped')
result_df = pd.read_parquet(unzipped_folder/'output'/'result_df.parquet')
y_true, y_pred = get_true_and_pred(result_df)

plot_metrics(metrics, tasks[0])
# Take the labels from both series and pass them explicitly: confusion_matrix
# otherwise sizes itself by the union of true and predicted labels, so a class
# that is predicted but never true would not match the axis names below.
sorted_nums = sorted(set(y_true) | set(y_pred))
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums).round(2),
    columns=[f'pred {ending_by_num[n]}' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}' for n in sorted_nums]
)
plot_cm(cm)
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums,
                     normalize='true').round(2),
    columns=[f'pred {ending_by_num[n]}({n})' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}({n})' for n in sorted_nums]
)
plot_cm(cm)

Выводы:
Аналогично "NEW": редкие классы не предсказываются.

#### GOOD stratified

In [None]:
# Load the logs and predictions of one training job, then plot its accuracy
# curves and two confusion matrices: raw counts and row-normalised.
tasks = get_tasks(bucket,
 'datasphere/agreementproject/job_info/job_agreementproject_09:40:53.355391.txt')

loader = S3TrainingLogsLoader(bucket, project_name)
metrics = loader.load_metrics(tasks)

# Predictions are read from the unzipped results folder of the first task.
unzipped_folder = (Loc.root_path /
                   'temp'/'training_results' /
                   f'{tasks[0]}.unzipped')
result_df = pd.read_parquet(unzipped_folder/'output'/'result_df.parquet')
y_true, y_pred = get_true_and_pred(result_df)

plot_metrics(metrics, tasks[0])
# Take the labels from both series and pass them explicitly: confusion_matrix
# otherwise sizes itself by the union of true and predicted labels, so a class
# that is predicted but never true would not match the axis names below.
sorted_nums = sorted(set(y_true) | set(y_pred))
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums).round(2),
    columns=[f'pred {ending_by_num[n]}' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}' for n in sorted_nums]
)
plot_cm(cm)
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums,
                     normalize='true').round(2),
    columns=[f'pred {ending_by_num[n]}({n})' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}({n})' for n in sorted_nums]
)
plot_cm(cm)

#### GOOD 15

In [None]:
# Load the logs and predictions of one training job, then plot its accuracy
# curves and two confusion matrices: raw counts and row-normalised.
tasks = get_tasks(bucket,
 'datasphere/agreementproject/job_info/job_agreementproject_10:51:38.171789.txt')

loader = S3TrainingLogsLoader(bucket, project_name)
metrics = loader.load_metrics(tasks)

# Predictions are read from the unzipped results folder of the first task.
unzipped_folder = (Loc.root_path /
                   'temp'/'training_results' /
                   f'{tasks[0]}.unzipped')
result_df = pd.read_parquet(unzipped_folder/'output'/'result_df.parquet')
y_true, y_pred = get_true_and_pred(result_df)

plot_metrics(metrics, tasks[0])
# Take the labels from both series and pass them explicitly: confusion_matrix
# otherwise sizes itself by the union of true and predicted labels, so a class
# that is predicted but never true would not match the axis names below.
sorted_nums = sorted(set(y_true) | set(y_pred))
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums).round(2),
    columns=[f'pred {ending_by_num[n]}' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}' for n in sorted_nums]
)
plot_cm(cm)
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums,
                     normalize='true').round(2),
    columns=[f'pred {ending_by_num[n]}({n})' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}({n})' for n in sorted_nums]
)
plot_cm(cm)

### "NEW" Stratified

In [None]:
# Load the logs and predictions of one training job, then plot its accuracy
# curves and two confusion matrices: raw counts and row-normalised.
tasks = get_tasks(bucket,
 'datasphere/agreementproject/job_info/job_agreementproject_16:24:22.924230.txt')

loader = S3TrainingLogsLoader(bucket, project_name)
metrics = loader.load_metrics(tasks)

# Predictions are read from the unzipped results folder of the first task.
unzipped_folder = (Loc.root_path /
                   'temp'/'training_results' /
                   f'{tasks[0]}.unzipped')
result_df = pd.read_parquet(unzipped_folder/'output'/'result_df.parquet')
y_true, y_pred = get_true_and_pred(result_df)

plot_metrics(metrics, tasks[0])
# Take the labels from both series and pass them explicitly: confusion_matrix
# otherwise sizes itself by the union of true and predicted labels, so a class
# that is predicted but never true would not match the axis names below.
sorted_nums = sorted(set(y_true) | set(y_pred))
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums).round(2),
    columns=[f'pred {ending_by_num[n]}' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}' for n in sorted_nums]
)
plot_cm(cm)
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums,
                     normalize='true').round(2),
    columns=[f'pred {ending_by_num[n]}({n})' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}({n})' for n in sorted_nums]
)
plot_cm(cm)

In [None]:
from sklearn.metrics import classification_report

# print(classification_report(y_true, y_pred))

### "BIG" Stratified

In [None]:
# Load the logs and predictions of one training job, then plot its accuracy
# curves and two confusion matrices: raw counts and row-normalised.
tasks = get_tasks(bucket,
 'datasphere/agreementproject/job_info/job_agreementproject_11:59:32.942012.txt')

loader = S3TrainingLogsLoader(bucket, project_name)
metrics = loader.load_metrics(tasks)

# Predictions are read from the unzipped results folder of the first task.
unzipped_folder = (Loc.root_path /
                   'temp'/'training_results' /
                   f'{tasks[0]}.unzipped')
result_df = pd.read_parquet(unzipped_folder/'output'/'result_df.parquet')
y_true, y_pred = get_true_and_pred(result_df)

plot_metrics(metrics, tasks[0])
# Take the labels from both series and pass them explicitly: confusion_matrix
# otherwise sizes itself by the union of true and predicted labels, so a class
# that is predicted but never true would not match the axis names below.
sorted_nums = sorted(set(y_true) | set(y_pred))
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums).round(2),
    columns=[f'pred {ending_by_num[n]}' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}' for n in sorted_nums]
)
plot_cm(cm)
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums,
                     normalize='true').round(2),
    columns=[f'pred {ending_by_num[n]}({n})' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}({n})' for n in sorted_nums]
)
plot_cm(cm)

#### Вывод:
Отлично обучилась. Надо выкинуть "ОЮ" и будет amazing success.

In [None]:
# Load the logs and predictions of one training job, then plot its accuracy
# curves and two confusion matrices: raw counts and row-normalised.
tasks = get_tasks(bucket,
 'datasphere/agreementproject/job_info/job_agreementproject_08:51:22.641978.txt')

loader = S3TrainingLogsLoader(bucket, project_name)
metrics = loader.load_metrics(tasks)

# Predictions are read from the unzipped results folder of the first task.
unzipped_folder = (Loc.root_path /
                   'temp'/'training_results' /
                   f'{tasks[0]}.unzipped')
result_df = pd.read_parquet(unzipped_folder/'output'/'result_df.parquet')
y_true, y_pred = get_true_and_pred(result_df)

plot_metrics(metrics, tasks[0])
# Take the labels from both series and pass them explicitly: confusion_matrix
# otherwise sizes itself by the union of true and predicted labels, so a class
# that is predicted but never true would not match the axis names below.
sorted_nums = sorted(set(y_true) | set(y_pred))
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums).round(2),
    columns=[f'pred {ending_by_num[n]}' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}' for n in sorted_nums]
)
plot_cm(cm)
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=sorted_nums,
                     normalize='true').round(2),
    columns=[f'pred {ending_by_num[n]}({n})' for n in sorted_nums],
    index=[f'actual {ending_by_num[n]}({n})' for n in sorted_nums]
)
plot_cm(cm)