In [1]:
import polars as pl
from datasets import load_dataset, Dataset
import re
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, EvalPrediction
from sklearn.metrics import (
    f1_score,
    roc_auc_score,
    accuracy_score,
    multilabel_confusion_matrix,
    confusion_matrix,
    ConfusionMatrixDisplay,
)
import matplotlib.pyplot as plt
import torch
import pandas as pd
from functools import partial
from src.config import compose_config
from hydra.utils import instantiate
from typing import Any

pd.set_option("display.max_rows", 500)

In [2]:
cfg = compose_config(overrides=["+model_init.problem_type='multi_label_classification'"])
prepr_conf = cfg["preprocessing"]
train_conf = cfg["training"]
model_conf = cfg["model_init"]



In [3]:
frame = pl.read_csv("data/practice_cleaned.csv")
frame

Направление,Факультет,ID студента,Оценка,Категория,Тег,Комментарий,Статус
str,i64,i64,i64,str,str,str,str
"""C""",113,1493,1,"""Видео""","""VP2""","""Видео лагает""",
"""C""",113,5580,5,"""ДЗ""","""H3 D""","""Торгом Бабаян!…",
"""E""",126,5619,5,"""ДЗ""","""H3""","""Спасибо)""",
"""E""",123,310,3,"""ДЗ""","""H2 E1""","""комментарий со…",
"""E""",123,1913,5,"""ДЗ""","""H3 D""","""Жонибек, хочу …",
"""E""",123,4169,5,"""ДЗ""","""H3""","""Все круто. Жон…",
"""E""",123,4169,5,"""ДЗ""","""H3""","""Валерия- отлич…",
"""E""",109,,3,,"""S3 H3 VC2""","""Кураторы работ…",
"""E""",123,,3,,"""T2 E1""","""""Расскажу конк…",
"""E""",126,5537,1,"""Видео""","""VP2""","""Автор не объяс…",


In [4]:
frame[['Комментарий', 'Категория', 'Тег']].to_pandas()

Unnamed: 0,Комментарий,Категория,Тег
0,Видео лагает,Видео,VP2
1,Торгом Бабаян! Спасибо вам большое за помощь в...,ДЗ,H3 D
2,Спасибо),ДЗ,H3
3,комментарий содержит нерелевантную информацию ...,ДЗ,H2 E1
4,"Жонибек, хочу Вас поблагодарить за ваши советы...",ДЗ,H3 D
...,...,...,...
56124,требуемый формат иконок платный,ДЗ,H2
56125,заплатила и дальше просто никому нет дела поче...,,S1
56126,Крайне раздражают некоторые детали)\nНапример ...,,LMS
56127,321.Профессия Бизнес-аналитик\nАналитик данных...,,VC2 VP2


In [5]:
frame = frame.with_columns(
    (pl.col("Тег").apply(lambda x: " ".join(re.findall(r"[A-Z]{1,2}\d|LMS", x)))).alias("corrected_tag")
    )

  (pl.col("Тег").apply(lambda x: " ".join(re.findall(r"[A-Z]{1,2}\d|LMS", x)))).alias("corrected_tag")


In [6]:
null_filter = (
    (pl.col("corrected_tag").eq(""))
)

frame = frame.filter(~null_filter)
frame = frame.filter(~(pl.col("Комментарий").is_null()))

In [7]:
frame = frame.with_columns(
    pl.col("corrected_tag")
    .str.replace_all(r"VC4|VP4|VC5|S4|T4|H4|EA1", "")
    .str.strip()
    .str.replace(r"\s\s+", " ")
    .str.replace(r"GH3", "H3")
    .str.replace(r"HH3", "H3")
    .str.replace(r"BP3", "VP3")
    .str.replace(r"V3", "VC3")
    .str.replace(r"V2", "VP2")
)

frame = frame.filter(~(pl.col("corrected_tag").eq("")))

  .str.strip()


In [8]:
frame["corrected_tag"].str.split(by = " ").explode().value_counts(sort=True)

corrected_tag,counts
str,u32
"""H3""",20750
"""VC2""",14524
"""VC3""",8267
"""VP3""",5082
"""VP2""",4925
"""VC1""",3810
"""H2""",2757
"""E1""",1799
"""VP1""",1695
"""H1""",1310


In [9]:
frame = frame.filter(~pl.col("corrected_tag").str.contains("E2"))

In [10]:
frame

Направление,Факультет,ID студента,Оценка,Категория,Тег,Комментарий,Статус,corrected_tag
str,i64,i64,i64,str,str,str,str,str
"""C""",113,1493,1,"""Видео""","""VP2""","""Видео лагает""",,"""VP2"""
"""C""",113,5580,5,"""ДЗ""","""H3 D""","""Торгом Бабаян!…",,"""H3"""
"""E""",126,5619,5,"""ДЗ""","""H3""","""Спасибо)""",,"""H3"""
"""E""",123,310,3,"""ДЗ""","""H2 E1""","""комментарий со…",,"""H2 E1"""
"""E""",123,1913,5,"""ДЗ""","""H3 D""","""Жонибек, хочу …",,"""H3"""
"""E""",123,4169,5,"""ДЗ""","""H3""","""Все круто. Жон…",,"""H3"""
"""E""",123,4169,5,"""ДЗ""","""H3""","""Валерия- отлич…",,"""H3"""
"""E""",109,,3,,"""S3 H3 VC2""","""Кураторы работ…",,"""S3 H3 VC2"""
"""E""",123,,3,,"""T2 E1""","""""Расскажу конк…",,"""T2 E1"""
"""E""",126,5537,1,"""Видео""","""VP2""","""Автор не объяс…",,"""VP2"""


In [11]:
def remove_sub_tags(tags: str):
    split = tags.split(sep=" ")
    new_tag = [x[:-1] if x[-1].isdigit() else x for x in split]
    return " ".join(new_tag)

frame = frame.with_columns(
    pl.col("corrected_tag").apply(remove_sub_tags)
)

frame["corrected_tag"].str.split(by = " ").explode().value_counts(sort=True).transpose(column_names='corrected_tag').to_pandas()

  pl.col("corrected_tag").apply(remove_sub_tags)


Unnamed: 0,VC,H,VP,S,E,T,LMS
0,26601,24816,11702,1981,1799,1187,671


In [12]:
target = frame["corrected_tag"].str.split(by = " ").explode().unique().sort().to_list()
target = dict(zip(target, range(len(target))))
reverse_target = {v : k for k, v in target.items()}

In [13]:
def vectorize(tags: str) -> list[float]:
    """Turn str with tags into list with digit labels.

    Args:
        tags (str): tag text representation.

    Returns:
        list[float]: numeric labels.
    """
    split = tags.split(sep = " ")
    res = np.zeros(len(target))
    for x in split:
        res[target[x]] = 1
    return res.tolist()

In [14]:
frame = frame.with_columns(pl.col("corrected_tag").apply(vectorize).alias("labels"))

  frame = frame.with_columns(pl.col("corrected_tag").apply(vectorize).alias("labels"))


In [15]:
clear_frame = frame.select(
    pl.col("Комментарий"),
    pl.col("Категория"),
    pl.col("corrected_tag"),
    pl.col("labels"),
    pl.col("corrected_tag").str.split(by=" ").alias("temp"),
)
clear_frame = clear_frame.explode(columns=["temp"])

train_df, test_df = train_test_split(
    clear_frame.to_pandas(),
    test_size=cfg["test_size"],
    random_state=1337
    
)

train_df = train_df.drop(columns=["corrected_tag", "temp"])
test_df = test_df.drop(columns=["corrected_tag", "temp"])

train_df = train_df.rename({"Комментарий": "text"})
test_df = test_df.rename({"Комментарий": "text"})

In [16]:
clear_frame.to_pandas()

Unnamed: 0,Комментарий,Категория,corrected_tag,labels,temp
0,Видео лагает,Видео,VP,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]",VP
1,Торгом Бабаян! Спасибо вам большое за помощь в...,ДЗ,H,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",H
2,Спасибо),ДЗ,H,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",H
3,комментарий содержит нерелевантную информацию ...,ДЗ,H E,"[1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",H
4,комментарий содержит нерелевантную информацию ...,ДЗ,H E,"[1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",E
...,...,...,...,...,...
68752,Крайне раздражают некоторые детали)\nНапример ...,,LMS,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",LMS
68753,321.Профессия Бизнес-аналитик\nАналитик данных...,,VC VP,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0]",VC
68754,321.Профессия Бизнес-аналитик\nАналитик данных...,,VC VP,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0]",VP
68755,"Системный аналитик с нуля, 1-14 модули. Не нар...",,VP VC,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0]",VP


In [17]:
clear_frame.to_pandas().to_csv('for_clust.csv', index=False)

In [18]:
data = frame.to_pandas()

In [19]:
train_df, test_df = train_test_split(
    data,
    test_size=cfg["test_size"],
    random_state=1337
    
)

train_df = train_df[['Категория', 'Тег', 'Комментарий', 'labels']]
test_df = test_df[['Категория', 'Тег', 'Комментарий', 'labels']]

train_df = train_df.rename(columns={"Комментарий": "text", "Тег": "tag", "Категория": "old_cat"})
test_df = test_df.rename(columns={"Комментарий": "text", "Тег": "tag", "Категория": "old_cat"})



In [20]:
train_dataset = Dataset.from_pandas(train_df, split="train")
test_dataset = Dataset.from_pandas(test_df, split="test")

In [21]:
tokenizer = AutoTokenizer.from_pretrained(cfg['preprocessing']['tokenizer_name'])


def preprocess_data(sample: dict[str, Any]) -> dict[str, Any]:
    """Encode input text into sequence of tokens.
    Also add corresponding labels.

    Args:
        sample (dict[str, Any]): raw input text.

    Returns:
        dict[str, Any]: transformed sample with tokenized text and labels.
    """
    text = sample["text"]
    encoding = tokenizer(
        text,
        padding=cfg["preprocessing"]['padding'],
        truncation=True,
        max_length=cfg["preprocessing"]['max_length'],
    )
    encoding["labels"] = sample["labels"]
    return encoding

In [22]:
encoded_train = train_dataset.map(
    preprocess_data, batched=True, remove_columns=train_dataset.column_names
)
encoded_test = test_dataset.map(
    preprocess_data, batched=True, remove_columns=test_dataset.column_names
)
encoded_train.set_format("torch")
encoded_test.set_format("torch")

Map:   0%|          | 0/44344 [00:00<?, ? examples/s]

Map:   0%|          | 0/11087 [00:00<?, ? examples/s]

In [23]:
def multi_label_metrics(
    predictions: np.ndarray, labels: np.ndarray, threshold: float = 0.5
) -> dict[str, float]:
    """Compute mltilabel metrics.

    Args:
        predictions (np.ndarray): logits array
        labels (np.ndarray): labels array
        threshold (float, optional): activation threshold. Defaults to 0.5.

    Returns:
        dict[str, float]: metrics dict
    """
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    f1_micro_average = f1_score(y_true=labels, y_pred=y_pred, average="micro")
    roc_auc = roc_auc_score(labels, y_pred, average="micro")
    accuracy = accuracy_score(labels, y_pred)
    metrics = {"f1": f1_micro_average, "roc_auc": roc_auc, "accuracy": accuracy}
    return metrics, labels, y_pred

In [24]:
def compute_metrics(p: EvalPrediction) -> dict[str, float]:
    """Metrics computation wrapper.

    Args:
        p (EvalPrediction): hf model output

    Returns:
        dict[str, float]: metrics dict
    """
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result, _, _ = multi_label_metrics(predictions=preds, labels=p.label_ids)
    return result

In [25]:
cfg = compose_config(overrides=["+model_init.problem_type='multi_label_classification'"])
prepr_conf = cfg["preprocessing"]
train_conf = cfg["training"]
model_conf = cfg["model_init"]



In [26]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("cointegrated/rubert-tiny2", num_labels=7, problem_type="multi_label_classification").to("cuda")



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# trainer.train()

In [75]:
model = AutoModelForSequenceClassification.from_pretrained('./my_awesome_model/checkpoint-20628')
tokenizer = AutoTokenizer.from_pretrained('./my_awesome_model/checkpoint-20628')

In [76]:
training_args = TrainingArguments(

   output_dir="my_awesome_model",
   learning_rate=2e-5,
   per_device_train_batch_size=8,
   per_device_eval_batch_size=8,
   num_train_epochs=3,
   weight_decay=0.01,
   evaluation_strategy="epoch",
   save_strategy="epoch",
   load_best_model_at_end=True,
)

In [77]:
trainer = Trainer(

   model=model,
   args=training_args,
   train_dataset=encoded_train,
    eval_dataset=encoded_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [78]:
train_preds = trainer.predict(encoded_train)
test_preds = trainer.predict(encoded_test)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/5543 [00:00<?, ?it/s]

  0%|          | 0/1386 [00:00<?, ?it/s]

In [79]:
compute_metrics(train_preds)

{'f1': 0.8722556928418258,
 'roc_auc': 0.924828743751808,
 'accuracy': 0.7835332852246076}

In [32]:
new_cat = pd.read_csv('new_cat.csv')

In [33]:
train_df['preds_0'] = train_preds[0][:, 0]
train_df['preds_1'] = train_preds[0][:, 1]
train_df['preds_2'] = train_preds[0][:, 2]
train_df['preds_3'] = train_preds[0][:, 3]
train_df['preds_4'] = train_preds[0][:, 4]
train_df['preds_5'] = train_preds[0][:, 5]
train_df['preds_6'] = train_preds[0][:, 6]

In [34]:
train_df

Unnamed: 0,old_cat,tag,text,labels,preds_0,preds_1,preds_2,preds_3,preds_4,preds_5,preds_6
9377,ДЗ,H3 D,"Николай, приятно познакомиться! Спасибо за нап...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",-0.051570,0.019468,-0.025783,-0.055930,-0.003244,-0.072109,-0.025292
2656,,H3,"Практика, общение с куратором","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",-0.007330,-0.045410,-0.003297,-0.067342,0.013983,-0.035488,0.004150
49520,,LMS VP2,оформление сайт\nподача. \nужасный ведущий.офо...,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]",-0.013820,0.029749,0.008873,-0.085765,0.054634,0.019689,0.003333
23247,Видео,VC3,Понятно.,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]",0.109894,-0.043964,0.120985,0.115053,-0.084684,0.005785,-0.015501
21023,Видео,VP2,"Очень скучная подача материала от спикера, я в...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]",-0.070027,0.066926,-0.040695,-0.057923,0.120768,0.107781,0.001260
...,...,...,...,...,...,...,...,...,...,...,...
21977,ДЗ,H3,Спасибо большое за мотивацию!,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.054710,-0.019325,0.125094,0.028030,-0.039574,0.018109,-0.018581
19623,ДЗ,H3,"Спасибо) было очень много работы, ушла в отпус...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",-0.032402,0.010390,0.059049,0.023550,0.045462,-0.079602,0.016894
9448,ДЗ,H3,Спасибо!),"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.066567,-0.024556,0.091109,0.074026,-0.070817,-0.000701,-0.073659
33628,ДЗ,H3,Всё хорошо!,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.118276,-0.036058,0.072733,0.093432,-0.130428,0.058558,-0.063824


In [35]:
test_df['preds_0'] = test_preds[0][:, 0]
test_df['preds_1'] = test_preds[0][:, 1]
test_df['preds_2'] = test_preds[0][:, 2]
test_df['preds_3'] = test_preds[0][:, 3]
test_df['preds_4'] = test_preds[0][:, 4]
test_df['preds_5'] = test_preds[0][:, 5]
test_df['preds_6'] = test_preds[0][:, 6]

In [36]:
train_df['new_cat'] = new_cat['category']

In [37]:
train_df

Unnamed: 0,old_cat,tag,text,labels,preds_0,preds_1,preds_2,preds_3,preds_4,preds_5,preds_6,new_cat
9377,ДЗ,H3 D,"Николай, приятно познакомиться! Спасибо за нап...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",-0.051570,0.019468,-0.025783,-0.055930,-0.003244,-0.072109,-0.025292,Видео
2656,,H3,"Практика, общение с куратором","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",-0.007330,-0.045410,-0.003297,-0.067342,0.013983,-0.035488,0.004150,Видео
49520,,LMS VP2,оформление сайт\nподача. \nужасный ведущий.офо...,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]",-0.013820,0.029749,0.008873,-0.085765,0.054634,0.019689,0.003333,ДЗ
23247,Видео,VC3,Понятно.,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]",0.109894,-0.043964,0.120985,0.115053,-0.084684,0.005785,-0.015501,Видео
21023,Видео,VP2,"Очень скучная подача материала от спикера, я в...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]",-0.070027,0.066926,-0.040695,-0.057923,0.120768,0.107781,0.001260,Видео
...,...,...,...,...,...,...,...,...,...,...,...,...
21977,ДЗ,H3,Спасибо большое за мотивацию!,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.054710,-0.019325,0.125094,0.028030,-0.039574,0.018109,-0.018581,ДЗ
19623,ДЗ,H3,"Спасибо) было очень много работы, ушла в отпус...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",-0.032402,0.010390,0.059049,0.023550,0.045462,-0.079602,0.016894,
9448,ДЗ,H3,Спасибо!),"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.066567,-0.024556,0.091109,0.074026,-0.070817,-0.000701,-0.073659,
33628,ДЗ,H3,Всё хорошо!,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.118276,-0.036058,0.072733,0.093432,-0.130428,0.058558,-0.063824,Общение с куратором


In [38]:
test_df['new_cat'] = new_cat['category']

In [39]:
train_df

Unnamed: 0,old_cat,tag,text,labels,preds_0,preds_1,preds_2,preds_3,preds_4,preds_5,preds_6,new_cat
9377,ДЗ,H3 D,"Николай, приятно познакомиться! Спасибо за нап...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",-0.051570,0.019468,-0.025783,-0.055930,-0.003244,-0.072109,-0.025292,Видео
2656,,H3,"Практика, общение с куратором","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",-0.007330,-0.045410,-0.003297,-0.067342,0.013983,-0.035488,0.004150,Видео
49520,,LMS VP2,оформление сайт\nподача. \nужасный ведущий.офо...,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]",-0.013820,0.029749,0.008873,-0.085765,0.054634,0.019689,0.003333,ДЗ
23247,Видео,VC3,Понятно.,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]",0.109894,-0.043964,0.120985,0.115053,-0.084684,0.005785,-0.015501,Видео
21023,Видео,VP2,"Очень скучная подача материала от спикера, я в...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]",-0.070027,0.066926,-0.040695,-0.057923,0.120768,0.107781,0.001260,Видео
...,...,...,...,...,...,...,...,...,...,...,...,...
21977,ДЗ,H3,Спасибо большое за мотивацию!,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.054710,-0.019325,0.125094,0.028030,-0.039574,0.018109,-0.018581,ДЗ
19623,ДЗ,H3,"Спасибо) было очень много работы, ушла в отпус...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",-0.032402,0.010390,0.059049,0.023550,0.045462,-0.079602,0.016894,
9448,ДЗ,H3,Спасибо!),"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.066567,-0.024556,0.091109,0.074026,-0.070817,-0.000701,-0.073659,
33628,ДЗ,H3,Всё хорошо!,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.118276,-0.036058,0.072733,0.093432,-0.130428,0.058558,-0.063824,Общение с куратором


In [40]:
pd.get_dummies(train_df['old_cat'])

Unnamed: 0,Видео,ДЗ,Интерфейс платформы,Качество материалов,Лонгрид,Общение с куратором,Тест
9377,False,True,False,False,False,False,False
2656,False,False,False,False,False,False,False
49520,False,False,False,False,False,False,False
23247,True,False,False,False,False,False,False
21023,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...
21977,False,True,False,False,False,False,False
19623,False,True,False,False,False,False,False
9448,False,True,False,False,False,False,False
33628,False,True,False,False,False,False,False


In [41]:
train_df_old = pd.concat([train_df, pd.get_dummies(train_df['old_cat'])], axis=1)

In [42]:
test_df_old = pd.concat([test_df, pd.get_dummies(test_df['old_cat'])], axis=1)

In [43]:
train_df_new = pd.concat([train_df, pd.get_dummies(train_df['new_cat'])], axis=1)
test_df_new = pd.concat([test_df, pd.get_dummies(test_df['new_cat'])], axis=1)

In [44]:
train_df.columns


Index(['old_cat', 'tag', 'text', 'labels', 'preds_0', 'preds_1', 'preds_2',
       'preds_3', 'preds_4', 'preds_5', 'preds_6', 'new_cat'],
      dtype='object')

In [46]:
y_train = train_df_old['labels']
X_train_old = train_df_old.drop(columns=['old_cat', 'tag', 'text', 'labels', 'new_cat'])
X_train_new = train_df_new.drop(columns=['old_cat', 'tag', 'text', 'labels', 'new_cat'])

y_test = test_df_old['labels']
X_test_old = test_df_old.drop(columns=['old_cat', 'tag', 'text', 'labels', 'new_cat'])
X_test_new = test_df_new.drop(columns=['old_cat', 'tag', 'text', 'labels', 'new_cat'])


In [62]:
y_train = pd.DataFrame(data = {
    '0': y_train.apply(lambda x: x[0]),
    '1': y_train.apply(lambda x: x[1]),
    '2': y_train.apply(lambda x: x[2]),
    '3': y_train.apply(lambda x: x[3]),
    '4': y_train.apply(lambda x: x[4]),
    '5': y_train.apply(lambda x: x[5]),
    '6': y_train.apply(lambda x: x[6])
    })

In [63]:
y_test = pd.DataFrame(data = {
    '0': y_test.apply(lambda x: x[0]),
    '1': y_test.apply(lambda x: x[1]),
    '2': y_test.apply(lambda x: x[2]),
    '3': y_test.apply(lambda x: x[3]),
    '4': y_test.apply(lambda x: x[4]),
    '5': y_test.apply(lambda x: x[5]),
    '6': y_test.apply(lambda x: x[6])
    })

In [64]:
import numpy as np
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
clf = MultiOutputClassifier(estimator= LogisticRegression()).fit(X_train_old, y_train)
old_preds = clf.predict(X_test_old)

In [65]:
f1_score(y_true=y_test, y_pred=old_preds, average="micro")

0.767161895559045

In [67]:
clf = MultiOutputClassifier(estimator= LogisticRegression()).fit(X_train_new, y_train)
new_preds = clf.predict(X_test_new)

In [68]:
f1_score(y_true=y_test, y_pred=new_preds, average="micro")

0.5271688587750764

{'f1': 0.20382919426575297,
 'roc_auc': 0.43738097982805785,
 'accuracy': 0.004803355583618979}