In [1]:
import pandas as pd
import numpy as np

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

from IPython.display import Image
from IPython.core.display import HTML 

In [2]:
from collections import defaultdict

In [3]:
import re

In [4]:
dataset_ok = pd.read_csv("../../dataset_ok.csv")

In [5]:
dataset_ok.head(10)

Unnamed: 0,text,label
0,"–Ω–∞–µ–±–∞–ª–æ–≤–æ –≤–µ–∫–∞, –¥–ª—è –¥–æ–ª–±–∞—ë–±–æ–≤\n",INSULT
1,–≤—Å—è –¥—É–º–∞ –≤ —Ç–∞–∫–æ–º –∂–µ –ø–æ–ª–æ–∂–µ–Ω–∏–∏üòÅ\n,NORMAL
2,–∞ –≤ –∫–∞–∫–æ–º –º–µ—Å—Ç–µ –º–∞—Å—Å–æ–≤–æ–µ —Å—Ç–æ–ª–∫–Ω–æ–≤–µ–Ω–∏–µ? —à—Ä–∞–π–±–∏–∫...,NORMAL
3,"–∑–Ω–∞—á–∏—Ç –ª–∏ —ç—Ç–æ, —á—Ç–æ –∫–æ–Ω—Ç—Ä–æ–ª—å –∑–∞ –≤—ã–≤–æ–∑–æ–º –∫—Ä—É–ø–Ω–æ–≥...",NORMAL
4,–≤–∞–º –Ω–µ –Ω—É–∂–µ–Ω —â–µ–Ω–æ—á–µ–∫? –æ—á–µ–Ω—å —Ö–æ—Ä–æ—à–∏–µ üê∂ü•∞\n,NORMAL
5,"–æ–Ω, —Ö–æ—Ç—å –∂–∏–≤–æ–π –æ—Å—Ç–∞–ª—Å—è??.\n",NORMAL
6,–±—ã–ª–æ –¥–µ–ª–æ.\n,NORMAL
7,"—Å —Ö—Ä–∞–Ω–µ–Ω–∏–µ–º –Ω–µ—Ç –ø—Ä–æ–±–ª–µ–º, –∏–ª–∏ —Ç–æ–ª—å–∫–æ –≤ —Ö–æ–ª–æ–¥–∏–ª—å...",NORMAL
8,–ø–æ–ª–Ω–æ—Å—Ç—å—é –≤–∞—Å –ø–æ–¥–¥–µ—Ä–∂–∏–≤–∞—é\n,NORMAL
9,—ç—Ç–æ—Ç —Ä–µ—Ü–µ–ø—Ç –Ω–µ —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤—É–µ—Ç —Ñ–æ—Ç–æ. –æ–æ–æ—Ö –∏ –Ω–∞–º—É...,NORMAL


# –£–ø—Ä–æ—â—ë–Ω–Ω—ã–π BPE

In [4]:
with open("../../lenta.txt", encoding="UTF-8") as file:
    corpus_lenta = file.read()

In [36]:
def count_pairs(corpus_joined: list):
    corpus_split = corpus_joined.split(" ")
    counts = defaultdict()
    for i in range(len(corpus_split) - 1):
        if (corpus_split[i], corpus_split[i+1]) in counts:
            counts[corpus_split[i], corpus_split[i+1]] += 1
        else:
            counts[corpus_split[i], corpus_split[i+1]] = 1
    return counts

def bpe_initialize(corpus: str, n_iterations: int, k_pairs: int):
    merged_vocab = set()
    punct = re.compile(r'[ .,:!;+=?""'']')
    corpus = re.sub(punct, "", corpus)
    corpus_symbols = [sym for sym in corpus]
    corpus_joined = " ".join(corpus_symbols)
    for n in range(n_iterations):
        pairs = count_pairs(corpus_joined)
        top_k = sorted(list(pairs.keys()), key=pairs.get, reverse=True)[:k_pairs]
        for pair in top_k:
            new_pair = pair[0] + pair[1]
            corpus_joined = re.sub(f"{pair[0]} {pair[1]}", new_pair, corpus_joined)
            merged_vocab.add(new_pair)
    return merged_vocab

In [39]:
print(list(bpe_initialize(corpus_lenta, 2, 50)))

['–∞–Ω', '–º–∞', '—Å—Ç–≤', '–≤–∞', '–∞–º', '—Ä—É', '—Å–æ', '–ª–∏', '—Ü–∏', '–Ω—ã—Ö', '—Ç–æ', '–µ–π', '–æ–º', '–≤–∏', '—á–∞', '—Ç–∞', '–æ–∂', '–µ—Ä', '–≤—ã', '–∏–ª', '—É—é', '–∞–µ—Ç', '–æ—Ç', '–∞—è', '—â–∏', '–Ω–æ–π', '—Ä–µ', '–æ—Ä', '—Å—Ç', '–∫–∏', '–µ–¥', '–æ—Å', '–∏–Ω', '–∞–≤', '–Ω–∞', '–Ω—ã', '–∫–∞', '–µ–º', '—É–¥', '–µ—á', '–ø—Ä', '–∏—Ö', '–∏–º', '–¥–∞', '–ø–∞', '—É–ø', '–æ–Ω', '–∞—Ä', '–≥–æ', '–ª–∞', '–æ–≥', '–¥–∏', '–Ω–∏', '–∫—É', '–±—ã', '–æ–±', '–∑–∞', '–º–∏', '–µ–Ω', '–µ–≤', '–µ—Å', '—Å–∏', '–µ—Ç', '–∏—Ç', '–∞—Ç', '—è–≤', '—É—á', '–ø–æ', '–ª—å', '–æ–¥', '–∏–∏', '–∏—Å', '–ª—è', '–∏–∑', '–Ω–µ', '–µ–ª', '–ª–µ', '—Ä–æ', '–∞—Å', '—Ä–∏', '–æ–±—â', '—Å—è', '—Ä–∞', '—Ü–∏–∏', '—É–∂', '–∫–æ', '–∏–π', '–æ–æ–±', '—Ç–µ', '–∞–ª', '–∞–∫', '—Å–∫–∏', '–æ–ª', '–≤–æ', '–¥–µ', '—Å–∫', '–∞–∑', '—Ç–∏', '–æ–≤', '–Ω–æ']


### –ö–æ–º–º–µ–Ω—Ç–∞—Ä–∏–π: —Å—Ä–µ–¥–∏ —á–∞—Å—Ç–æ—Ç–Ω—ã—Ö –ø–∞—Ä –º–Ω–æ–≥–æ —Ç–∞–∫–∏—Ö, –∫–æ—Ç–æ—Ä—ã–µ —Å–æ–æ—Ç–≤–µ—Å—Ç–≤—É—é—Ç —Ä–∞—Å–ø—Ä–æ—Å—Ç—Ä–∞–Ω—ë–Ω–Ω—ã–º –º–æ—Ä—Ñ–µ–º–∞–º: "—É—é", "—Å—è", "–µ—Ç". –¢–∞–∫–∂–µ –≤—Å—Ç—Ä–µ—á–∞—é—Ç—Å—è —Ñ—Ä–∞–≥–º–µ–Ω—Ç—ã —á–∞—Å—Ç–æ—Ç–Ω—ã—Ö —Å–ª–æ–≤, –≤—ã—Ö–æ–¥—è—â–∏–µ –∑–∞ –ø—Ä–µ–¥–µ–ª—ã –æ—Ç–¥–µ–ª—å–Ω–æ–π –º–æ—Ä—Ñ–µ–º—ã: "—Ü–∏–∏", "–Ω–æ–π", "–Ω—ã—Ö", "–∞–µ—Ç".

In [38]:
print(list(bpe_initialize(corpus_lenta, 5, 20)))

['–∞–Ω', '–±—É', '—Ö–æ', '–º–∞', '–≤–∞', '—Ä—É', '—Å–æ', '—Å–∫–æ', '–¥–æ', '–ø–æ–¥', '–ª–∏', '—Ü–∏', '–≥–∞', '–Ω—ã—Ö', '—Ç–æ', '–µ–π', '–≤–∏', '—á–∞', '–ø—Ä–∏', '—Ç–∞', '–µ—Ä', '–≤—ã', '–∏–ª', '–∫–æ–º', '–æ—Ç', '–ø—É', '–ª–æ', '–Ω–æ–π', '—Ä–µ', '–æ—Ä', '—Å—Ç', '–∫–∏', '–µ–¥', '–æ—Å', '—Ç—å', '–∏–Ω', '–Ω–æ–≥–æ', '–Ω–∞', '–Ω—ã', '–¥—É', '–∫–∞', '–µ–º', '–µ—á', '–ø—Ä', '–∏—Ö', '–¥–∞', '–ø–∞', '–±–æ', '—ç—Ç–æ', '–≥–æ', '–ª–∞', '–¥–∏', '–Ω–∏', '–∫—É', '–±—ã', '–æ–±', '–∑–∞', '–º–∏', '–µ–Ω', '–µ–≤', '–µ—Å', '—Å–∏', '–µ—Ç', '–∏—Ç', '–º–æ', '—è–≤', '–ø–æ', '–ª—å', '–∏–∏', '–∏—Å', '–ª—è', '–∏–∑', '–Ω–µ', '–æ–≤', '–µ–ª', '–ª–µ', '—Ä–æ', '—Ä–∏', '–æ–±—â', '—Å—è', '—Ä–∞', '—É–∂', '–∫–æ', '–∏–π', '—Ç—ã', '—Ç–µ', '–∞–ª', '—á—Ç–æ', '–Ω—É', '–º—É', '—Å–∞', '–∞–∫', '–ª—É', '—Å–∫–∏', '—Ä–∞–∑', '–≤–æ', '–¥–µ', '—Ç–∏', '—Å—Ç–≤', '–Ω–æ']


### –ö–æ–º–º–µ–Ω—Ç–∞—Ä–∏–π: –∫—Ä–æ–º–µ –º–æ—Ä—Ñ–µ–º –∏ —Ñ—Ä–∞–≥–º–µ–Ω—Ç–æ–≤ —Å–ª–æ–≤, –≤ —Å–ø–∏—Å–∫–µ –ø–æ—è–≤–∏–ª–∏—Å—å –æ—Ç–¥–µ–ª—å–Ω—ã–µ —á–∞—Å—Ç–æ—Ç–Ω—ã–µ —Å–ª—É–∂–µ–±–Ω—ã–µ —Å–ª–æ–≤–∞: "—á—Ç–æ", "—Ä–∞–∑", "—ç—Ç–æ". –°–∞–º—ã–π –¥–ª–∏–Ω–Ω—ã–π —Ç–æ–∫–µ–Ω - "–Ω–æ–≥–æ" —Å 4 —Å–∏–º–≤–æ–ª–∞–º–∏.

In [40]:
print(list(bpe_initialize(corpus_lenta, 10, 20)))

['–∞–Ω', '–≥–ª–∞', '—Å–∫–æ–≥–æ', '–ø—Ä–∞–≤', '—Å–∫–æ–π', '–±—É', '–¥–ª—è', '—Ö–æ', '—Ç—É', '–º–∞', '–≤–∞', '–ª—é', '–µ–∂', '—Ä—É', '—Å–µ', '—Å–≤–æ', '–µ—Ç—Å—è', '–µ–≥–æ', '–ø–æ–ª', '—Å–æ', '—Å–∫–æ', '–¥–æ', '–ø–æ–¥', '–ª–∏', '—Ü–∏', '–≥–∞', '—Å—É', '–Ω–æ–º', '–Ω—ã—Ö', '–∂–∏', '—Ç–æ', '–ö–∞–∫', '–º–∏–ª–ª–∏', '–Ω–∏–∫', '–µ–π', '–º–æ–∂', '–≤–∏', '–ø–∏', '—á–∞', '–ø—Ä–∏', '—Ç–∞', '–≥—Ä–∞', '–Ω–∞—è', '–µ—Ä', '–≤—É', '–≤—ã', '–∏–ª', '–¥–æ–ª', '–±–∏', '–∫–æ–º', '–∑–∏', '–ß–µ—á', '–∞–µ—Ç', '–æ—Ç', '–ø—É', '–ª–æ', '—Å–∫–∞', '–ê–ù', '–Ω–æ–π', '—Ä–µ', '–æ—Ä', '—Å—Ç', '00', '–∑—ã', '–∫–∏', '—Ç–∞–∫–∂', '–ù–∞', '–µ–¥', '–±–ª–∏', '–±–∞', '–æ—Å', '—Ç—å', '–∏–Ω', '—Ü–∏—è', '–Ω–æ–≥–æ', '—Ä—è', '–∞–≤', '–Ω–∞', '–Ω—ã', '–¥—É', '–∫–æ–π', '–∫–∞', '–µ–º', '–≤—è', '–≤–µ—Ä', '–µ—á', '–ø—Ä', '–∏—Ö', '–º–µ', '–¥–∞', '–ø–∞', '–Ω—è', '—à–µ–Ω–∏', '–±–æ', '—Ñ–∏', '–†–æ—Å—Å–∏–∏', '–Ω—ã–º', '—Å–æ–æ–±—â', '–ò–Ω', '—ç—Ç–æ', '—Å–≤', '—ç—Ç–æ–º', '–ü–æ', '–æ–Ω', '—Ñ–æ—Ä', '–∞—Ä', '–≥–æ', '–ª–∞', '–¥–∏', '—á–∏', '–Ω–∏', '–ò

### –ø–æ—è–≤–∏–ª–∏—Å—å –¥–ª–∏–Ω–Ω—ã–µ –æ—Ç—Ä—ã–≤–∫–∏ —á–∞—Å—Ç–æ—Ç–Ω—ã—Ö —Å–ª–æ–≤: "—á–µ–ª–æ–≤", "–ø—Ä–µ–∑–∏–¥–µ–Ω", "–†–æ—Å—Å–∏–∏". –°–∞–º—ã–π –¥–ª–∏–Ω–Ω—ã–π —Ç–æ–∫–µ–Ω - "—Å–æ–æ–±—â–∞–µ—Ç".

In [48]:
final_vocab = bpe_initialize(corpus_lenta, 10, 50)

In [42]:
text = """–° –ø–æ–º–æ—â—å—é —Å–∞–Ω–∫—Ü–∏–π –ø—Ä–æ—Ç–∏–≤ —Ä–æ—Å—Å–∏–π—Å–∫–∏—Ö –æ—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏–π –∞–¥–º–∏–Ω–∏—Å—Ç—Ä–∞—Ü–∏—è –°–®–ê 
–ø—ã—Ç–∞–µ—Ç—Å—è ¬´–ø–Ω—É—Ç—å –∏ —Ç–∞–∫ –Ω–∞—Ö–æ–¥—è—â–∏–µ—Å—è –≤ –ø–ª–æ—Ö–æ–π —Ñ–æ—Ä–º–µ —Ä–æ—Å—Å–∏–π—Å–∫–æ-–∞–º–µ—Ä–∏–∫–∞–Ω—Å–∫–∏–µ –æ—Ç–Ω–æ—à–µ–Ω–∏—è¬ª, 
–∑–∞—è–≤–∏–ª –ø—Ä–µ—Å—Å-—Å–µ–∫—Ä–µ—Ç–∞—Ä—å –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç–∞ –†–æ—Å—Å–∏–∏ –î–º–∏—Ç—Ä–∏–π –ü–µ—Å–∫–æ–≤, –ø–µ—Ä–µ–¥–∞–µ—Ç –∫–æ—Ä—Ä–µ—Å–ø–æ–Ω–¥–µ–Ω—Ç –†–ë–ö. 
¬´–≠—Ç–æ –æ—á–µ—Ä–µ–¥–Ω–æ–π –≤—Ä–∞–∂–¥–µ–±–Ω—ã–π —à–∞–≥ –ø–æ –æ—Ç–Ω–æ—à–µ–Ω–∏—é –∫ –†–æ—Å—Å–∏–∏. –ú–æ–∂–µ–º —Ç–æ–ª—å–∫–æ —Å–æ–∂–∞–ª–µ—Ç—å, 
—á—Ç–æ –æ—á–µ—Ä–µ–¥–Ω–∞—è —É—Ö–æ–¥—è—â–∞—è –∞–¥–º–∏–Ω–∏—Å—Ç—Ä–∞—Ü–∏—è –°–®–ê –ø—Ä–µ–¥–ø–æ—á–∏—Ç–∞–µ—Ç –ø–Ω—É—Ç—å –∏ —Ç–∞–∫ –Ω–∞—Ö–æ–¥—è—â–∏–µ—Å—è –≤ –ø–ª–æ—Ö–æ–π —Ñ–æ—Ä–º–µ 
—Ä–æ—Å—Å–∏–π—Å–∫–æ-–∞–º–µ—Ä–∏–∫–∞–Ω—Å–∫–∏–µ –æ—Ç–Ω–æ—à–µ–Ω–∏—è¬ª, ‚Äî —Å–∫–∞–∑–∞–ª –ü–µ—Å–∫–æ–≤."""

In [46]:
def bpe_tokenize(text: str, vocab: set):
    punct = re.compile(r'[ .,:!;+=?""'']')
    bag_of_tokens = []
    text = re.sub(punct, "", text)
    order = sorted(list(vocab), key=len, reverse=True)
    for token in order:
        if token in text:
            bag_of_tokens.append(token)
    return bag_of_tokens

In [49]:
print(bpe_tokenize(text, final_vocab))

['–º–∏–Ω–∏—Å—Ç—Ä–∞', '–ø—Ä–æ—Ç–∏–≤', '–µ—Ä–∏–∫–∞–Ω', '—Ä–æ—Å—Å–∏–π', '–æ—Å—Å–∏–∏', '–ø—Ä–µ—Å—Å', '–ø–æ–º–æ—â', '—Å–∫–∏–µ', '–∏–¥–µ–Ω', '–µ–Ω–∏—è', '–µ—Ç—Å—è', '—à–µ–Ω–∏', '—á–µ—Ä', '–∞–µ—Ç', '–µ–Ω—Ç', '–ø–µ—Ä', '—á—Ç–æ', '—Å–∫–∏', '–º–µ—Ä', '–†–æ—Å', '—Ö–æ–¥', '—Ñ–æ—Ä', '–Ω–æ–π', '—Ü–∏—è', '–æ—Ä–≥', '—Ä–∞–∂', '–Ω—ã–π', '—Å–æ', '–µ—Ä', '–æ–π', '–æ—Ç', '—â–∏', '–∫–∞', '–º–µ', '—á–∏', '–µ—Å', '—Å–∏', '–∏—Ç', '–∏—Å', '–µ–∫', '—Å—è', '—Ä–∞', '–Ω—É', '–∞–∫', '–∞–º', '–≥–∞', '—Ç–æ', '—Ç–∞', '—à–µ', '–æ–∂', '–∂–µ', '–ª–æ', '–∞–≥', '—Å—Ç', '–∏—é', '—Å-', '–∏–Ω', '–Ω–∞', '–∏—Ö', '–¥–∞', '–∑–∞', '–º–∏', '–ø–æ', '–æ–¥', '–∏–∏', '–∏–∑', '—Ä–∏', '–æ—à', '–∞–ª', '—Ç–∏', '–∞–Ω', '—å—é', '–æ–º', '–≤–∏', '–ø–µ', '–∞–¥', '–∏–ª', '–∞—è', '–æ—Ä', '—É—Ç', '—Ç—å', '–Ω—ã', '–ø—Ä', '–ø—ã', '–æ–Ω', '–µ–Ω', '–µ—Ç', '–µ–∑', '—è–≤', '—Ä–æ', '—É—Ö', '–∏–π', '–∞—Ö', '–æ—â', '—Å–∫', '–∞–∑', '–æ–≤', '—Å–µ', '–®–ê', '—Ü–∏', '–∑–∏', '–∏–≤', '—Ä–µ', '–∫–∏', '–µ–¥', '–æ—Å', '–µ–º', '–∞—Ä', '–Ω–∏', '–æ—Ö', '–ª—å', '–æ—á', 

### –ö–æ–º–º–µ–Ω—Ç–∞—Ä–∏–π: –ø—Ä–∏ –ø—Ä–∏–º–µ–Ω–µ–Ω–∏–∏ BPE –±–µ–∑ –ø—Ä–µ–¥–≤–∞—Ä–∏—Ç–µ–ª—å–Ω–æ–π –ø–æ—Å–ª–æ–≤–Ω–æ–π —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏–∏ –º–Ω–æ–≥–æ –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–∏ –ø–æ—Ç–µ–Ω—Ü–∏–∞–ª—å–Ω–æ —Ç–µ—Ä—è–µ—Ç—Å—è

# –†–µ–∞–ª–∏–∑–∞—Ü–∏—è TF-IDF

In [6]:
from tokenizers import CharBPETokenizer, Tokenizer

In [7]:
from collections import Counter

In [51]:
dataset_ok['text'].to_csv('corpus_new.txt', index=None)

In [8]:
tok_sub = CharBPETokenizer()
tok_sub.train('corpus_new.txt', vocab_size=6000, min_frequency=10,)

In [9]:
tok_sub.encode(dataset_ok.loc[1, 'text']).tokens

['–≤—Å—è</w>',
 '–¥—É',
 '–º–∞</w>',
 '–≤</w>',
 '—Ç–∞–∫–æ–º</w>',
 '–∂–µ</w>',
 '–ø–æ–ª–æ',
 '–∂–µ–Ω–∏',
 '–∏',
 'üòÅ</w>']

In [10]:
from scipy.sparse import lil_matrix

### –ü–æ–¥—Å—á—ë—Ç TF –∏ DF

In [11]:
tokenized_texts = [Counter(tok_sub.encode(dataset_ok.loc[i, 'text']).tokens) for i in range(dataset_ok.shape[0])]

In [13]:
voc = tok_sub.get_vocab()

In [14]:
document_frequency = dict()
tokens = list(voc.keys())
for term in tokens:
    for count in tokenized_texts:
        if term not in count:
            continue
        if term in document_frequency:
            document_frequency[term] += 1
        else:
            document_frequency[term] = 1

In [15]:
document_frequency['–≤</w>']

16441

### –ó–∞–ø–æ–ª–Ω–µ–Ω–∏–µ –º–∞—Ç—Ä–∏—Ü—ã

In [16]:
tf_idf_matrix = lil_matrix((dataset_ok.shape[0], 6000), dtype=np.float32)

In [17]:
Image(url="https://miro.medium.com/max/3604/1*qQgnyPLDIkUmeZKN2_ZWbQ.png",
     width=500, height=500)

### –ü–æ–ª—å–∑—É–µ–º—Å—è –∏–Ω–¥–µ–∫—Å–∞–º–∏ –≤ —Å–ª–æ–≤–∞—Ä–µ –≤ –∫–∞—á–µ—Å—Ç–≤–µ –∏–Ω–¥–µ–∫—Å–æ–≤ –∫–æ–ª–æ–Ω–æ–∫

In [18]:
n_docs = dataset_ok.shape[0] + 1 # for computing idf
for index in range(len(tokenized_texts)):
    count = tokenized_texts[index]
    for item in count.keys():
        col_index = voc[item]
        tf = count[item] / sum(count.values())
        idf = n_docs / (document_frequency[item] + 1)
        result = tf * np.log1p(idf)
        tf_idf_matrix[index, col_index] = result

### –û–±—É—á–µ–Ω–∏–µ –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ç–æ—Ä–∞

In [19]:
y = np.array(dataset_ok.label)

In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [94]:
clf_log = LogisticRegression(C=1, max_iter=120, n_jobs=3)
cross_val_score(clf_log, tf_idf_matrix, y, scoring="f1_macro", cv=StratifiedKFold(n_splits=5, shuffle=True))

array([0.77072889, 0.77983193, 0.79112207, 0.77533286, 0.76180761])

In [22]:
clf_NB = MultinomialNB()
cross_val_score(clf_NB, tf_idf_matrix, y, scoring="f1_macro", cv=StratifiedKFold(n_splits=5, shuffle=True))

array([0.58090091, 0.56539705, 0.54981246, 0.59286224, 0.59702563])

### –ö–æ–º–º–µ–Ω—Ç–∞—Ä–∏–π: –ø–æ—Å–∫–æ–ª—å–∫—É –∏—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è –º–µ—Ç—Ä–∏–∫–∞ "f1_macro", —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã –≤—ã–≥–ª—è–¥—è—Ç –ø–ª–æ—Ö–∏–º–∏. –¢–µ–º –Ω–µ –º–µ–Ω–µ–µ –ª–æ–≥–∏—Å—Ç–∏—á–µ—Å–∫–∞—è —Ä–µ–≥—Ä–µ—Å—Å–∏—è –ø—Ä–µ–≤–æ—Å—Ö–æ–¥–∏—Ç –ø–æ —Ä–µ–∑—É–ª—å—Ç–∞—Ç—É –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ç–æ—Ä, –æ–±—É—á–µ–Ω–Ω—ã–π –Ω–∞ —Å–µ–º–∏–Ω–∞—Ä–µ, –∞ –Ω–∞–∏–≤–Ω—ã–π –±–∞–π–µ—Å –ø–æ–∫–∞–∑—ã–≤–∞–µ—Ç —Å—Ö–æ–¥–Ω—ã–π —Å –Ω–∏–º –ø–æ –∫–∞—á–µ—Å—Ç–≤—É —Ä–µ–∑—É–ª—å—Ç–∞—Ç.

In [22]:
cross_val_score(clf, X, y, scoring="f1_macro", cv=StratifiedKFold(n_splits=5, shuffle=True))

array([0.50043239, 0.50446208, 0.51254333, 0.50846296, 0.51108045])