In [1]:
from FCA import BinaryFCAClassifier, format_formula_as_str


import pathlib

import numpy as np
import pandas as pd
from tqdm import notebook
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
# Directory holding the pre-split train/test CSV files.
base_path = pathlib.Path('habr_news_dataset')

train_df_path = base_path / 'train_df.csv'
test_df_path = base_path / 'test_df.csv'

# Read both splits with identical (default) parsing options.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_df_path, test_df_path)
)

In [3]:
def gen_corpus(df):
    """Lazily yield one token list per row of ``df``.

    Each value of the ``parsed`` column is split on whitespace, so the
    column is expected to hold strings.

    Args:
        df: table with a ``parsed`` column.

    Yields:
        list[str]: the whitespace-separated tokens of one row, in row order.
    """
    # Iterate the column itself rather than df.iterrows(): no per-row Series
    # is built, and the column has a length, so tqdm can show a total instead
    # of an open-ended counter.
    for text in notebook.tqdm(df['parsed']):
        yield text.split()

In [4]:
# The documents arrive already tokenized (lists of tokens from gen_corpus), so
# the tokenizer is the identity and lowercasing is switched off.
# token_pattern=None states explicitly that the default regex is not used,
# which avoids scikit-learn's "token_pattern will not be used" warning.
vectorizer_count = CountVectorizer(
    tokenizer=lambda doc: doc,
    token_pattern=None,
    lowercase=False,
    min_df=5,
)
vectorizer_tfidf = TfidfTransformer()

# Vocabulary and IDF weights are fitted on the training split only.
gen_corpus_train = gen_corpus(train_df)
sparse_train = vectorizer_count.fit_transform(gen_corpus_train)
X_train = vectorizer_tfidf.fit_transform(sparse_train)

# The test split is only transformed with the already fitted objects.
gen_corpus_test = gen_corpus(test_df)
sparse_test = vectorizer_count.transform(gen_corpus_test)
X_test = vectorizer_tfidf.transform(sparse_test)



0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [5]:
from ast import literal_eval


def _parse_topics(value):
    """Turn one raw ``topics`` cell into a Python object.

    Strings are parsed with ``ast.literal_eval``; a blank string (what
    ``fillna('')`` leaves behind for a missing value) becomes an empty list.
    Anything that is not a string is returned unchanged, so re-running this
    cell on already parsed data is harmless.
    """
    if isinstance(value, str):
        return literal_eval(value) if value.strip() else []
    return value


# Replace missing values in every column with empty strings.
train_df = train_df.fillna('')
test_df = test_df.fillna('')

# The CSV stores each topic collection as its Python literal text; parse it back.
train_df['topics'] = train_df['topics'].apply(_parse_topics)
test_df['topics'] = test_df['topics'].apply(_parse_topics)

In [6]:
# Learn the label set from the training topics only.
multilabel_binarizer = MultiLabelBinarizer().fit(train_df['topics'])
class_names = multilabel_binarizer.classes_

# Indicator matrices: one row per document, one column per entry of class_names.
y_train = multilabel_binarizer.transform(train_df['topics'])
y_test = multilabel_binarizer.transform(test_df['topics'])

In [7]:
# Show the label names; position i here corresponds to column i of y_train / y_test.
class_names

array(['Big Data', 'IT-компании', 'Open source', 'Алгоритмы', 'Гаджеты',
       'Законодательство в IT', 'Информационная безопасность',
       'Искусственный интеллект', 'Машинное обучение',
       'Научно-популярное', 'Облачные сервисы', 'Программирование',
       'Работа с видео', 'Разработка веб-сайтов',
       'Разработка мобильных приложений', 'Разработка под Android',
       'Смартфоны', 'Софт', 'Социальные сети и сообщества',
       'Финансы в IT'], dtype=object)

In [8]:
def eval_metrics(y_true, y_pred):
    """Score predictions against the ground truth.

    Uses scikit-learn's ``f1_score``, ``precision_score`` and
    ``recall_score`` with their default settings.

    Returns:
        tuple: ``(f1, precision, recall)``, in that order.
    """
    scorers = (f1_score, precision_score, recall_score)
    return tuple(scorer(y_true, y_pred) for scorer in scorers)


# One result dict per class; turned into a table afterwards.
results = []
# Whatever index a fitted classifier exposes is passed on to the next fit
# (None on the first pass).
# NOTE(review): presumably this lets later fits skip rebuilding an index that
# depends only on X_train — confirm against the FCA module.
inverse_idx = None
feature_names = vectorizer_count.get_feature_names_out()


# Wrap class_names itself rather than the enumerate object: enumerate has no
# length, so tqdm would otherwise show an open-ended counter with no total.
for class_idx, class_name in enumerate(notebook.tqdm(class_names)):
    # Binary target for this class: the matching column of the label matrices.
    y_train_cls = y_train[:, class_idx]
    y_test_cls = y_test[:, class_idx]

    clf = BinaryFCAClassifier()
    clf.fit(X_train, y_train_cls, inverse_idx=inverse_idx)

    formula = clf.get_formula(feature_names=feature_names)

    y_pred_train = clf.predict(X_train)
    f1_train, precision_train, recall_train = eval_metrics(y_train_cls, y_pred_train)

    y_pred_test = clf.predict(X_test)
    f1_test, precision_test, recall_test = eval_metrics(y_test_cls, y_pred_test)

    results.append({'class_name': class_name,
                    'formula': formula,
                    'f1 train': f1_train,
                    'f1 test': f1_test,
                    'precision train': precision_train,
                    'recall train': recall_train,
                    'precision test': precision_test,
                    'recall test': recall_test})

    # Hand this classifier's index to the next iteration.
    inverse_idx = clf._inverse_idx

0it [00:00, ?it/s]

In [9]:
# Collect the per-class result dicts into a table: one row per class, one column per dict key.
results_df = pd.DataFrame(results)

In [10]:
float_columns = [
    'f1 train', 'f1 test',
    'precision train', 'recall train',
    'precision test', 'recall test',
]

# Keep four decimals for the metric columns.
results_df[float_columns] = results_df[float_columns].round(4)
# Replace each formula object with its string form for display.
results_df['formula'] = results_df['formula'].apply(format_formula_as_str)

In [11]:
from IPython.display import HTML, display

# Render the table as HTML, turning every literal backslash-n sequence in the
# markup into a <br> tag so multi-line cell text breaks visibly.
table_html = results_df.to_html()
display(HTML(table_html.replace("\\n", "<br>")))


Unnamed: 0,class_name,formula,f1 train,f1 test,precision train,recall train,precision test,recall test
0,Big Data,data || аналитик && задача && данные,0.4615,0.4583,0.3803,0.587,0.3667,0.6111
1,IT-компании,компания,0.6188,0.6169,0.5088,0.7897,0.5107,0.7789
2,Open source,github || исходный && код,0.533,0.5545,0.4302,0.7004,0.466,0.6846
3,Алгоритмы,алгоритм && исследователь || алгоритм && ия,0.3789,0.5,0.2951,0.5294,0.4217,0.614
4,Гаджеты,гаджет || устройство && представить,0.4891,0.4595,0.4297,0.5676,0.4198,0.5075
5,Законодательство в IT,суд || рф || закон,0.5256,0.5331,0.4048,0.749,0.4282,0.7061
6,Информационная безопасность,атака || безопасность || злоумышленник || уязвимость,0.5363,0.5458,0.4169,0.7517,0.4281,0.7527
7,Искусственный интеллект,ия || интеллект || искусственный || нейросеть,0.7386,0.7351,0.6447,0.8646,0.6476,0.85
8,Машинное обучение,машинный && обучение || машинный || обучение || нейросеть,0.5677,0.617,0.4202,0.8746,0.4819,0.8571
9,Научно-популярное,учёный || космический,0.6897,0.6818,0.6545,0.7289,0.6176,0.7609
