## Применение логистической регрессии к задаче классификации текстов

В этом ноутбуке мы попробуем проверить некоторые гипотезы относительно простой модели классификации - логистическая регрессия. Выдвигаем следующие гипотезы:

1. Качество модели можно увеличить за счет использования более эффективного метода перебора парметров - optuna

2. Качество модели можно увеличить, если обучить 2 модели: на всей выборке и на маленьких классах

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
import scipy as sp
from scipy.sparse import csr_matrix
import random
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE

from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/lentanewsmodel/train_200k_texts.csv
/kaggle/input/lentanewsmodel/ytrain_200k_texts.csv
/kaggle/input/lentanewsmodel/test_50k_texts.csv
/kaggle/input/lentanewsmodel/ytest_50k_texts.csv
/kaggle/input/lenta-news/lenta_news_with_lemmas2.csv


In [2]:
# тестовый датасет был сделан в другом ноутбуке
df_train = pd.read_csv('/kaggle/input/lentanewsmodel/train_200k_texts.csv')
df_train.shape

(207556, 15)

In [3]:
df_train = df_train[~(df_train['topic'].isin(['Библиотека', 'Оружие']))]
df_train.reset_index(drop=True, inplace=True)

dict_topic = dict(zip(df_train.topic, df_train.topic_le))
dict_topic = {k: v for k, v in sorted(dict_topic.items(), key=lambda item: item[1])}

In [4]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,date,url,topic,tags,text_str,topic_le,title_lemmas,year,month,day,date_enc,seanos,day_of_week,dummy_weekday
0,337202,2010-07-12,https://lenta.ru/news/2010/07/12/explosion/,Россия,Все,запад москва квартира этажный дом происходить ...,20,дом запад москва происходить взрыв газ,2010,7,12,3966,3,0,1
1,460023,2013-03-29,https://lenta.ru/news/2013/03/29/samogon/,Бывший СССР,,украина предлагать вводить уголовный ответстве...,3,украинский самогонщик пригрозить уголовный отв...,2013,3,29,4958,2,4,1
2,340676,2010-08-09,https://lenta.ru/news/2010/08/09/aliev/,Интернет и СМИ,Все,улица москва близкий время появляться социальн...,7,андрей тихонов сергей игнашевич рекламировать ...,2010,8,9,3993,3,0,1
3,88483,2004-02-28,https://lenta.ru/news/2004/02/28/chita/,Россия,Все,ход расследование взрыв читинский кафе онон за...,20,чита задерживать предполагать виновник взрыв кафе,2004,2,28,1642,1,5,0
4,939735,2021-06-18,https://lenta.ru/news/2021/06/18/iron/,Ценности,Явления,эксперт раскрывать альтернативный способ погла...,25,эксперт раскрывать способ погладить одежда утюг,2021,6,18,7957,3,4,1


In [5]:
df_test = pd.read_csv('/kaggle/input/lentanewsmodel/test_50k_texts.csv')
df_test = df_test[~(df_test['topic'].isin(['Библиотека', 'Оружие']))]
df_test.reset_index(drop=True, inplace=True)

In [6]:
y_train = df_train['topic_le']

In [7]:
%%time

vec = TfidfVectorizer(ngram_range=(1, 1), max_df=0.8)
bow = vec.fit_transform(df_train['text_str'])

CPU times: user 30.1 s, sys: 679 ms, total: 30.8 s
Wall time: 30.9 s


In [8]:
bow.shape

(207543, 291274)

Также добавим еще 2 признака: год публикации статьи и сезон, оба параметра закодируем при помощи OneHotEncoding

In [9]:
one_hot_enc = OneHotEncoder(drop='first', handle_unknown='ignore')

arr_enc = one_hot_enc.fit_transform(df_train[['seanos', 'year']])

X = sp.sparse.hstack((bow, arr_enc))
X.shape

(207543, 291300)

In [10]:
#  отбросим стат тестом
select = SelectKBest(chi2, k=100_000)
X = select.fit_transform(X, y_train)

In [12]:
bow_test = vec.transform(df_test['text_str'])

dummies = one_hot_enc.transform(df_test[['seanos', 'year']])
X_test = sp.sparse.hstack((bow_test, dummies))
X_test = select.transform(X_test)
X_test.shape

(41510, 100000)

### Гипотеза: optuna может улучшить качество модели, за счет более эффективного перебора параметров

In [18]:
import optuna


def objective(trial):
    params = {
        'C' : trial.suggest_float("C", 1e-2, 1e2, log=True),
        'warm_start' : trial.suggest_categorical('warm_start' , [True, False]),
        'class_weight' : trial.suggest_categorical('class_weight' , [None, 'balanced']),
        'solver' : trial.suggest_categorical('solver' , ['lbfgs', 'sag']),
        }
    model = LogisticRegression(**params, random_state=42, max_iter=500, n_jobs=-1)
    model.fit(X, y_train)
    y_pred = model.predict(X)
    f1_train = f1_score(y_train, y_pred, average='weighted')
    return f1_train


optuna.logging.set_verbosity(optuna.logging.INFO)
study = optuna.create_study(direction = 'maximize', study_name = 'log')
study.optimize(objective, show_progress_bar=True, n_trials=15)



[32m[I 2023-03-05 08:42:47,926][0m A new study created in memory with name: log[0m
  self._init_valid()


  0%|          | 0/15 [00:00<?, ?it/s]

[32m[I 2023-03-05 08:43:47,542][0m Trial 0 finished with value: 0.6769665344767493 and parameters: {'C': 0.016950076709611038, 'warm_start': True, 'class_weight': None, 'solver': 'sag'}. Best is trial 0 with value: 0.6769665344767493.[0m
[32m[I 2023-03-05 08:58:38,112][0m Trial 1 finished with value: 0.9399759803676567 and parameters: {'C': 4.879178098779043, 'warm_start': False, 'class_weight': None, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.9399759803676567.[0m


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[32m[I 2023-03-05 09:30:14,251][0m Trial 2 finished with value: 0.6044411749755323 and parameters: {'C': 0.34952030256707295, 'warm_start': True, 'class_weight': 'balanced', 'solver': 'sag'}. Best is trial 1 with value: 0.9399759803676567.[0m




[32m[I 2023-03-05 10:03:45,283][0m Trial 3 finished with value: 0.8209022206862999 and parameters: {'C': 65.50234215199846, 'warm_start': False, 'class_weight': 'balanced', 'solver': 'sag'}. Best is trial 1 with value: 0.9399759803676567.[0m




[32m[I 2023-03-05 10:37:41,997][0m Trial 4 finished with value: 0.6308065162459621 and parameters: {'C': 0.01673270924367785, 'warm_start': False, 'class_weight': 'balanced', 'solver': 'sag'}. Best is trial 1 with value: 0.9399759803676567.[0m




[32m[I 2023-03-05 11:11:30,890][0m Trial 5 finished with value: 0.9909723524829019 and parameters: {'C': 44.9140134319888, 'warm_start': True, 'class_weight': None, 'solver': 'sag'}. Best is trial 5 with value: 0.9909723524829019.[0m
[32m[I 2023-03-05 11:26:29,931][0m Trial 6 finished with value: 0.8956526964180908 and parameters: {'C': 1.3996730056989883, 'warm_start': True, 'class_weight': None, 'solver': 'lbfgs'}. Best is trial 5 with value: 0.9909723524829019.[0m
[32m[I 2023-03-05 11:41:16,491][0m Trial 7 finished with value: 0.9864596751468523 and parameters: {'C': 49.18232354053548, 'warm_start': False, 'class_weight': None, 'solver': 'lbfgs'}. Best is trial 5 with value: 0.9909723524829019.[0m
[32m[I 2023-03-05 11:55:17,443][0m Trial 8 finished with value: 0.9674999758380345 and parameters: {'C': 43.66437097415465, 'warm_start': True, 'class_weight': 'balanced', 'solver': 'lbfgs'}. Best is trial 5 with value: 0.9909723524829019.[0m


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

[32m[I 2023-03-05 12:28:09,270][0m Trial 9 finished with value: 0.9633285023276706 and parameters: {'C': 10.584428228469674, 'warm_start': False, 'class_weight': None, 'solver': 'sag'}. Best is trial 5 with value: 0.9909723524829019.[0m
[32m[I 2023-03-05 12:30:59,515][0m Trial 10 finished with value: 0.8212540403370832 and parameters: {'C': 0.15462167743438468, 'warm_start': True, 'class_weight': None, 'solver': 'sag'}. Best is trial 5 with value: 0.9909723524829019.[0m
[32m[I 2023-03-05 12:46:21,889][0m Trial 11 finished with value: 0.9930015075139178 and parameters: {'C': 95.13124511469547, 'warm_start': False, 'class_weight': None, 'solver': 'lbfgs'}. Best is trial 11 with value: 0.9930015075139178.[0m
[32m[I 2023-03-05 13:00:52,456][0m Trial 12 finished with value: 0.9681110423822213 and parameters: {'C': 12.72260456017527, 'warm_start': True, 'class_weight': None, 'solver': 'lbfgs'}. Best is trial 11 with value: 0.9930015075139178.[0m
[32m[I 2023-03-05 13:15:32,001][

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [13]:
%%time

params = study.best_params

clf = LogisticRegression(random_state=42, max_iter=500, n_jobs=-1, **params)
y_train = df_train['topic_le']

X = csr_matrix(X)

clf.fit(X, y_train)

CPU times: user 208 ms, sys: 314 ms, total: 522 ms
Wall time: 14min 13s


LogisticRegression(C=95.13124511469547, max_iter=500, n_jobs=-1,
                   random_state=42)

In [14]:
y_test_pred = clf.predict(X_test)
f1_score(df_test['topic_le'], y_test_pred, average='micro')

0.8235124066490003

In [15]:
f1_score(df_test['topic_le'], y_test_pred, average='weighted')

0.8232664752930992

In [16]:
print(classification_report(df_test['topic_le'], y_test_pred, target_names=dict_topic))

                   precision    recall  f1-score   support

   69-я параллель       0.84      0.60      0.70       121
           Бизнес       0.69      0.54      0.61       268
      Бывший СССР       0.84      0.84      0.84      3376
              Дом       0.87      0.80      0.83       797
    Забота о себе       0.87      0.84      0.86       114
         Из жизни       0.72      0.68      0.70      1613
   Интернет и СМИ       0.77      0.74      0.75      2333
             Крым       0.75      0.50      0.60        24
    Культпросвет        0.60      0.25      0.35        12
         Культура       0.87      0.87      0.87      2584
          Легпром       0.00      0.00      0.00         4
              Мир       0.79      0.83      0.81      7048
       Моя страна       0.88      0.83      0.86       221
  Наука и техника       0.87      0.86      0.87      2722
       Нацпроекты       0.95      0.92      0.93       170
      Путешествия       0.87      0.84      0.85       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


По сравнению с предыдущим ноутбуком качество даже ухудшилось (было 0,844)

### Гипотеза: Если отдельно обучить модель на маленьких классах и остальных, качество предсказания на маленьких классах - улучшится

In [13]:
df_train['topic_le_small'] = df_train['topic_le']
df_train.loc[~df_train['topic'].isin(['Легпром', 'Крым', 'Культпросвет ']),'topic_le_small'] = 42
y_train_sm = df_train['topic_le_small']

In [14]:
y_train_sm.value_counts()

42    207341
8        120
9         61
11        21
Name: topic_le_small, dtype: int64

In [21]:
%%time

import optuna


def objective(trial):
    params = {
        'C' : trial.suggest_float("C", 1e-2, 1e2, log=True),
        'class_weight' : trial.suggest_categorical('class_weight' , [None, 'balanced']),
        'solver' : trial.suggest_categorical('solver' , ['lbfgs', 'sag']),
        }
    model = LogisticRegression(**params, random_state=42, max_iter=500, n_jobs=-1)
    model.fit(X, y_train_sm)
    y_pred = model.predict(X)
    f1_train = f1_score(y_train_sm, y_pred, average='micro')
    return f1_train


optuna.logging.set_verbosity(optuna.logging.INFO)
study = optuna.create_study(direction = 'maximize', study_name = 'log_small',
                           pruner=optuna.pruners.MedianPruner())
study.optimize(objective, show_progress_bar=True, n_trials=10)

[32m[I 2023-03-04 21:29:09,197][0m A new study created in memory with name: log_small[0m
  self._init_valid()


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2023-03-04 21:31:04,114][0m Trial 0 finished with value: 1.0 and parameters: {'C': 70.17359007392896, 'class_weight': None, 'solver': 'lbfgs'}. Best is trial 0 with value: 1.0.[0m
[32m[I 2023-03-04 21:32:59,896][0m Trial 1 finished with value: 1.0 and parameters: {'C': 45.312981098301265, 'class_weight': None, 'solver': 'lbfgs'}. Best is trial 0 with value: 1.0.[0m
[32m[I 2023-03-04 21:33:19,216][0m Trial 2 finished with value: 0.9990267077184005 and parameters: {'C': 0.028430852275116242, 'class_weight': None, 'solver': 'sag'}. Best is trial 0 with value: 1.0.[0m




[32m[I 2023-03-04 21:41:56,044][0m Trial 3 finished with value: 1.0 and parameters: {'C': 13.14488953000403, 'class_weight': None, 'solver': 'sag'}. Best is trial 0 with value: 1.0.[0m
[32m[I 2023-03-04 21:43:36,745][0m Trial 4 finished with value: 0.9991086184549708 and parameters: {'C': 0.44370850877123963, 'class_weight': None, 'solver': 'sag'}. Best is trial 0 with value: 1.0.[0m




[32m[I 2023-03-04 21:52:10,137][0m Trial 5 finished with value: 0.8410498065461134 and parameters: {'C': 0.07514554487168314, 'class_weight': 'balanced', 'solver': 'sag'}. Best is trial 0 with value: 1.0.[0m
[32m[I 2023-03-04 21:53:39,838][0m Trial 6 finished with value: 0.9988917959169907 and parameters: {'C': 1.73576453002948, 'class_weight': 'balanced', 'solver': 'lbfgs'}. Best is trial 0 with value: 1.0.[0m
[32m[I 2023-03-04 21:55:21,518][0m Trial 7 finished with value: 0.999344714107438 and parameters: {'C': 4.012903600705897, 'class_weight': 'balanced', 'solver': 'lbfgs'}. Best is trial 0 with value: 1.0.[0m
[32m[I 2023-03-04 21:55:39,487][0m Trial 8 finished with value: 0.9990267077184005 and parameters: {'C': 0.024544956028134106, 'class_weight': None, 'solver': 'sag'}. Best is trial 0 with value: 1.0.[0m
[32m[I 2023-03-04 21:58:05,957][0m Trial 9 finished with value: 0.9999903634427565 and parameters: {'C': 98.11819961355323, 'class_weight': 'balanced', 'solver':

In [22]:
study.best_params

{'C': 70.17359007392896, 'class_weight': None, 'solver': 'lbfgs'}

In [15]:
%%time

params = {'C': 70.17359007392896, 'class_weight': None, 'solver': 'lbfgs'}

clf_small = LogisticRegression(random_state=42, max_iter=500, n_jobs=-1, **params)
y_train_sm = df_train['topic_le_small']

X = csr_matrix(X)

clf_small.fit(X, y_train_sm)

CPU times: user 62.4 ms, sys: 171 ms, total: 233 ms
Wall time: 1min 15s


LogisticRegression(C=70.17359007392896, max_iter=500, n_jobs=-1,
                   random_state=42)

In [16]:
y_test_pred = clf_small.predict(X)
f1_score(y_train_sm, y_test_pred, average='micro')

1.0

In [20]:
print(classification_report(y_train_sm, y_test_pred, target_names={'Крым': 8, 'Культпросвет ': 9, 'Легпром': 11, 'Остальные классы': 42}))

                  precision    recall  f1-score   support

            Крым       1.00      1.00      1.00       120
   Культпросвет        1.00      1.00      1.00        61
         Легпром       1.00      1.00      1.00        21
Остальные классы       1.00      1.00      1.00    207341

        accuracy                           1.00    207543
       macro avg       1.00      1.00      1.00    207543
    weighted avg       1.00      1.00      1.00    207543



In [26]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train_sm, y_test_pred)

array([[   120,      0,      0,      0],
       [     0,     61,      0,      0],
       [     0,      0,     21,      0],
       [     0,      0,      0, 207341]])

Соединим их

In [45]:
y_test_pred = clf.predict_proba(X_test)
y_test_pred_small = clf_small.predict_proba(X_test)

In [48]:
# находим максимум в маленькой модели, если он приходится на последний класс, то идем в большую модель и там берем максимум
# в другом случае берем нашу маленькую модель и сравниваем максимум в ней и в большой модели и берем наибольший

# y_pred_new = [-1]*X_test.shape[0]
y_pred_new = clf.predict(X_test)
# y_pred_new = [clf_small.classes_[np.argmax(y_test_pred_small[i])]
#               for i in range(X_test.shape[0]) if np.argmax(y_test_pred_small[i]) != 3]


for i in range(X_test.shape[0]):
    max_small_index = np.argmax(y_test_pred_small[i])
    if max_small_index != 3:
        y_pred_new[i] = clf_small.classes_[max_small_index]

In [49]:
f1_score(df_test['topic_le'], y_pred_new, average='micro')

0.8264514574801253

In [50]:
print(classification_report(df_test['topic_le'], y_pred_new, target_names=dict_topic))

                   precision    recall  f1-score   support

   69-я параллель       0.85      0.60      0.70       121
           Бизнес       0.69      0.54      0.61       268
      Бывший СССР       0.85      0.84      0.84      3376
              Дом       0.86      0.80      0.83       797
    Забота о себе       0.85      0.82      0.84       114
         Из жизни       0.72      0.68      0.70      1613
   Интернет и СМИ       0.78      0.74      0.76      2333
             Крым       0.72      0.54      0.62        24
    Культпросвет        0.60      0.25      0.35        12
         Культура       0.87      0.87      0.87      2584
          Легпром       1.00      0.50      0.67         4
              Мир       0.80      0.83      0.81      7048
       Моя страна       0.89      0.83      0.86       221
  Наука и техника       0.87      0.86      0.87      2722
       Нацпроекты       0.96      0.89      0.92       170
      Путешествия       0.87      0.84      0.85       

Эта гипотеза сработала лучше, однако все равно по качетсву уступаем нашей лучшей модели - Catboost