Рассмотрим задачу классификации текстов BBC на 5 категорий: business, entertainment, politics, sport, tech.
Используем тематическую модель ARTM и в качестве эмбеддингов документов возьмем распределения тем в документах (столбцы матрицы Θ).
Для классификации воспользуемся градиентным бустингом.
Цель этого ноутбука — сравнить эмбеддинги тематической модели, эмбеддинги doc2vec и их конкатенацию.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import nltk
import artm
from collections import Counter
# Fetch the NLTK resources used further down: WordNet for WordNetLemmatizer
# and the stop-word lists read via nltk.corpus.stopwords.
nltk.download('wordnet')
nltk.download('stopwords')

Данные возьмем из соревнования на Kaggle: https://www.kaggle.com/c/learn-ai-bbc

In [2]:
# Load the training split; the path is relative, so the CSV must sit in the
# current working directory.
train_df = pd.read_csv('BBC News Train.csv')

In [3]:
# Preview the first ten rows.
train_df.head(10)

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business
5,1582,howard truanted to play snooker conservative...,politics
6,651,wales silent on grand slam talk rhys williams ...,sport
7,1797,french honour for director parker british film...,entertainment
8,2034,car giant hit by mercedes slump a slump in pro...,business
9,1866,fockers fuel festive film chart comedy meet th...,entertainment


Посмотрим на баланс классов.

In [16]:
# Number of articles in each category.
train_df['Category'].value_counts()

sport            346
business         336
politics         274
entertainment    273
tech             261
Name: Category, dtype: int64

Проведем предобработку текста: токенизацию и очистку, лемматизацию, удаление стоп-слов и добавление биграмм и триграмм к отдельным словам.

In [5]:
import string
# Maps every ASCII punctuation character to a space. Built once so each call
# makes a single pass over the text instead of one str.replace per symbol.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def filter_and_tokenize(text):
    """Lower-case *text* and split it into cleaned word tokens.

    Processing steps, in order:
      1. lower-case and split on any run of whitespace (spaces, newlines,
         tabs, carriage returns), so no whitespace character can stay glued
         to a token or hide a URL/mention prefix;
      2. drop raw words starting with '@', 'http' or 'www';
      3. turn every ASCII punctuation character into a space;
      4. drop purely numeric tokens and tokens shorter than two characters.

    Args:
        text: the raw document as a single string.

    Returns:
        A list of token strings; empty when nothing survives the filters.
    """
    words = text.lower().split()
    kept = ' '.join(w for w in words if not w.startswith(('@', 'http', 'www')))
    cleaned = kept.translate(_PUNCT_TO_SPACE)
    return [t for t in cleaned.split() if not t.isnumeric() and len(t) > 1]
# Replace each article's raw string with its list of cleaned tokens.
train_df['Text'] = train_df['Text'].apply(filter_and_tokenize)

In [6]:
from nltk.stem import WordNetLemmatizer 

# A single lemmatizer instance is shared by every call to lemm().
lemmatizer = WordNetLemmatizer()


def lemm(tokens):
    """Return a new list with lemmatizer.lemmatize applied to each token, in order."""
    return list(map(lemmatizer.lemmatize, tokens))


# Replace each document's token list with the lemmatized one.
train_df['Text'] = train_df['Text'].apply(lemm)

In [7]:
# NLTK's English stop-word list plus two extra entries.
# NOTE(review): 'wa' and 'ha' look like what the lemmatizer leaves of
# "was" / "has" — confirm.
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords.update(['wa', 'ha'])


def remove_stopwords(tokens):
    """Return the tokens that are not in the module-level ``stopwords`` set, order kept."""
    kept = []
    for token in tokens:
        if token in stopwords:
            continue
        kept.append(token)
    return kept


# Strip stop words from every document.
train_df['Text'] = train_df['Text'].apply(remove_stopwords)

In [8]:
def n_grams(arr):
    """Expand a token sequence into its unigrams, bigrams and trigrams.

    For every position the output holds the token itself, then the
    underscore-joined bigram starting there (if one fits), then the
    underscore-joined trigram starting there (if one fits).

    Args:
        arr: sequence of string tokens.

    Returns:
        A list of strings grouped by starting position.
    """
    total = len(arr)
    bag = []
    for start, token in enumerate(arr):
        bag.append(token)
        for width in (2, 3):
            stop = start + width
            if stop <= total:
                bag.append("_".join(arr[start:stop]))
    return bag
# Extend each document's token list with its bigrams and trigrams.
train_df['Text'] = train_df['Text'].apply(n_grams)

In [9]:
# Corpus-wide frequency of every token (single words and n-grams alike).
token_to_counter = Counter(
    token
    for doc_tokens in train_df['Text']
    for token in doc_tokens
)
# Show the 30 most frequent entries.
token_to_counter.most_common(30)

[('said', 4839),
 ('year', 2172),
 ('mr', 2007),
 ('would', 1714),
 ('also', 1426),
 ('new', 1338),
 ('people', 1325),
 ('one', 1277),
 ('u', 1264),
 ('time', 1067),
 ('could', 1032),
 ('game', 963),
 ('first', 935),
 ('last', 893),
 ('two', 889),
 ('say', 845),
 ('film', 832),
 ('world', 823),
 ('uk', 780),
 ('government', 777),
 ('make', 711),
 ('company', 683),
 ('firm', 675),
 ('best', 644),
 ('get', 626),
 ('service', 620),
 ('number', 619),
 ('told', 591),
 ('month', 590),
 ('three', 584)]

In [10]:
# Dump the corpus in Vowpal Wabbit format, one article per line:
#     "<ArticleId> <token> <token> ... \n"
# Columns are addressed by name rather than by position, so the cell does not
# depend on column order or on positional integer access into a labelled row.
# No module-level name is rebound here; in particular the imported
# collections.Counter stays usable when earlier cells are re-run.
vw_filaname = 'texts.vw.txt'
# Explicit UTF-8 so non-ASCII characters in the tokens are written the same
# way on every platform.
with open(vw_filaname, 'w', encoding='utf-8') as fout:
    for article_id, tokens in zip(train_df['ArticleId'], train_df['Text']):
        fout.write('{} {} \n'.format(article_id, " ".join(tokens)))

In [11]:
# Convert the Vowpal Wabbit file into BigARTM batches written to ./batches.
bv = artm.BatchVectorizer(
    data_path=vw_filaname,
    data_format='vowpal_wabbit',
    batch_size=10000,
    target_folder='batches',
)

Построим тематическую модель.

In [12]:
# 30-topic ARTM model over the default modality. cache_theta=True keeps the
# theta matrix available for the later model.get_theta() call.
model = artm.ARTM(
    num_topics=30,
    cache_theta=True,
    num_document_passes=10,
    dictionary=bv.dictionary,
    class_ids={'@default_class': 1.0},
)

# Scores tracked during fitting: perplexity, top tokens per topic, phi sparsity.
model.scores.add(
    artm.PerplexityScore(
        name='perplexity',
        dictionary=bv.dictionary,
        class_ids=['@default_class'],
    )
)
model.scores.add(
    artm.TopTokensScore(name='top-tokens', num_tokens=15)
)
model.scores.add(
    artm.SparsityPhiScore(name='sparsity', class_id='@default_class')
)

# Decorrelation regularizer on phi for the default modality.
model.regularizers.add(
    artm.DecorrelatorPhiRegularizer(
        name='decorrelator_phi_def',
        class_ids=['@default_class'],
        tau=2e+5,
    )
)

In [13]:
# Run 16 single-pass fits so the scores can be printed after every pass.
for i in range(16):
    model.fit_offline(bv, num_collection_passes=1)
    print(
        f'Iter #{i}, '
        f'perplexity: {model.score_tracker["perplexity"].last_value}, '
        f'sparsity: {model.score_tracker["sparsity"].last_value}'
    )

Iter #0, perplexity: 533075.3125, sparsity: 0.0
Iter #1, perplexity: 105412.03125, sparsity: 0.0004982234095223248
Iter #2, perplexity: 38084.25, sparsity: 0.6386135220527649
Iter #3, perplexity: 18662.138671875, sparsity: 0.8575142025947571
Iter #4, perplexity: 15485.009765625, sparsity: 0.9145038723945618
Iter #5, perplexity: 14497.2646484375, sparsity: 0.9365740418434143
Iter #6, perplexity: 14244.81640625, sparsity: 0.9458815455436707
Iter #7, perplexity: 14093.7548828125, sparsity: 0.950194776058197
Iter #8, perplexity: 14045.1728515625, sparsity: 0.9526783227920532
Iter #9, perplexity: 14008.3427734375, sparsity: 0.9544075131416321
Iter #10, perplexity: 14004.0126953125, sparsity: 0.9554308652877808
Iter #11, perplexity: 13994.330078125, sparsity: 0.9561180472373962
Iter #12, perplexity: 13992.599609375, sparsity: 0.956582248210907
Iter #13, perplexity: 13986.0478515625, sparsity: 0.9569264054298401
Iter #14, perplexity: 13984.3154296875, sparsity: 0.9571753740310669
Iter #15, pe

In [14]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# X is an alias of the full frame (not a copy); y holds the class labels.
X = train_df
y = train_df['Category']
# Tag every document with its positional index in the frame.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(X['Text'])]
# gensim's RNG keyword is `seed`. The previous `random_state` is not a Doc2Vec
# parameter: depending on the gensim version it is silently ignored or
# rejected with a TypeError.
# NOTE(review): with workers > 1 training is still not bit-for-bit
# reproducible according to the gensim docs — use workers=1 if that matters.
d2vModel = Doc2Vec(documents, vector_size=60, window=20, min_count=10, workers=8, seed=42)

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


Получим точность классификации с помощью стратифицированной 5-блочной кросс-валидации и сравним результаты.

In [18]:
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import lightgbm as lgb
# Stratified k-fold comparison of three document representations:
# doc2vec vectors, ARTM topic distributions, and their concatenation.
k = 5
skf = StratifiedKFold(n_splits=k)

# get_theta() is transposed so that each row of X_topic describes one document.
# NOTE(review): assumes the theta columns come back in the same order as the
# rows of train_df — confirm, otherwise features and labels are misaligned.
# NOTE(review): both the topic model and doc2vec were fitted on the whole
# corpus, including the documents of every test fold (unsupervised, but
# transductive) — keep this in mind when reading the scores.
X_topic = model.get_theta().to_numpy().T


def _fold_accuracy(X_fit, y_fit, X_eval, y_eval):
    """Fit a fresh LightGBM classifier on one fold and return its hold-out accuracy."""
    clf = lgb.LGBMClassifier(n_estimators=100, reg_alpha=0.01, learning_rate=0.2,
                             class_weight='balanced', num_leaves=20, max_depth=6)
    clf.fit(X_fit, y_fit)
    # accuracy_score expects (y_true, y_pred).
    return metrics.accuracy_score(y_eval, clf.predict(X_eval))


# Running sums of per-fold accuracy; divided by k when printed.
avg_concat = 0
avg_topic = 0
avg_doc2vec = 0
for train, test in skf.split(X, y):
    # skf.split yields positional indices, so index by position explicitly
    # instead of relying on the frame having a default integer index.
    y_tr = y.iloc[train].to_numpy()
    y_tst = y.iloc[test].to_numpy()

    X_tr_t = X_topic[train]
    X_tst_t = X_topic[test]
    X_tr_doc = np.array([d2vModel.infer_vector(u) for u in X.iloc[train]['Text']])
    X_tst_doc = np.array([d2vModel.infer_vector(u) for u in X.iloc[test]['Text']])
    X_tr_con = np.concatenate((X_tr_doc, X_tr_t), axis=1)
    X_tst_con = np.concatenate((X_tst_doc, X_tst_t), axis=1)

    avg_doc2vec += _fold_accuracy(X_tr_doc, y_tr, X_tst_doc, y_tst)
    avg_topic += _fold_accuracy(X_tr_t, y_tr, X_tst_t, y_tst)
    avg_concat += _fold_accuracy(X_tr_con, y_tr, X_tst_con, y_tst)
print("doc2vec:", avg_doc2vec / k)
print("topic:", avg_topic / k)
print("concat:", avg_concat / k)

doc2vec: 0.9442953020134228
topic: 0.9322147651006711
concat: 0.963758389261745


Как можно заметить, конкатенация эмбеддингов тематической модели и doc2vec дала наилучший результат (accuracy ≈ 0.964 против 0.944 у doc2vec и 0.932 у тематической модели).