Построить модель для выявления в новости события, соответствующего задержке ввода некоторого объекта в эксплуатацию.

### EDA

In [1]:
%%capture
# Fetch news.zip from GitHub and extract it into /content/.
!wget https://raw.githubusercontent.com/shitkov/news_classification/main/news.zip
!unzip '/content/news.zip' -d '/content/'

In [2]:
import pandas as pd

In [3]:
# Load the training table; the first CSV column becomes the DataFrame index.
data = pd.read_csv('/content/train_data.csv', index_col=0)

In [4]:
# Class balance: print the number of rows for each label value.
for label_value in (1, 0):
    matching_rows = data[data['label'] == label_value]
    print(f'{label_value}: ', len(matching_rows))

1:  329
0:  1340


In [5]:
# Data cleaning.
import re

# Every character that is not a lowercase Cyrillic letter or a space becomes a space.
_non_cyrillic = re.compile('[^а-яё ]')
# Runs of spaces are collapsed into a single space.
_space_run = re.compile(r" +")

labels = list(data['label'])
texts = []
for raw_sentence in data['sentence']:
    lowered = str(raw_sentence).lower()
    letters_only = _non_cyrillic.sub(' ', lowered)
    texts.append(_space_run.sub(" ", letters_only).strip())
data['sentence'] = texts

In [6]:
# Train/test split: 30% of the rows are held out, with a fixed seed.
from sklearn.model_selection import train_test_split

data_train, data_test = train_test_split(data, random_state=42, test_size=0.3)
# Label lists aligned with the two splits.
labels_train, labels_test = (
    list(split['label']) for split in (data_train, data_test)
)

### Vectorization

TF-IDF

In [7]:
# Lemmatizer installation.
# NOTE(review): IPython only recognises a cell magic on the first line of a
# cell; confirm this comment does not sit above %%capture in the real notebook.
%%capture
!pip install pymystem3
from pymystem3 import Mystem
mstm = Mystem()
# Download the mystem 3.0 Linux archive, unpack it, and copy the binary to
# /root/.local/bin/mystem.
!wget http://download.cdn.yandex.net/mystem/mystem-3.0-linux3.1-64bit.tar.gz
!tar -xvf mystem-3.0-linux3.1-64bit.tar.gz
!cp mystem /root/.local/bin/mystem

In [8]:
# Lemmatization.
mstm = Mystem()


def _lemmatize_sentence(sentence):
    """Join the tokens returned by mystem for one sentence, dropping the last one."""
    return ''.join(mstm.lemmatize(sentence)[:-1])


texts_train = list(data_train['sentence'])
texts_test = list(data_test['sentence'])

texts_train_norm = list(map(_lemmatize_sentence, texts_train))
texts_test_norm = list(map(_lemmatize_sentence, texts_test))

In [9]:
# Stop words.
%%capture
# Download rus_stop_dict.txt from GitHub into the current working directory.
!wget https://raw.githubusercontent.com/shitkov/news_classification/main/rus_stop_dict.txt

In [10]:
path_stop = '/content/rus_stop_dict.txt'

# Read one stop word per line (only the trailing newline is stripped).
# The encoding is explicit so the Cyrillic file is decoded the same way on
# every platform.
try:
    with open(path_stop, encoding='utf-8') as f:
        stopwords = [line.rstrip('\n') for line in f]
except (OSError, UnicodeDecodeError) as err:
    # Still best-effort: report the problem, but leave `stopwords` defined so
    # later cells do not fail with a NameError (no stop words are removed then).
    print(err)
    stopwords = []

In [11]:
# Stop-word removal.
def drop_stop(text, stop_words=None):
    """Return *text* with every stop-word token removed.

    The text is split on single spaces, tokens found in the stop-word
    collection are dropped, and the remaining tokens are re-joined with
    single spaces.

    Args:
        text: space-separated string to filter.
        stop_words: optional iterable of tokens to drop. When omitted, the
            module-level ``stopwords`` collection is used.

    Returns:
        The filtered string.
    """
    if stop_words is None:
        stop_words = stopwords
    # A set gives O(1) membership tests instead of scanning a list per token.
    blocked = set(stop_words)
    return ' '.join(token for token in text.split(' ') if token not in blocked)

In [12]:
# Strip stop words from the lemmatized train and test sentences.
texts_train_norm_clean = list(map(drop_stop, texts_train_norm))
texts_test_norm_clean = list(map(drop_stop, texts_test_norm))

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer limited to at most 10000 features.
model_tfidf = TfidfVectorizer(max_features=10000)
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that fitted vectorizer.
X_train_tfidf = model_tfidf.fit_transform(texts_train_norm_clean)
X_test_tfidf = model_tfidf.transform(texts_test_norm_clean)

FastText

In [14]:
%%capture
# Install gensim pinned to 3.8.1 and compress-fasttext with its "full" extras.
!pip install gensim==3.8.1
!pip install compress-fasttext[full]

import gensim
import compress_fasttext
import numpy as np
# Load the compressed fastText vectors from the given release URL.
model_fasttext = compress_fasttext.models.CompressedFastTextKeyedVectors.load(
    'https://github.com/avidale/compress-fasttext/releases/download/v0.0.1/ft_freqprune_400K_100K_pq_300.bin'
)

# One vector per cleaned (non-lemmatized) sentence.
# NOTE(review): each whole sentence string is used as a single lookup key;
# confirm this is intended rather than combining per-word vectors.
X_train_fasttext = np.asarray([model_fasttext[text] for text in texts_train])
X_test_fasttext = np.asarray([model_fasttext[text] for text in texts_test])

USE (Universal Sentence Encoder)

In [15]:
!pip3 install tensorflow_text>=2.0.0rc0

import tensorflow_hub as hub
import numpy as np
import tensorflow_text

model_use = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

X_train_use = np.asarray([model_use(text)[0] for text in texts_train])
X_test_use = np.asarray([model_use(text)[0] for text in texts_test])

BERT

In [16]:
%%capture
# Install the transformers package that the BERT cells import from.
!pip install transformers

In [17]:
from transformers import BertTokenizer, BertModel
import torch

In [18]:
def bert_emb(sentences, tokenizer, model, device):
    """Embed each sentence with the first-token vector of the last hidden state.

    Sentences are encoded one at a time, padded or truncated to 128 tokens,
    and passed through ``model`` with gradient tracking disabled.

    Args:
        sentences: iterable of strings to embed.
        tokenizer: callable returning an object with ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` tensors.
        model: callable accepting those three tensors as keyword arguments
            and returning an object with ``last_hidden_state``.
        device: torch device the encoded tensors are moved to.

    Returns:
        A numpy array built from one first-token vector per sentence.
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        return_tensors='pt',
        padding='max_length',
        max_length=128,
        truncation=True,
    )
    vectors = []
    for sentence in sentences:
        encoded = tokenizer(sentence, **tokenizer_options)
        model_inputs = {
            'input_ids': encoded.input_ids.to(device),
            'token_type_ids': encoded.token_type_ids.to(device),
            'attention_mask': encoded.attention_mask.to(device),
        }

        with torch.no_grad():
            outputs = model(**model_inputs)

        # Position 0 along the sequence axis, for the single encoded sentence.
        first_token = outputs.last_hidden_state[:, 0, :]
        vectors.append(first_token.cpu().detach().numpy()[0])
    return np.array(vectors)

In [19]:
%%capture
# Load the pretrained multilingual cased BERT tokenizer and encoder.
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model_bert = BertModel.from_pretrained("bert-base-multilingual-cased")

# Use the first CUDA device when one is available, otherwise the CPU, and
# move the model there.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_bert.to(device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
# BERT embeddings for the cleaned (non-lemmatized) train and test sentences.
X_train_bert, X_test_bert = (
    bert_emb(split_texts, tokenizer_bert, model_bert, device)
    for split_texts in (texts_train, texts_test)
)

In [21]:
# Feature sets keyed by embedding name; each value is a (train, test) pair.
data_dict = dict(
    tfidf=(X_train_tfidf, X_test_tfidf),
    fasttext=(X_train_fasttext, X_test_fasttext),
    use=(X_train_use, X_test_use),
    bert=(X_train_bert, X_test_bert),
)

### Classification

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

class Classifier:
    """Wrap one of three classifiers and score it with F1 on a held-out set.

    Args:
        classifier: ``'random_forest'`` or ``'xgboost'`` select those models;
            any other value (including the default ``'logreg'``) selects a
            logistic regression with class weights ``{0: 1.0, 1: 2.0}``.
    """

    def __init__(self, classifier='logreg'):
        if classifier == 'random_forest':
            self.cls = RandomForestClassifier(random_state=42)
        elif classifier == 'xgboost':
            self.cls = XGBClassifier(random_state=42)
        else:
            # Fallback branch: used for 'logreg' and any unrecognised name.
            self.cls = LogisticRegression(
                random_state=42,
                max_iter=10000,
                class_weight={0: 1.0, 1: 2.0},
            )

    def predict(self, x_train, x_test, y_train, y_test):
        """Fit on the training data and return the F1 score on the test data.

        Despite its name, this method trains the wrapped model and returns a
        score rather than the predictions themselves.

        Args:
            x_train: training features.
            x_test: test features.
            y_train: training labels.
            y_test: true test labels.

        Returns:
            The F1 score of the predictions on ``x_test`` against ``y_test``.
        """
        self.cls.fit(x_train, y_train)
        predictions = self.cls.predict(x_test)
        # f1_score takes (y_true, y_pred). Binary F1 is symmetric in the two,
        # so the value is unchanged, but the call now matches the documented
        # contract.
        return f1_score(y_test, predictions)

In [23]:
# Classifier names passed to Classifier(...) in the evaluation loop.
cls_list = ['logreg', 'random_forest', 'xgboost']

In [24]:
# Score every (embedding, classifier) pair and report the best one.
results = []
for emb, (x_train, x_test) in data_dict.items():
    for cls_name in cls_list:
        score = Classifier(cls_name).predict(x_train, x_test, labels_train, labels_test)
        results.append((f'{emb}_{cls_name}', score))

# Highest score first.
results.sort(key=lambda pair: pair[1], reverse=True)
print('BEST: ', results[0])

BEST:  ('tfidf_xgboost', 0.650887573964497)
