In [1]:
import os
import sys
from collections import Counter
from pathlib import Path
from typing import Any, Callable, Optional, Union

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn

from torch import Tensor
from torch.utils.data import DataLoader
from torchmetrics import F1Score, Accuracy
from transformers import BertTokenizer, BertForSequenceClassification, get_cosine_schedule_with_warmup, AdamW
from lightning.pytorch.utilities.types import (EVAL_DATALOADERS, STEP_OUTPUT,
                                               TRAIN_DATALOADERS)
import lightning as L

from sklearn.model_selection import train_test_split
import plotly.figure_factory as ff
from matplotlib import pyplot as plt
from sklearn.metrics import (
    f1_score,
    roc_auc_score,
    accuracy_score,
    multilabel_confusion_matrix,
    confusion_matrix,
    ConfusionMatrixDisplay,
    classification_report,
    precision_score,
    recall_score,
)

import re

import nltk
from nltk.corpus import stopwords
import string
from lightning.pytorch.loggers import TensorBoardLogger

In [2]:
MODEL_ID = '4_lightning'
TEST_SIZE = 0.2
VAL_SIZE = 0.2

In [3]:
df = pd.read_csv(r"C:\Users\vsevo\MKN\skillbox_nlp-vsevolod-lavrov\data\practice_cleaned.csv")
df

Unnamed: 0,Направление,Факультет,ID студента,Оценка,Категория,Тег,Комментарий,Статус
0,C,113.0,1493.0,1.0,Видео,VP2,Видео лагает,
1,C,113.0,5580.0,5.0,ДЗ,H3 D,Торгом Бабаян! Спасибо вам большое за помощь в...,
2,E,126.0,5619.0,5.0,ДЗ,H3,Спасибо),
3,E,123.0,310.0,3.0,ДЗ,H2 E1,комментарий содержит нерелевантную информацию ...,
4,E,123.0,1913.0,5.0,ДЗ,H3 D,"Жонибек, хочу Вас поблагодарить за ваши советы...",
...,...,...,...,...,...,...,...,...
56124,Z,133.0,,3.0,ДЗ,H2,требуемый формат иконок платный,
56125,Z,,,0.0,,S1,заплатила и дальше просто никому нет дела поче...,
56126,Z,,,7.0,,LMS,Крайне раздражают некоторые детали)\nНапример ...,
56127,Z,,,,,VC2 VP2,321.Профессия Бизнес-аналитик\nАналитик данных...,


In [4]:
def process_data(df):
    df = df[['Категория', 'Комментарий']].dropna()
    rename = {
        'Категория': 'category',
        'Комментарий': 'text'
    }
    df = df.rename(columns=rename)
    return df



In [5]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer):
        #df = process_data(df)

        CLASSES = list(df['category'].unique())

        labels = dict(zip(CLASSES, range(len(CLASSES))))
       
        self.labels = [labels[label] for label in df['category']]
   
            
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
      
        return len(self.labels)

    def __getitem__(self, idx):
        batch_texts = self.texts[idx]
        batch_y = np.array(self.labels[idx])
        return batch_texts, batch_y
       

In [6]:
def remove_emoji(inputString):
    emoji_pattern = re.compile("["
    u"\U0001F600-\U0001F64F" # emoticons
    u"\U0001F300-\U0001F5FF" # symbols & pictographs
    u"\U0001F680-\U0001F6FF" # transport & map symbols
    u"\U0001F1E0-\U0001F1FF" # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u'\U00010000-\U0010ffff'
    u"\u200d"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\u3030"
    u"\ufe0f"
    u"\u2069"
    u"\u2066"
    u"\u200c"
    u"\u2068"
    u"\u2067"
    "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', inputString)

In [7]:
nltk.download("stopwords")
def remove_rus_stopwords_func(text):
    '''
    Removes Stop Words (also capitalized) from a string, if present
    
    Args:
        text (str): String to which the function is to be applied, string
    
    Returns:
        Clean string without Stop Words
    ''' 
    

   
    # check in lowercase 
    t = [token for token in text.split() if not token in set(stopwords.words("russian"))]
    text = ' '.join(t)    
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vsevo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
_collate_fn_t = Callable[[list[tuple[Tensor, Any]]], Any]


class Datamodule(L.LightningDataModule):
    def __init__(
        self,
        datadir: Path,
        tokenizer_path: Path,
        batch_size: int,
        num_workers: int = 0,
    ):
        super().__init__()
        self.datadir = datadir
        self.batch_size = batch_size
        self.num_workers = num_workers

        self.test_size = 0.2
        self.val_size = 0.2

        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

    def prepare_data(self) -> None:
        df = pd.read_csv(self.datadir)

        df = df[(df['Категория'] != "Качество материалов") & (df['Категория'] != "Интерфейс платформы") & (df['Категория'] != "Общение с куратором")]
    
        df = df[['Категория', 'Комментарий']].dropna()

        df['Комментарий'] = df['Комментарий'].apply(lambda text: remove_rus_stopwords_func(text))

        df['Комментарий'] = df['Комментарий'].apply(lambda text: remove_emoji(text))

        df = df[df.Комментарий.apply(lambda x: len(x.split())) > 1]

        df.drop_duplicates(inplace=True, subset=['Комментарий'])
        
        rename = {
            'Категория': 'category',
            'Комментарий': 'text'
        }
        df = df.rename(columns=rename)
        
        self.train, self.test = train_test_split(df, test_size=self.test_size, random_state=1337)
        self.train, self.val = train_test_split(self.train, test_size=self.val_size, random_state=1337)
    
    @property
    def collate_fn(self) -> _collate_fn_t | None:
        return lambda batch: tuple(zip(*batch))
        
    def setup(self, stage: str) -> None:

        if stage == "fit":
            self.train_dataset = CustomDataset(
                self.train,
                self.tokenizer
            )
            self.val_dataset = CustomDataset(
                self.val,
                self.tokenizer
            )
   
        elif stage == "validate":
            self.val_dataset = CustomDataset(
                self.val,
                self.tokenizer
            )
        elif stage == "test":
            self.test_dataset = CustomDataset(
                self.test,
                self.tokenizer
            )
        else:
            raise NotImplementedError


    def train_dataloader(self) -> TRAIN_DATALOADERS:
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
        )

    def val_dataloader(self) -> EVAL_DATALOADERS:
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
        )

    def test_dataloader(self) -> EVAL_DATALOADERS:
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
     
        )

In [9]:
# dm = Datamodule("../data/practice_cleaned.csv", 'cointegrated/rubert-tiny', 64)

In [10]:
# dm.prepare_data()
# dm.setup("test")
# for batch in dm.test_dataset:
#     print(batch)

In [11]:
class Lit(L.LightningModule):
    def __init__(self, learning_rate, model_path, tokenizer_path, n_classes=4) -> None:
        super().__init__()
        self.save_hyperparameters()
        self.model = BertForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.max_len = 512
        self.out_features = self.model.bert.encoder.layer[1].output.dense.out_features
        self.model.classifier = torch.nn.Linear(self.out_features, n_classes)
        self.model = self.model
        self.loss = torch.nn.CrossEntropyLoss()
        self.learning_rate = learning_rate

    def training_step(
        self, batch: tuple[list[Tensor], list[dict[str, Tensor]]], batch_idx: int
    ):
        self.model.train()

        train_input, train_label = batch
       
       
        mask = train_input['attention_mask']
        input_id = train_input['input_ids'].squeeze(1)
        output = self.model(input_id, mask)

        loss = self.loss(output[0], train_label.long())

        self.log("train_loss", loss, on_epoch=True, on_step=False)
        
        return loss

    def validation_step(
        self, batch: tuple[Tensor, Tensor], batch_idx: int
    ):
       
        train_input, train_label = batch
      
        
        
        
        mask = train_input['attention_mask']
        input_id = train_input['input_ids'].squeeze(1)
        output = self.model(input_id, mask)

        # print(train_label.shape, output[0].shape)
        f1 = F1Score(task="multiclass", num_classes=4, average='macro').to('cuda')
        
        score = f1(output[0], train_label).to('cuda')

        self.log('val_f1_score', score)
        
        return {
            "f1_score": score,
        }
    
    def test_step(
        self, batch: tuple[Tensor, Tensor], batch_idx: int
    ):
   
        train_input, train_label = batch
      
        mask = train_input['attention_mask']
        input_id = train_input['input_ids'].squeeze(1)
        output = self.model(input_id, mask)
       
        score = f1_score(train_label, output, average='macro')        

        self.log('test_f1_score', score)
        
        return {
            "f1_score": score,
        }

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
        
        return {
            "optimizer": optimizer,
            "lr_scheduler": torch.optim.lr_scheduler.MultiStepLR(
                optimizer, milestones=[5, 10, 15]
            )
        }

In [12]:
logger = TensorBoardLogger(".")

In [13]:
lit_module = Lit(
    learning_rate=0.001,
    model_path='cointegrated/rubert-tiny',
    tokenizer_path='cointegrated/rubert-tiny'
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
trainer = L.Trainer(
    accelerator="gpu",
    max_epochs=4,
    logger=logger,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [15]:
datamodule = Datamodule(
    Path("../data/practice_cleaned.csv"),
    'cointegrated/rubert-tiny',
    batch_size=8
)

In [16]:
model = BertForSequenceClassification.from_pretrained('cointegrated/rubert-tiny')
out_features = model.bert.encoder.layer[1].output.dense.out_features
model.classifier = torch.nn.Linear(out_features, 4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:

# for batch in datamodule.train_dataloader():
#     train_input, train_label = batch

#     mask = train_input['attention_mask']
#     input_id = train_input['input_ids'].squeeze(1)
#     output = model(input_id, mask)

#     # print(train_label.shape, output[0].shape)
#     f1 = F1Score(task="multiclass", num_classes=4, average='macro')
    
#     score = f1(output[0], train_label)  
#     print(score)

In [18]:
trainer.fit(
    model=lit_module,
    datamodule=datamodule,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 11.8 M
1 | loss  | CrossEntropyLoss              | 0     
--------------------------------------------------------
11.8 M    Trainable params
0         Non-trainable params
11.8 M    Total params
47.142    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\vsevo\MKN\skillbox-practice-vsevolod-lavrov\.pixi\env\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
c:\Users\vsevo\MKN\skillbox-practice-vsevolod-lavrov\.pixi\env\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=4` reached.


In [19]:
def predict(logits: torch.Tensor) -> np.ndarray:
    """Helper function for predictions calculating.

    Args:
        logits (torch.Tensor): model's raw output

    Returns:
        np.ndarray: array with predicted class id.
    """
    s = torch.nn.Softmax()
    probs = s(torch.tensor(logits))
    return np.argmax(probs)

In [20]:
datamodule.setup(stage="test")

preds_logits = torch.tensor([])
targets = torch.tensor([])

with torch.no_grad():
    for val_input, val_label in tqdm(datamodule.test_dataloader()):
        mask = val_input['attention_mask']
        input_id = val_input['input_ids'].squeeze(1)
        output = lit_module.model(input_id, mask)[0]
        preds_logits = torch.cat((preds_logits, output))
        targets = torch.cat((targets, val_label.long().cpu()))



100%|██████████| 1079/1079 [22:35<00:00,  1.26s/it]


In [21]:

preds = np.apply_along_axis(predict, 1, preds_logits)
preds

  return self._call_impl(*args, **kwargs)


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [22]:
CLASSES = [0, 1, 2, 3]

In [23]:
targets

tensor([0., 0., 1.,  ..., 1., 0., 1.])

In [24]:
preds

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [25]:
cr = classification_report(targets, preds, target_names=CLASSES, output_dict=True)
cr = pd.DataFrame(cr).T
print(cr)

              precision    recall  f1-score      support
0              0.440797  0.988433  0.609697  3804.000000
1              0.520408  0.012213  0.023865  4176.000000
2              0.000000  0.000000  0.000000   262.000000
3              0.000000  0.000000  0.000000   386.000000
accuracy       0.441701  0.441701  0.441701     0.441701
macro avg      0.240301  0.250161  0.158391  8628.000000
weighted avg   0.446224  0.441701  0.280360  8628.000000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
# trainer.test(
#     model=lit_module,
#     datamodule=datamodule,
# )