In [None]:
# Mount Google Drive into the Colab filesystem so the project files stored
# there can be copied into the working directory by the next cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy everything from the Drive folder `emd` into the current working directory.
# NOTE(review): presumably this provides train.csv, test.csv and config.ini,
# which later cells read from '.' - confirm against the Drive folder contents.
!cp /content/drive/MyDrive/emd/* .

In [None]:
!pip install pytorch-lightning comet-ml transformers dataset sentencepiece

Collecting pytorch-lightning
[?25l  Downloading https://files.pythonhosted.org/packages/e4/11/d5076df8e768662748bc8fe86bc9206e4e900c1784372fcd989c74cde1b3/pytorch_lightning-1.1.3-py3-none-any.whl (680kB)
[K     |████████████████████████████████| 686kB 13.3MB/s 
[?25hCollecting comet-ml
[?25l  Downloading https://files.pythonhosted.org/packages/07/6b/8a64dbc9ce8475abfee0e558744263843c93d9fa81b12899638534071506/comet_ml-3.2.10-py2.py3-none-any.whl (241kB)
[K     |████████████████████████████████| 245kB 52.4MB/s 
[?25hCollecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 50.2MB/s 
[?25hCollecting dataset
  Downloading https://files.pythonhosted.org/packages/fe/7c/e8b9d921bcbcfb192b673eb6eb776fb9110d9675dab41886b89ca8bca6ff/dataset-1.4.3-py2.py3-none-any.whl
Collecting sentencepiece
[?25l  Downl

In [None]:
import os

# Process-wide switches that must be in place before the ML libraries are imported.
# NOTE(review): judging by the variable names, the first turns off tokenizer-level
# parallelism and the second turns off Comet's automatic logging - confirm
# against the respective library docs.
os.environ.update({
    'TOKENIZERS_PARALLELISM': 'false',
    'COMET_DISABLE_AUTO_LOGGING': '1',
})

In [None]:
from configparser import ConfigParser
from pathlib import Path

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from easydict import EasyDict
from collections import Counter
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor, EarlyStopping
from pytorch_lightning.loggers import CometLogger
from sklearn.metrics import f1_score, accuracy_score
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau, MultiplicativeLR
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, AlbertTokenizer, AlbertForSequenceClassification

comet_ml is installed but `COMET_API_KEY` is not set.


In [None]:
class DatasetModule(pl.LightningDataModule):
    """Lightning data module serving (review text, score) pairs read from CSV files.

    ``train.csv`` and ``test.csv`` are read from ``data_dir``; both must contain
    the columns ``reviewText`` and ``score``. Scores are shifted down by one so
    they can be used as zero-based class labels, and the training split is
    undersampled so that every class has as many rows as the rarest class.

    Args:
        data_dir: Directory containing ``train.csv`` and ``test.csv``.
        tokenizer: Callable tokenizer handed to ``SentimentDataset``.
        batch_size: Batch size used by both data loaders.
        max_len: Sequence length that every sample is padded/truncated to.
        cutoff: If truthy, only the first ``cutoff`` rows of each CSV are used.
        num_workers: Number of worker processes for both data loaders.
    """

    def __init__(self, data_dir: str, tokenizer, batch_size=32, max_len=128, cutoff=None, num_workers=8):
        super().__init__()
        self.data_dir: Path = Path(data_dir)
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_length = max_len
        self.train_df, self.val_df = None, None
        self.cutoff = cutoff
        self.num_workers = num_workers

    def prepare_data(self, *args, **kwargs):
        """Load both CSV files, preprocess them and balance the training split.

        NOTE(review): Lightning's documentation recommends assigning state in
        ``setup()`` rather than ``prepare_data()``; the assignment is kept here
        to preserve the existing single-process behaviour.
        """
        train_df = pd.read_csv(str(self.data_dir / "train.csv"))
        val_df = pd.read_csv(str(self.data_dir / "test.csv"))

        self.train_df = self._preprocess_df(train_df)
        self.val_df = self._preprocess_df(val_df)

        self.train_df = self.df_undersampling(self.train_df)
        print(self.train_df.head())

    def train_dataloader(self):
        """Return the shuffled loader over the balanced training split."""
        return self._make_loader(self.train_df, shuffle=True)

    def val_dataloader(self):
        """Return the validation loader.

        Validation data is not shuffled: ordering has no benefit for evaluation
        and a fixed order keeps the per-batch validation metrics comparable
        between epochs.
        """
        return self._make_loader(self.val_df, shuffle=False)

    def _make_loader(self, df, shuffle):
        """Wrap ``df`` in a ``SentimentDataset`` and build a ``DataLoader`` for it."""
        dataset = SentimentDataset(df, self.tokenizer, self.max_length)
        return DataLoader(dataset, num_workers=self.num_workers, batch_size=self.batch_size, shuffle=shuffle)

    def _preprocess_df(self, df):
        """Return a new frame with ``reviewText`` and the zero-based integer ``score``.

        The input frame is left untouched. Building the result with ``assign``
        avoids writing into a ``head()`` slice, which is what triggered pandas'
        SettingWithCopyWarning.
        """
        if self.cutoff:
            df = df.head(self.cutoff)
        selected = df[['reviewText', 'score']]
        # Shift scores down by one so the smallest raw score becomes label 0.
        return selected.assign(score=selected['score'].astype(int) - 1)

    def df_undersampling(self, df):
        """Return a shuffled copy of ``df`` in which every ``score`` class is equally frequent.

        Each class is randomly reduced to the size of the rarest class. The
        input frame is not modified. An empty frame is returned unchanged
        (with a fresh index) instead of raising.
        """
        if df.empty:
            return df.reset_index(drop=True)

        df = df.assign(score=df['score'].astype(int))
        min_quantity = int(df['score'].value_counts().min())
        # Sample every class once and concatenate in a single call rather than
        # growing a DataFrame inside the loop.
        parts = [df[df['score'] == label].sample(min_quantity) for label in df['score'].unique()]
        return pd.concat(parts).sample(frac=1).reset_index(drop=True)

class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one review per lookup.

    Each item is a dict holding one ``int64`` tensor per field returned by the
    tokenizer, plus a ``labels`` tensor built from the row's ``score`` value.

    Args:
        df: Frame with the columns ``reviewText`` and ``score``.
        tokenizer: Callable that accepts a text together with the keyword
            arguments ``add_special_tokens``, ``padding``, ``max_length`` and
            ``truncation`` and returns a mapping of field name to values.
        max_length: Length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, df: pd.DataFrame, tokenizer, max_length):
        self.df, self.tokenizer, self.max_length = df, tokenizer, max_length

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        sample = self.df.iloc[idx]
        encoded = self.tokenizer(
            sample['reviewText'],
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
        )

        # Convert every tokenizer field to a long tensor, then attach the label.
        features = {}
        for field, values in encoded.items():
            features[field] = torch.tensor(values).long()
        features['labels'] = torch.tensor(sample['score'])
        return features


class LitModule(pl.LightningModule):
    """Lightning wrapper that fine-tunes a sequence-classification model.

    Args:
        model: Model exposing ``base_model.embeddings`` and
            ``base_model.encoder``. When called with ``labels`` its output
            must provide ``.loss`` and ``.logits``.
        freeze: Fraction of encoder layers to freeze, counted from the input
            side. Any value greater than 0 also freezes the embeddings;
            0 leaves every parameter trainable.
    """

    def __init__(self, model, freeze, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model = model

        if freeze > 0:
            self._freeze_backbone(freeze)

        train_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        all_params = sum(p.numel() for p in self.model.parameters())
        print(f'Train {train_params / all_params:.4%} params')

    @staticmethod
    def _disable_grad(module):
        """Turn off gradients for every parameter of ``module`` whose name lacks 'classifier'."""
        for name, param in module.named_parameters():
            if 'classifier' not in name:
                param.requires_grad = False

    def _freeze_backbone(self, freeze):
        """Freeze the embeddings and the first ``freeze`` fraction of encoder layers."""
        base_model = self.model.base_model
        self._disable_grad(base_model.embeddings)

        # Hugging Face BERT stores its transformer blocks in `encoder.layer`.
        # The previous code read `encoder.layers`, which does not exist there
        # and raised AttributeError. `layers` is kept as a fallback for
        # encoders that use that attribute name.
        encoder = base_model.encoder
        encoder_layers = getattr(encoder, 'layer', None)
        if encoder_layers is None:
            encoder_layers = encoder.layers

        layers_size = len(encoder_layers)
        freeze_layers = int(layers_size * freeze)
        print(f'Freeze {freeze_layers}/{layers_size}')

        self._disable_grad(encoder_layers[:freeze_layers])

    def forward(self, *args, **kwargs):
        """Delegate directly to the wrapped model."""
        return self.model(*args, **kwargs)

    def _forward_batch(self, batch):
        """Run the model on a batch dict with keys ``input_ids``, ``attention_mask`` and ``labels``.

        ``token_type_ids`` is passed as ``None`` explicitly, so any value
        present in the batch is ignored.
        """
        return self(
            batch['input_ids'],
            token_type_ids=None,
            attention_mask=batch['attention_mask'],
            labels=batch['labels']
        )

    def training_step(self, batch, batch_nb):
        """Compute the training loss and log it per epoch (and per step as ``train_loss2``)."""
        loss = self._forward_batch(batch).loss
        self.log('train_loss', loss.item(), logger=True, on_step=False, on_epoch=True)
        self.log('train_loss2', loss.item(), logger=True, on_step=True, on_epoch=True)
        return {'loss': loss}

    def validation_step(self, batch, batch_nb):
        """Log the validation loss and per-batch macro F1.

        Returns:
            Tuple ``(y_true, y_pred)`` of integer numpy arrays for this batch,
            consumed by ``validation_epoch_end``.
        """
        outputs = self._forward_batch(batch)
        self.log('val_loss', outputs.loss.item(), logger=True, on_step=True, on_epoch=True)

        logits = outputs.logits.detach().cpu().numpy()
        y_pred = np.argmax(logits, axis=-1).astype(int)
        y_true = batch['labels'].to('cpu').numpy().astype(int)
        self.log('f1', f1_score(y_true, y_pred, average='macro'), logger=True)

        return y_true, y_pred

    def validation_epoch_end(self, outs):
        """Log accuracy (``acc``) and macro F1 (``f1_cum``) over the whole validation set."""
        y_true, y_pred = list(), list()
        for y_true_batch, y_pred_batch in outs:
            y_true.extend(y_true_batch)
            y_pred.extend(y_pred_batch)

        self.log('acc', accuracy_score(y_true, y_pred), logger=True)
        self.log('f1_cum', f1_score(y_true, y_pred, average='macro'), logger=True)

    def configure_optimizers(self):
        """Return AdamW over all model parameters with a fixed learning rate; no scheduler is used."""
        optimizer = AdamW(self.model.parameters(), lr=2e-5, eps=1e-8)
        return {'optimizer': optimizer}

In [None]:
# Run configuration. `freeze` is the fraction of encoder layers to freeze and
# `data_cutoff` optionally limits the number of rows read from each CSV.
params = EasyDict({
    'name': 'bert',
    'data_path': '.',
    'logger': True,
    'epochs': 3,
    'freeze': 0,
    'seed': 0,
    'batch_size': 32,
    'data_cutoff': None,
    'fast_dev_run': False,
})
seed_everything(params.seed)

config = ConfigParser()
config.read('config.ini')

logger, callbacks = False, list()
if params.logger:
    # ConfigParser.read() silently ignores a missing file, so fail here with an
    # actionable message instead of a bare KeyError('cometml').
    if 'cometml' not in config:
        raise KeyError("config.ini is missing or has no [cometml] section "
                       "(expected keys: apikey, projectname, workspace)")
    comet_config = EasyDict(config['cometml'])
    logger = CometLogger(api_key=comet_config.apikey, project_name=comet_config.projectname,
                         workspace=comet_config.workspace)
    logger.log_hyperparams(params)
    callbacks.append(LearningRateMonitor(logging_interval='epoch'))

# `dirpath` + `filename` replace the deprecated combined `filepath` argument.
model_checkpoint = ModelCheckpoint(dirpath='checkpoints', filename='{epoch:02d}-{val_loss:.4f}-{f1_cum:.4f}',
                                   save_weights_only=True, save_top_k=3, monitor='f1_cum', mode='max', period=1)
early_stop_callback = EarlyStopping(monitor='f1_cum', mode='max', min_delta=0.01, patience=7, verbose=True)
callbacks.extend([model_checkpoint, early_stop_callback])

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', do_lower_case=True)
model_backbone = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5,
                                                               output_attentions=False, output_hidden_states=False)

data_module = DatasetModule(data_dir=params.data_path, tokenizer=tokenizer, batch_size=params.batch_size,
                            cutoff=params.data_cutoff)
model = LitModule(model=model_backbone, freeze=params.freeze)

# Use the GPU when one is available and fall back to CPU otherwise, so the cell
# does not abort on a runtime without an accelerator.
trainer = Trainer(logger=logger, max_epochs=params.epochs, callbacks=callbacks,
                  gpus=1 if torch.cuda.is_available() else 0, deterministic=True, precision=32,
                  fast_dev_run=params.fast_dev_run)
trainer.fit(model, datamodule=data_module)

if params.logger:
    # Upload the retained checkpoints and the best monitored score to Comet.
    for absolute_path in model_checkpoint.best_k_models.keys():
        logger.experiment.log_model(Path(absolute_path).name, absolute_path)
    # Compare against None explicitly: a legitimate score of 0.0 is falsy.
    if model_checkpoint.best_model_score is not None:
        logger.log_metrics({'best_model_score': model_checkpoint.best_model_score.tolist()})


CometLogger will be initialized in online mode
COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/plutasnyy/emd2/92698dc0b61d494bbac599c18a182165
COMET INFO:   Parameters:
COMET INFO:     batch_size   : 32
COMET INFO:     data_cutoff  : None
COMET INFO:     data_path    : .
COMET INFO:     epochs       : 3
COMET INFO:     fast_dev_run : 1
COMET INFO:     freeze       : 1
COMET INFO:     logger       : True
COMET INFO:     name         : bert
COMET INFO:     seed         : 1
COMET INFO:   Uploads:
COMET INFO:     environment details : 1
COMET INFO:     filename            : 1
COMET INFO:     installed packages  : 1
COMET INFO:     os packages         : 1
COMET INFO: ---------------------------
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/plutasnyy/emd2/d81ecc7836c4448a890979d5426

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Train 100.0000% params


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 109 M 
--------------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params


                                          reviewText  score
0  I brought this game because I like playing Sol...      3
1  The gameplay is good, the graphics are great, ...      3
2  Again I thought this would be like the regular...      1
3  well j tried to get on it and it says that my ...      1
4  I love how everytime you want to play you can....      4


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…





HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/plutasnyy/emd2/d81ecc7836c4448a890979d542623055
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     acc [3]                      : (0.628034188034188, 0.6626540710751238)
COMET INFO:     f1 [3]                       : (0.5328658223152161, 0.5465971827507019)
COMET INFO:     f1_cum [3]                   : (0.5633245481004184, 0.5784713353371609)
COMET INFO:     train_loss [3]               : (0.7586528658866882, 0.9992276430130005)
COMET INFO:     train_loss2_epoch [3]        : (0.7586528658866882, 0.9992276430130005)
COMET INFO:     train_loss2_step [284]       : (0.48429933190345764, 1.4310840368270874)
COMET INFO:     val_loss_epoch [3]           : (0.8129626512527466, 0.9573112726211548)
COMET INFO:     val_loss_step/epoch_0 [1737] : (0.4




COMET INFO: Uploading stats to Comet before program termination (may take several seconds)
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/plutasnyy/emd2/d81ecc7836c4448a890979d542623055

