# Training Model

## Initialization

In [1]:
import pandas as pd
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
# os.environ['CUDA_VISIBLE_DEVICES'] = ''
from transformers import BertConfig, BertTokenizer
from transformers import TFBertModel, TFBertForSequenceClassification
from transformers import glue_convert_examples_to_features

import tensorflow as tf
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score


## Loading data

In [2]:
# Load the prepared comments spreadsheet; its first column is used as the index.
df = pd.read_excel("ready_data.xlsx", index_col=0)
# Class names; the position in this list becomes the class id in the label2id cell.
# NOTE(review): presumably matches the integer `status` column (0 = rejected) - confirm.
labels = ["rejected", "published"]
df.head(3)

Unnamed: 0,comment,status
0,نگاه این روانی تیمارستانی کنید ب بی تی اس میگه...,0
1,این یکی عربه میتونه سیرش کنه خخخخ منظورم رو که...,0
2,دولتی که فسادو رانت خواری تمامش رافراگرفته ازو...,1


In [3]:
# Lookup tables between class names and integer class ids (id = position in `labels`).
label2id = dict(zip(labels, range(len(labels))))
id2label = {idx: name for name, idx in label2id.items()}

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

label2id: {'rejected': 0, 'published': 1}
id2label: {0: 'rejected', 1: 'published'}


## Train

### Train Valid Test Split

In [4]:
# Hold out 10% of the data as the test split, then 10% of the remainder as the
# validation split; both splits are stratified on the `status` column.
train, test = train_test_split(df, test_size=0.1, random_state=1, stratify=df['status'])
train, valid = train_test_split(train, test_size=0.1, random_state=1, stratify=train['status'])

# Drop the shuffled original index so each split is numbered from zero.
train, valid, test = (part.reset_index(drop=True) for part in (train, valid, test))

# Plain Python lists of texts and labels for each split.
x_train, y_train = train['comment'].to_numpy().tolist(), train['status'].to_numpy().tolist()
x_valid, y_valid = valid['comment'].to_numpy().tolist(), valid['status'].to_numpy().tolist()
x_test, y_test = test['comment'].to_numpy().tolist(), test['status'].to_numpy().tolist()

for part in (train, valid, test):
    print(part.shape)

(15329, 2)
(1704, 2)
(1893, 2)


### Model Config

In [5]:
# --- Sequence length and batch sizes -----------------------------------------
MAX_LEN = 128
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = TEST_BATCH_SIZE = 1

# --- Optimisation settings ----------------------------------------------------
EPOCHS = 3
EEVERY_EPOCH = 1000
LEARNING_RATE = 2e-5
CLIP = 0.0

# --- Checkpoint source and destination ---------------------------------------
MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'
OUTPUT_PATH = './model/bert-fa-base-uncased-sentiment-taaghceh/sample_comments.bin'

# Create the directory part of OUTPUT_PATH up front (no error if it already exists).
os.makedirs(os.path.split(OUTPUT_PATH)[0], exist_ok=True)

In [6]:
# The tokenizer and the model configuration are both loaded from MODEL_NAME_OR_PATH.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)

# The label mappings are passed as extra keyword arguments so that they become
# part of the configuration object.
config = BertConfig.from_pretrained(
    MODEL_NAME_OR_PATH,
    label2id=label2id,
    id2label=id2label,
)

print(config.to_json_string())

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "rejected",
    "1": "published"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "published": 1,
    "rejected": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 100000
}



### Embeddings (tokenization and input features)

In [7]:
class InputExample:
    """Plain record holding one text sample (optionally a text pair) and its label."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the identifier, the text segment(s) and the optional label as attributes."""
        self.guid, self.text_a, self.text_b, self.label = guid, text_a, text_b, label


def make_examples(tokenizer, x, y=None, maxlen=128, output_mode="classification", is_tf_dataset=True):
    """Convert raw texts (and optional labels) into tokenized model inputs.

    Args:
        tokenizer: Tokenizer passed on to ``glue_convert_examples_to_features``.
        x: Sequence of inputs. Each item is either a single string or a
            two-item sequence ``(text_a, text_b)``.
        y: Optional ``list`` / ``np.ndarray`` of labels parallel to ``x``; each
            label is cast with ``int``. Any other value (e.g. ``None``) means
            the inputs are unlabeled.
        maxlen: Maximum length passed on to the feature converter.
        output_mode: Passed on to the feature converter unchanged.
        is_tf_dataset: Selects the format of the first returned item.

    Returns:
        A pair ``(data, features)``. ``features`` is whatever
        ``glue_convert_examples_to_features`` returned. ``data`` is:

        * when ``is_tf_dataset`` is true: a ``tf.data.Dataset`` whose elements
          are ``(inputs, label)`` with ``inputs`` a dict keyed by
          ``input_ids`` / ``attention_mask`` / ``token_type_ids``; for
          unlabeled input the elements are the ``inputs`` dict alone;
        * otherwise: ``[xdata, ydata]`` where ``xdata`` is a list of three
          arrays (input ids, attention masks, token type ids) and ``ydata``
          is the list of feature labels (``None`` entries when unlabeled).

    Raises:
        AssertionError: If a non-string item of ``x`` does not have length 2.
    """
    has_labels = isinstance(y, (list, np.ndarray))
    raw_labels = y if has_labels else [None] * len(x)

    examples = []
    for i, (_x, _y) in enumerate(zip(x, raw_labels)):
        # Unlabeled inputs keep label=None; int(None) would raise TypeError.
        label = None if _y is None else int(_y)

        if isinstance(_x, str):
            text_a, text_b = _x, None
        else:
            assert len(_x) == 2
            text_a, text_b = _x[0], _x[1]

        examples.append(InputExample(guid="%s" % i, text_a=text_a, text_b=text_b, label=label))

    # Distinct labels in ascending order, taken from the already-cast example
    # labels so the entries compare equal to ``example.label``.
    # NOTE(review): the list is derived from this call's data only, so a split
    # that lacks one class would be numbered differently - confirm acceptable.
    label_list = sorted({ex.label for ex in examples if ex.label is not None})

    features = glue_convert_examples_to_features(
        examples,
        tokenizer,
        maxlen,
        output_mode=output_mode,
        label_list=label_list)

    all_input_ids = [f.input_ids for f in features]
    all_attention_masks = [f.attention_mask for f in features]
    all_token_type_ids = [f.token_type_ids for f in features]
    all_labels = [f.label for f in features]

    if is_tf_dataset:
        inputs = {
            'input_ids': all_input_ids,
            'attention_mask': all_attention_masks,
            'token_type_ids': all_token_type_ids
        }
        # Without labels there is nothing to pair the inputs with, and None
        # cannot be turned into a tensor, so only the inputs are sliced.
        tensors = (inputs, all_labels) if has_labels else inputs
        dataset = tf.data.Dataset.from_tensor_slices(tensors)

        return dataset, features

    xdata = [np.array(all_input_ids), np.array(all_attention_masks), np.array(all_token_type_ids)]
    ydata = all_labels

    return [xdata, ydata], features

In [8]:
# Encode every split with the sequence length from the config cell (MAX_LEN)
# instead of repeating the literal value here.
train_dataset_base, train_examples = make_examples(tokenizer, x_train, y_train, maxlen=MAX_LEN)
valid_dataset_base, valid_examples = make_examples(tokenizer, x_valid, y_valid, maxlen=MAX_LEN)

# The test split is encoded twice: once as a tf.data.Dataset and once as plain
# arrays / label list (xtest, ytest).
test_dataset_base, test_examples = make_examples(tokenizer, x_test, y_test, maxlen=MAX_LEN)
[xtest, ytest], test_examples = make_examples(tokenizer, x_test, y_test, maxlen=MAX_LEN, is_tf_dataset=False)



In [9]:
# Show the first encoded training example: the three model inputs and its label.
for encoded, target in train_dataset_base.take(1):
    print(f'     input_ids: {encoded["input_ids"]}')
    print(f'attention_mask: {encoded["attention_mask"]}')
    print(f'token_type_ids: {encoded["token_type_ids"]}')
    print(f'        target: {target}')

     input_ids: [    2  8969  3274  5899 34634  4241  4338  1379  3826  3125  3556  3486
  3274  3660  6853  7693 66773  4313 32196 10652 14114     4     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
attention_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [10]:
def get_training_dataset(dataset, batch_size, shuffle_buffer=2048):
    """Build the training input pipeline: shuffle, repeat indefinitely, batch.

    Shuffling is applied before ``repeat`` so that elements from consecutive
    passes over the data are not mixed inside the shuffle buffer; every
    element of one pass is emitted before the next pass starts.

    Args:
        dataset: Object exposing ``shuffle``, ``repeat`` and ``batch``
            (a ``tf.data.Dataset`` in this notebook).
        batch_size: Number of elements per batch.
        shuffle_buffer: Size of the shuffle buffer. Defaults to 2048.

    Returns:
        The shuffled, endlessly repeating, batched dataset. Because it never
        ends, the consumer has to bound each epoch itself.
    """
    dataset = dataset.shuffle(shuffle_buffer)
    dataset = dataset.repeat()
    dataset = dataset.batch(batch_size)

    return dataset

def get_validation_dataset(dataset, batch_size):
    """Return ``dataset`` grouped into batches of ``batch_size``; nothing else is applied."""
    return dataset.batch(batch_size)

In [11]:
train_dataset = get_training_dataset(train_dataset_base, TRAIN_BATCH_SIZE)
# Validation uses the plain batching pipeline: the training pipeline repeats
# and shuffles, so a fixed number of steps drawn from it would not correspond
# to one pass over the validation examples.
valid_dataset = get_validation_dataset(valid_dataset_base, VALID_BATCH_SIZE)

train_steps = len(train_examples) // TRAIN_BATCH_SIZE
# Ceiling division so a final partial batch is also evaluated.
valid_steps = -(-len(valid_examples) // VALID_BATCH_SIZE)

train_steps, valid_steps

(15329, 1704)

### Model

In [12]:
def build_model(model_name, config, learning_rate=3e-5):
    """Load a BERT sequence classifier and compile it for training.

    Args:
        model_name: Checkpoint name or path given to ``from_pretrained``.
        config: Model configuration given to ``from_pretrained``.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        The compiled ``TFBertForSequenceClassification`` instance.
    """
    classifier = TFBertForSequenceClassification.from_pretrained(model_name, config=config)
    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        # The loss is configured with from_logits=True, i.e. for unnormalised scores.
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
    )

    return classifier

In [13]:
# Local checkpoint used to initialise the classifier. The location can be
# overridden with the BERT_WEIGHTS_PATH environment variable so the notebook is
# not tied to one machine's directory layout. MODEL_NAME_OR_PATH is left
# untouched so the tokenizer/config cell can still be re-run afterwards.
PRETRAINED_WEIGHTS_PATH = os.environ.get(
    "BERT_WEIGHTS_PATH",
    "/home/reza/Desktop/aasaam/comment-classification/comment_classification/R&D/model/tf_model.h5",
)
# When the local file is missing, fall back to the checkpoint named in the config cell.
if not os.path.isfile(PRETRAINED_WEIGHTS_PATH):
    print(f"{PRETRAINED_WEIGHTS_PATH} not found; loading {MODEL_NAME_OR_PATH} instead")
    PRETRAINED_WEIGHTS_PATH = MODEL_NAME_OR_PATH
model = build_model(PRETRAINED_WEIGHTS_PATH, config, learning_rate=LEARNING_RATE)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at /home/reza/Desktop/aasaam/comment-classification/comment_classification/R&D/model/tf_model.h5 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Training

In [14]:
%%time
# Fine-tune for EPOCHS epochs. Step counts are passed explicitly because
# `train_dataset` is built with .repeat() and therefore never ends on its own.
r = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    steps_per_epoch=train_steps,
    validation_steps=valid_steps,
    epochs=EPOCHS,
    verbose=1)

# The printed value is the mean of the recorded 'val_accuracy' history entries,
# not the value of the last entry alone.
final_accuracy = r.history['val_accuracy']
print('FINAL ACCURACY MEAN: ', np.mean(final_accuracy))

Epoch 1/3


ResourceExhaustedError: Graph execution error:

Detected at node 'Adam/Adam/update/mul_1' defined at (most recent call last):
    File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/traitlets/config/application.py", line 976, in launch_instance
      app.start()
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "/usr/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
      self._run_once()
    File "/usr/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
      handle._run()
    File "/usr/lib/python3.8/asyncio/events.py", line 81, in _run
      self._context.run(self._callback, *self._args)
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2881, in run_cell
      result = self._run_cell(
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2936, in _run_cell
      return runner(coro)
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3135, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3338, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3398, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_25497/398553755.py", line 1, in <cell line: 1>
      get_ipython().run_cell_magic('time', '', "r = model.fit(\n    train_dataset,\n    validation_data=valid_dataset,\n    steps_per_epoch=train_steps,\n    validation_steps=valid_steps,\n    epochs=EPOCHS,\n    verbose=1)\n\nfinal_accuracy = r.history['val_accuracy']\nprint('FINAL ACCURACY MEAN: ', np.mean(final_accuracy))\n")
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2358, in run_cell_magic
      result = fn(*args, **kwargs)
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/IPython/core/magics/execution.py", line 1316, in time
      exec(code, glob, local_ns)
    File "<timed exec>", line 1, in <module>
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/keras/engine/training.py", line 1409, in fit
      tmp_logs = self.train_function(iterator)
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/keras/engine/training.py", line 1051, in train_function
      return step_function(self, iterator)
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/keras/engine/training.py", line 1040, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/keras/engine/training.py", line 1030, in run_step
      outputs = model.train_step(data)
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/transformers/modeling_tf_utils.py", line 1154, in train_step
      self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/keras/optimizers/optimizer_v2/optimizer_v2.py", line 539, in minimize
      return self.apply_gradients(grads_and_vars, name=name)
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/keras/optimizers/optimizer_v2/optimizer_v2.py", line 678, in apply_gradients
      return tf.__internal__.distribute.interim.maybe_merge_call(
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/keras/optimizers/optimizer_v2/optimizer_v2.py", line 723, in _distributed_apply
      update_op = distribution.extended.update(
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/keras/optimizers/optimizer_v2/optimizer_v2.py", line 701, in apply_grad_to_update_var
      return self._resource_apply_sparse_duplicate_indices(
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/keras/optimizers/optimizer_v2/optimizer_v2.py", line 1326, in _resource_apply_sparse_duplicate_indices
      return self._resource_apply_sparse(summed_grad, handle, unique_indices,
    File "/home/reza/Desktop/aasaam/comment-classification/.venv/lib/python3.8/site-packages/keras/optimizers/optimizer_v2/adam.py", line 206, in _resource_apply_sparse
      m_t = tf.compat.v1.assign(m, m * coefficients['beta_1_t'],
Node: 'Adam/Adam/update/mul_1'
failed to allocate memory
	 [[{{node Adam/Adam/update/mul_1}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_100016]

In [None]:
# save the model
# The target is the directory part of OUTPUT_PATH (created in the config cell);
# the file-name component of OUTPUT_PATH is not used by this call.
model.save_pretrained(os.path.dirname(OUTPUT_PATH))

### Evaluation / Prediction

In [None]:
# Evaluate on the test split, batched with TEST_BATCH_SIZE.
test_batches = test_dataset_base.batch(TEST_BATCH_SIZE)
ev = model.evaluate(test_batches)
print()
print(f'Evaluation: {ev}')
print()

# Predict from the array form of the test inputs; the first element of the
# prediction output is reduced with argmax over the last axis to get class ids.
predictions = model.predict(xtest)
ypred = np.argmax(predictions[0], axis=-1).tolist()

print()
report = classification_report(ytest, ypred, target_names=labels)
print(report)
print()

weighted_f1 = f1_score(ytest, ypred, average="weighted")
print(f'F1: {weighted_f1}')


Evaluation: [0.8027344346046448, 0.7485472559928894]


              precision    recall  f1-score   support

    rejected       0.76      0.73      0.74       946
   published       0.74      0.77      0.75       947

    accuracy                           0.75      1893
   macro avg       0.75      0.75      0.75      1893
weighted avg       0.75      0.75      0.75      1893


F1: 0.7484204859467789


In [None]:
from sklearn.metrics import confusion_matrix

# labels=[0, 1] pins the matrix to 2x2 even if one class is absent from both
# ytest and ypred; without it the four-way unpacking below would fail.
tn, fp, fn, tp = confusion_matrix(ytest, ypred, labels=[0, 1]).ravel()
# Table layout: rows are the predicted class, columns the actual class;
# "Positive" is class id 1 and "Negative" is class id 0.
print("\t\tPositive\tNegative")
print(f"Positive\tTP={tp}\t\tFP={fp}")
print(f"Negative\tFN={fn}\t\tTN={tn}")



		Positive	Negative
Positive	TP=730		FP=259
Negetive	FN=217		TN=687
