---
title: ""
author: "Yurkov Sergey"
date: "2024-01-01"
categories: [python]
format: html
draft: true
---


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

# NOTE(review): the TF Hub BERT preprocessing model used further down normally
# needs the TensorFlow Text custom ops registered by the import below --
# confirm it is installed and re-enable this line if loading the layer fails.
# import tensorflow_text as text

# Record the TensorFlow version this notebook runs against.
print(tf.__version__)

In [None]:
# Load the labelled dataset; the "toxic" column is read as int8.
df = pd.read_csv(
    "archive/labeled.csv",
    dtype={"toxic": np.int8},
)

df


In [None]:
%%capture
# !pip install imbalanced-learn

In [None]:
# Split the rows by label. The variable names are kept as they were:
# "nondefaults" holds the non-toxic rows (toxic == 0) and "defaults" holds the
# toxic rows (toxic == 1).
nondefaults = df[df["toxic"] == 0]
defaults = df[df["toxic"] == 1]

# Undersample the non-toxic rows down to the number of toxic rows. The fixed
# random_state makes the draw repeatable across kernel restarts.
# Assumes there are at least as many non-toxic rows as toxic rows; otherwise
# sample() raises because it draws without replacement.
nondefaults_under = nondefaults.sample(n=len(defaults), random_state=0)

# Stack both classes. ignore_index=True gives the result one fresh, unique
# 0..N-1 index instead of two overlapping ranges.
df_balanced = pd.concat([nondefaults_under, defaults], axis=0, ignore_index=True)

# Print the per-class row counts of the balanced frame.
print(df_balanced["toxic"].value_counts())


In [None]:
# Eyeball five randomly chosen rows of the balanced frame.
df_balanced.sample(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# Features are the raw comment strings; the target is the binary "toxic" label.
X = df_balanced["comment"]
y = df_balanced["toxic"]

# Hold out 20% of the rows for evaluation. stratify=y keeps the class ratio of
# the balanced frame identical in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

# Per-class row counts of the training split.
y_train.value_counts()


In [None]:
# TF Hub handles used when building the classifier: the text preprocessing
# model and the multilingual cased BERT encoder it feeds.
mbert_preprocess = "https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3"
mbert = "https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4"


In [None]:
def build_classifier_model():
    """Build a binary text classifier on top of a frozen mBERT encoder.

    The model takes a batch of raw strings, passes them through the TF Hub
    preprocessing layer loaded from ``mbert_preprocess`` and the encoder
    loaded from ``mbert`` (with ``trainable=False``), applies dropout to the
    encoder's ``pooled_output`` and ends in a single sigmoid unit.

    Returns:
        tf.keras.Model: maps a string tensor of shape ``(batch,)`` to one
        float32 sigmoid output per input string.
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="text")

    # Raw strings -> the input dict expected by the encoder.
    preprocessing_layer = hub.KerasLayer(mbert_preprocess, name="preprocessing")
    encoder_inputs = preprocessing_layer(text_input)

    # The encoder weights stay frozen; only the layers added below are trained.
    encoder = hub.KerasLayer(mbert, trainable=False, name="mBERT_encoder")
    outputs = encoder(encoder_inputs)

    net = outputs["pooled_output"]
    net = tf.keras.layers.Dropout(0.1)(net)
    # Keep the output layer in float32: a float16 sigmoid feeding binary
    # cross-entropy loses precision near 0 and 1 and can destabilise the loss.
    net = tf.keras.layers.Dense(
        1, dtype="float32", activation="sigmoid", name="classifier"
    )(net)

    return tf.keras.Model(text_input, net)


In [None]:
# Instantiate the classifier (this loads both TF Hub layers).
classifier_model = build_classifier_model()

# Print the layer-by-layer overview of the model.
classifier_model.summary()


In [None]:
# Smoke test: run one randomly drawn test comment through the freshly built
# model and show the raw sigmoid output.
mbert_raw_result = classifier_model(X_test.sample(n=1))

mbert_raw_result


In [None]:
# !pip install pydot graphviz

# tf.keras.utils.plot_model(classifier_model)


In [None]:
# The model ends in a sigmoid, so the loss receives probabilities, not logits.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
metrics = tf.metrics.BinaryAccuracy()

# epochs = 5  # for full training on Colab only
epochs = 1

# Keras' fit() uses batches of 32 when no batch_size is passed.
batch_size = 32
# Optimizer steps (batches) per epoch, rounded up so a final partial batch
# counts. Using the raw row count here would inflate the schedule lengths
# below by a factor of batch_size.
steps_per_epoch = -(-X_train.shape[0] // batch_size)
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1 * num_train_steps)
init_lr = 3e-5

# for full training on Colab only
# NOTE(review): `optimization` is not imported in the visible notebook --
# confirm where it should come from before enabling this block.
# optimizer = optimization.create_optimizer(
#     init_lr=init_lr,
#     num_train_steps=num_train_steps,
#     num_warmup_steps=num_warmup_steps,
#     optimizer_type="adamw",
# )
optimizer = "adam"


In [None]:
# Attach the loss, metric and optimizer configured above to the model.
classifier_model.compile(
    loss=loss,
    metrics=metrics,
    optimizer=optimizer,
)


In [None]:
# Train on the training split; the held-out split is evaluated after each epoch.
history = classifier_model.fit(
    X_train,
    y_train,
    epochs=epochs,
    validation_data=(X_test, y_test),
)
