In [None]:
!pip install tensorflow
!pip install keras-nlp
!pip install seaborn

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import keras_nlp
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Report the versions of the two core libraries used by this notebook.
for version_label, library in (
    ("TensorFlow version:", tf),
    ("KerasNLP version:", keras_nlp),
):
    print(version_label, library.__version__)

In [None]:
# Distribution strategy shared by the rest of the notebook: the batch size is
# scaled by its replica count and the model is built inside `strategy.scope()`.
strategy = tf.distribute.MirroredStrategy()

# Number of replicas the strategy keeps in sync.
print("replicas:", strategy.num_replicas_in_sync)

In [None]:
# Replica count of the active strategy.
print("Num replicas:", strategy.num_replicas_in_sync)

# Every physical device TensorFlow can see, one per line.
print("Available devices:")
for device in tf.config.list_physical_devices():
    print(device)

In [None]:
# Directory containing the input CSV files (note the trailing slash, which
# later cells rely on when building file paths).
DATA_DIR = '/kaggle/input/contradictory-my-dear-watson/'

# Integer class id -> human-readable label name.
RESULT_DICT = dict(enumerate(("entailment", "neutral", "contradiction")))

# Print the full path of every file found under DATA_DIR.
for root, _subdirs, names in os.walk(DATA_DIR):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Read the training CSV from DATA_DIR and preview its first rows.
train_csv_path = os.path.join(DATA_DIR, "train.csv")
df_train = pd.read_csv(train_csv_path)
df_train.head()

In [None]:
# Read the test CSV from DATA_DIR and preview its first rows.
test_csv_path = os.path.join(DATA_DIR, "test.csv")
df_test = pd.read_csv(test_csv_path)
df_test.head()

In [None]:
def display_pair_of_sentence(x):
    """Print one premise/hypothesis pair together with its language and label.

    Parameters
    ----------
    x : mapping-like
        Any object indexable by the keys ``'premise'``, ``'hypothesis'``,
        ``'language'`` and ``'label'`` (for example a DataFrame row or a dict).

    Returns
    -------
    None
        The function only prints; a blank line follows each pair.

    Notes
    -----
    Every field is formatted through an f-string, so non-string values
    (``None``, NaN, numbers) are printed instead of raising ``TypeError``
    as ``"text" + value`` concatenation would.
    """
    print(f"Premise : {x['premise']}")
    print(f"Hypothesis: {x['hypothesis']}")
    print(f"Language: {x['language']}")
    print(f"Label: {x['label']}")
    print()

# Print the first ten training pairs one by one.
for _row_label, pair in df_train.head(10).iterrows():
    display_pair_of_sentence(pair)

# (rows, columns) of the training frame, shown as the cell output.
df_train.shape

In [None]:
# Horizontal bar chart of how often each label occurs, most frequent first.
f, ax = plt.subplots(figsize=(12, 4))

sns.set_color_codes("pastel")
sns.despine()

# Count once; the same ordering drives the bars, annotations and tick labels.
label_counts = df_train['label'].value_counts(ascending=False)
label_percent = label_counts / label_counts.sum() * 100

ax = sns.countplot(data=df_train, y="label", order=label_counts.index, ax=ax)

# Annotate every bar with "<count> (<percent>%)".
bar_texts = [
    f'{count} ({percent:.0f}%)'
    for count, percent in zip(label_counts, label_percent)
]
ax.bar_label(container=ax.containers[0], labels=bar_texts)

# Replace the numeric class ids on the axis with their names.
ax.set_yticklabels([RESULT_DICT[class_id] for class_id in label_counts.index])

ax.set_title("Distribution of labels in the training set")

In [None]:
# Horizontal bar chart of how many rows each language has, most frequent first.
f, ax = plt.subplots(figsize=(10, 10))

sns.set_color_codes("pastel")
sns.despine()

# Count once; the same ordering drives both the bars and their annotations.
language_counts = df_train['language'].value_counts(ascending=False)
language_percent = language_counts / language_counts.sum() * 100

ax = sns.countplot(data=df_train, y="language", order=language_counts.index, ax=ax)

# Annotate every bar with "<count> (<percent>%)".
bar_texts = [
    f'{count} ({percent:.0f}%)'
    for count, percent in zip(language_counts, language_percent)
]
ax.bar_label(container=ax.containers[0], labels=bar_texts)

ax.set_title("Distribution of languages in the training set")

In [None]:
# Character length of every premise and hypothesis.
# `.str.len()` is the vectorised pandas accessor; unlike `apply(len)` it
# yields NaN for a missing value instead of raising TypeError.
df_train["premise_length"] = df_train["premise"].str.len()
df_train["hypothesis_length"] = df_train["hypothesis"].str.len()

# Summary statistics of both length columns, shown as the cell output.
df_train[["hypothesis_length", "premise_length"]].describe()

In [None]:
# Fraction of the training frame held out for validation.
VALIDATION_SPLIT = 0.3

# Number of leading rows used for training; the rest becomes validation data.
TRAIN_SIZE = int(len(df_train) * (1 - VALIDATION_SPLIT))

# Global batch size: 16 examples for each replica of the strategy.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync

In [None]:
def split_labels(x, y):
    """Turn ``(x, y)`` into ``((x[0], x[1]), y)``.

    The first two entries of ``x`` are pulled out by index and returned as a
    2-tuple; ``y`` is passed through unchanged.
    """
    first = x[0]
    second = x[1]
    features = (first, second)
    return features, y

# Features: the two text columns side by side; targets: one-hot float32 labels
# over the three classes.
sentence_pairs = df_train[['premise', 'hypothesis']].values
one_hot_labels = keras.utils.to_categorical(df_train['label'], num_classes=3).astype('float32')

training_dataset = tf.data.Dataset.from_tensor_slices((sentence_pairs, one_hot_labels))

# Positional split: the first TRAIN_SIZE examples train the model, everything
# after them is used for validation.
val_dataset = training_dataset.skip(TRAIN_SIZE)
train_dataset = training_dataset.take(TRAIN_SIZE)

# Seed for the training shuffle so runs are repeatable.
SHUFFLE_SEED = 42

# Training pipeline.
# The method chain is wrapped in parentheses: a line that starts with
# `.batch(...)` after a completed statement is a syntax error in Python.
# Individual examples are cached *before* shuffling and batching so that every
# epoch sees a freshly shuffled order and different batch compositions, instead
# of replaying the exact same cached batches.
train_preprocessed = (
    train_dataset
    .map(split_labels, num_parallel_calls=tf.data.AUTOTUNE)
    .cache()
    .shuffle(TRAIN_SIZE, seed=SHUFFLE_SEED, reshuffle_each_iteration=True)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation pipeline: no shuffling, so the batches themselves can be cached.
# NOTE(review): drop_remainder=True discards up to BATCH_SIZE - 1 validation
# examples; kept as in the original — confirm this is intended.
val_preprocessed = (
    val_dataset
    .map(split_labels, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Show the structure (shapes and dtypes) of one element of each pipeline.
for spec_label, split_ds in (
    ("TRAIN SPEC:", train_preprocessed),
    ("VAL   SPEC:", val_preprocessed),
):
    print(spec_label, split_ds.element_spec)

In [None]:
# Sanity-check the first training batch.
# String tensors come back from `.numpy()` as UTF-8 bytes; decode them before
# slicing so non-Latin text is readable and the 80-character cut never lands
# in the middle of a multi-byte character.
for (prem, hyp), labels in train_preprocessed.take(1):
    print("Premise sample :", prem[0].numpy().decode("utf-8")[:80], "…")
    print("Hypothesis sample:", hyp[0].numpy().decode("utf-8")[:80], "…")
    print("Labels dtype    :", labels.dtype)      # should be float32
    print("Labels shape    :", labels.shape)      # (batch, 3)
    print("One-hot vector  :", labels[0].numpy()) # for example [0. 1. 0.]


In [None]:
from tensorflow.keras import layers, optimizers, losses, metrics

# Create and compile the model inside the strategy scope.
with strategy.scope():
    # Classifier loaded from the "bert_base_multi" preset with a 3-class head.
    classifier = keras_nlp.models.BertClassifier.from_preset(
        "bert_base_multi",
        num_classes=3,
    )

    # The loss is configured with from_logits=True, i.e. it treats the model
    # outputs as unnormalised scores; targets are one-hot vectors.
    classifier.compile(
        optimizer=optimizers.Adam(learning_rate=2e-5),
        loss=losses.CategoricalCrossentropy(from_logits=True),
        metrics=[metrics.CategoricalAccuracy()],
    )

    classifier.summary()

In [None]:
# Number of passes over the training pipeline.
EPOCHS = 3

# Train on the training pipeline and evaluate on the validation pipeline after
# each epoch; the returned History object is kept for later inspection.
history = classifier.fit(
    x=train_preprocessed,
    validation_data=val_preprocessed,
    epochs=EPOCHS,
)

In [None]:
# Run the trained classifier on the test premise/hypothesis columns.
test_pairs = (df_test['premise'], df_test['hypothesis'])
predictions = classifier.predict(test_pairs, batch_size=BATCH_SIZE)

In [None]:
# Predicted class per test row: index of the largest score along axis 1.
predicted_class = np.argmax(predictions, axis=1)

# Submission frame: the test ids plus the predicted class.
submission = df_test[['id']].assign(prediction=predicted_class)

submission

In [None]:
# Write the submission frame to disk without the index column.
submission.to_csv(path_or_buf="submission.csv", index=False)