In [1]:
import os

# Keras picks its backend from KERAS_BACKEND when the package is first imported,
# so the variable has to be set *before* `keras` / `keras_nlp` are imported.
# Setting it afterwards (as this cell originally did) has no effect.
os.environ['KERAS_BACKEND'] = 'tensorflow'

import numpy as np
import pandas as pd
import tensorflow as tf
import wandb

import keras
import keras_nlp



2024-03-31 10:44:10.616762: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-31 10:44:10.616855: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-31 10:44:10.786459: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Splitting data

In [2]:
# Load the cleaned training tweets and drop any row that has a missing value.
full_train_df = pd.read_csv("/kaggle/input/cleaned-tweets-disaster/cleaned_train.csv").dropna()

# Shuffle all rows once, with a fixed seed so the split is reproducible.
shuffled_train_df = full_train_df.sample(frac=1,
                                         random_state=42,
                                         replace=False)

# Positional boundaries of the split: [0, TRAIN_END) is train,
# [TRAIN_END, VAL_END) is validation, everything after VAL_END is the held-out test split.
TRAIN_END = 6000
VAL_END = 6800

train_data_split_df = shuffled_train_df[:TRAIN_END]
val_data_split_df = shuffled_train_df[TRAIN_END:VAL_END]
test_data_split_df = shuffled_train_df[VAL_END:]

# Keep the ids belonging to each split.
train_split_ids = train_data_split_df["id"]
val_split_ids = val_data_split_df["id"]
# Fixed: this previously read the ids from the validation split (copy-paste slip).
test_split_ids = test_data_split_df["id"]


print("Train  split shape: ", train_data_split_df.shape)
print("Validation split shape: ", val_data_split_df.shape)
print("Test  split shape: ", test_data_split_df.shape)

Train  split shape:  (6000, 3)
Validation split shape:  (800, 3)
Test  split shape:  (810, 3)


In [3]:
# Pull the model inputs (the "text_cleaned" column) and the labels (the "target"
# column) out of each split as plain arrays.
X_train = train_data_split_df["text_cleaned"].values
y_train = train_data_split_df["target"].values

X_val = val_data_split_df["text_cleaned"].values
y_val = val_data_split_df["target"].values

X_test = test_data_split_df["text_cleaned"].values
y_test = test_data_split_df["target"].values

In [5]:
# KerasNLP preset name shared by the preprocessor and the classifier.
PRETRAINED_MODEL = "distil_bert_base_en_uncased"

# Preprocessor created from the preset, configured with sequence_length=120.
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
    PRETRAINED_MODEL,
    sequence_length=120,
    name="bert_preprocessor",
)

# Two-class classifier created from the same preset, with the preprocessor attached.
# Per the original author's note, the classifier outputs logits.
bert_classifier = keras_nlp.models.DistilBertClassifier.from_preset(
    PRETRAINED_MODEL,
    preprocessor=preprocessor,
    num_classes=2,
)


Attaching 'tokenizer.json' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'tokenizer.json' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'assets/tokenizer/vocabulary.txt' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'model.weights.h5' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...


In [6]:
# Print the layer/parameter summary of the classifier built above.
bert_classifier.summary()

In [7]:

# Training configuration, kept in module-level names because TRIAL_CONFIG
# (defined in a later cell) records the same objects.
LOSS = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)  # loss is applied to raw logits
OPTIMIZER = tf.keras.optimizers.Adam(learning_rate=1e-5)
METRICS = ["accuracy"]

bert_classifier.compile(optimizer=OPTIMIZER, loss=LOSS, metrics=METRICS)

### Preparing callbacks for Training

In [8]:
class WandbLoggerCallback(tf.keras.callbacks.Callback):
    """Keras callback that forwards training metrics to Weights & Biases.

    After every batch the running ``loss`` / ``accuracy`` are logged as
    ``batch_loss`` / ``batch_accuracy``. After every epoch the epoch-level
    ``loss`` / ``accuracy`` and, when present, ``val_loss`` / ``val_accuracy``
    are logged.

    Metrics that are absent from ``logs`` are skipped rather than raising, so
    the callback also works when ``fit`` is run without validation data or
    without the accuracy metric.
    """

    def on_batch_end(self, epoch, logs=None):
        """Log the per-batch loss and accuracy.

        Args:
            epoch: Index passed by Keras for this hook. It is the *batch* index;
                the parameter keeps its original name for compatibility.
            logs: Metric dict supplied by Keras; may be ``None``.
        """
        logs = logs or {}
        batch_metrics = {f"batch_{name}": logs[name]
                         for name in ("loss", "accuracy")
                         if name in logs}
        if batch_metrics:
            wandb.log(batch_metrics)

    def on_epoch_end(self, epoch, logs=None):
        """Log the per-epoch training and validation metrics.

        Args:
            epoch: Index of the epoch that just finished.
            logs: Metric dict supplied by Keras; may be ``None``.
        """
        logs = logs or {}
        # (key in `logs`, key sent to wandb) -- wandb key names are unchanged.
        key_map = (("loss", "epoch_loss"),
                   ("accuracy", "epoch_accuracy"),
                   ("val_loss", "val_loss"),
                   ("val_accuracy", "val_accuracy"))
        epoch_metrics = {wandb_key: logs[log_key]
                         for log_key, wandb_key in key_map
                         if log_key in logs}
        if epoch_metrics:
            wandb.log(epoch_metrics)
            print("\nMETRIC LOGGED")


# Backup/restore callback that keeps its files under train_backups/.
backup_restore_callback = keras.callbacks.BackupAndRestore(
    backup_dir="train_backups/",
)

# Checkpoint callback: saves the full model (not only the weights) and only when
# the monitored val_accuracy improves; the achieved value is embedded in the file name.
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath="train_checkpoints/bert_checkpoint_{val_accuracy:.4f}.keras",
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=False,
    verbose=1,
)

# Metric logger for Weights & Biases.
# NOTE(review): the fit() call in this notebook does not pass this callback -- confirm that is intended.
wandb_logger = WandbLoggerCallback()



        

In [9]:

# Metadata describing this training trial: pipeline labels, preset name,
# training objects, framework, and the shapes of the three data splits.
# NOTE(review): the pipeline labels say "DEBERT" and "SIGMOID", while the notebook
# builds a DistilBertClassifier and takes an argmax over two logits -- confirm the labels.
TRIAL_CONFIG = dict(
    pipeline=["DEBERT_CLASSIFIER", "LOGITS -> SIGMOID"],
    pretrained_model=PRETRAINED_MODEL,
    train_params=dict(
        optimizer=OPTIMIZER,
        metrics=METRICS,
        loss=LOSS,
    ),
    framework="keras_nlp",
    data_split=[
        train_data_split_df.shape,
        val_data_split_df.shape,
        test_data_split_df.shape,
    ],
)



In [10]:

# Fine-tune the classifier on the training split, evaluating on the validation
# split after each epoch. Checkpointing and backup/restore run as callbacks.
bert_classifier.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=6,
    validation_data=(X_val, y_val),
    callbacks=[checkpoint_callback, backup_restore_callback],
)




Epoch 1/6


I0000 00:00:1711881972.279381      89 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1711881972.319675      89 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 157ms/step - accuracy: 0.7553 - loss: 0.5362

W0000 00:00:1711882034.552448      90 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update



Epoch 1: val_accuracy improved from -inf to 0.79750, saving model to train_checkpoints/bert_checkpoint_0.7975.keras
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 184ms/step - accuracy: 0.7554 - loss: 0.5360 - val_accuracy: 0.7975 - val_loss: 0.4514
Epoch 2/6
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 164ms/step - accuracy: 0.8429 - loss: 0.3896
Epoch 2: val_accuracy did not improve from 0.79750
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 180ms/step - accuracy: 0.8430 - loss: 0.3896 - val_accuracy: 0.7925 - val_loss: 0.4718
Epoch 3/6
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step - accuracy: 0.8753 - loss: 0.3250
Epoch 3: val_accuracy did not improve from 0.79750
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 178ms/step - accuracy: 0.8753 - loss: 0.3250 - val_accuracy: 0.7937 - val_loss: 0.5284
Epoch 4/6
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

<keras.src.callbacks.history.History at 0x7d3d85533130>

In [11]:
# Load the best checkpoint written by ModelCheckpoint.
# The file name embeds the achieved val_accuracy
# ("bert_checkpoint_{val_accuracy:.4f}.keras"), which changes from run to run, so
# the best file is discovered from the checkpoint directory instead of hard-coding
# one run's score.
CHECKPOINT_DIR = "train_checkpoints"
CHECKPOINT_PREFIX = "bert_checkpoint_"
CHECKPOINT_SUFFIX = ".keras"


def _checkpoint_val_accuracy(file_name):
    """Return the val_accuracy encoded in a checkpoint file name as a float."""
    return float(file_name[len(CHECKPOINT_PREFIX):-len(CHECKPOINT_SUFFIX)])


checkpoint_file_names = [
    file_name
    for file_name in os.listdir(CHECKPOINT_DIR)
    if file_name.startswith(CHECKPOINT_PREFIX) and file_name.endswith(CHECKPOINT_SUFFIX)
]
if not checkpoint_file_names:
    raise FileNotFoundError(f"No checkpoints found in {CHECKPOINT_DIR!r}; run training first.")

best_checkpoint_path = os.path.join(
    CHECKPOINT_DIR, max(checkpoint_file_names, key=_checkpoint_val_accuracy)
)
best_bert_checkpoint_model = keras.models.load_model(best_checkpoint_path)

  instance.compile_from_config(compile_config)
  trackable.load_own_variables(weights_store.get(inner_path))


In [16]:
# Load the competition test set; any missing value is replaced with the
# placeholder string "missing " instead of dropping the row.
raw_test_df = pd.read_csv("/kaggle/input/cleaned-tweets-disaster/cleaned_test.csv")
test_df = raw_test_df.fillna("missing ")


In [17]:
# Show column names, non-null counts and dtypes of the loaded test set.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            3263 non-null   int64 
 1   text_cleaned  3263 non-null   object
dtypes: int64(1), object(1)
memory usage: 51.1+ KB


In [20]:
# Run the restored model on the test texts, then take the index of the largest
# value along axis 1 of its output as the predicted class.
test_texts = test_df["text_cleaned"].values
test_model_outputs = best_bert_checkpoint_model.predict(test_texts)
test_predictions = tf.argmax(test_model_outputs, axis=1)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 100ms/step


In [22]:
# Assemble the submission table: one row per test id with its predicted target.
submission_columns = {
    "id": test_df["id"],
    "target": test_predictions,
}
submission_df = pd.DataFrame(submission_columns)

In [23]:
# Preview the first rows of the submission table.
submission_df.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1


In [29]:
# Write the submission to a CSV file in the working directory, without the index column.
submission_path = "distill_bert_submission.csv"
submission_df.to_csv(submission_path, index=False)

In [28]:
# List the working directory to confirm the submission file was written.
!ls

distill_bert_submission.csv  train_checkpoints
