In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import joblib
import keras
import keras_nlp



2024-03-31 22:22:38.899912: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-31 22:22:38.900048: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-31 22:22:39.027987: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:

#load clean data
full_train_df = pd.read_csv("/kaggle/input/cleaned-nlp-tweets/cleaned_train.csv")
full_train_df.dropna(inplace=True)


print("Train Data shape :", full_train_df.shape)
full_train_df.head()

Train Data shape : (7610, 3)


Unnamed: 0,id,text_cleaned,target
0,1,deeds reason may allah forgive us,1
1,4,forest fire near la ronge sask canada,1
2,5,residents asked shelter place notified officer...,1
3,6,people receive evacuation orders california,1
4,7,got sent photo ruby smoke pours school,1


In [4]:
full_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7610 entries, 0 to 7612
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            7610 non-null   int64 
 1   text_cleaned  7610 non-null   object
 2   target        7610 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 237.8+ KB


### Splitting data

In [5]:
shuffled_train_df = full_train_df.sample(frac=1,
                                    random_state=42,
                                    replace=False)



train_data_split_df = shuffled_train_df[ : 6000]
val_data_split_df = shuffled_train_df[6000 : 6800]
test_data_split_df = shuffled_train_df[6800: ]

train_split_ids = train_data_split_df["id"]
val_split_ids = val_data_split_df["id"]
test_split_ids = val_data_split_df["id"]


print("Train  split shape: ", train_data_split_df.shape)
print("Validation split shape: ", val_data_split_df.shape)
print("Test  split shape: ", test_data_split_df.shape)

Train  split shape:  (6000, 3)
Validation split shape:  (800, 3)
Test  split shape:  (810, 3)


### Establish Baseline: TF-iDF + Logistic Regression

In [6]:
TF_MAX_FEATURES = 2000
TF_MAX_DF=0.93
TF_MIN_DF=0.00001


In [7]:

tfidf_vectorizer = TfidfVectorizer(
                             max_features=TF_MAX_FEATURES,
                             max_df=TF_MAX_DF,
                             min_df=TF_MIN_DF,
                             )


In [8]:
tfidf_vectorizer.fit(train_data_split_df["text_cleaned"])

In [9]:
train_tfidf_matrix  = tfidf_vectorizer.transform(train_data_split_df["text_cleaned"]).toarray()
val_tfidf_matrix = tfidf_vectorizer.transform(val_data_split_df["text_cleaned"]).toarray()
test_tfidf_matrix = tfidf_vectorizer.transform(test_data_split_df["text_cleaned"]).toarray()

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score



logreg = LogisticRegression(random_state=42)

logreg.fit(train_tfidf_matrix, train_data_split_df["target"])

In [11]:

def evaluate_model(model, train_data, val_data, test_data):
    # Predictions
    train_preds = np.round(model.predict(train_data[0]))
    val_preds = np.round(model.predict(val_data[0]))
    test_preds = np.round(model.predict(test_data[0]))
    
    # F1 scores
    train_f1 = f1_score(train_data[1], train_preds)
    val_f1 = f1_score(val_data[1], val_preds)
    test_f1 = f1_score(test_data[1], test_preds)
    
    # Accuracy scores
    train_acc = accuracy_score(train_data[1], train_preds)
    val_acc = accuracy_score(val_data[1], val_preds)
    test_acc = accuracy_score(test_data[1], test_preds)
    
    # Print results
    print("MODEL: ", str(model))
    print("\n======ACCURACY==========\n")
    print("Train Accuracy Score: ", train_acc)
    print("Validation Accuracy Score: ", val_acc)
    print("Test Accuracy  Score: ", test_acc)

    print("\n======F1-Score==========\n")
    print("Train F1 Score: ", train_f1)
    print("Validation F1 Score: ", val_f1)
    print("Test F1 Score: ", test_f1)


evaluate_model(logreg, 
               (train_tfidf_matrix, train_data_split_df["target"]), 
               (val_tfidf_matrix, val_data_split_df["target"]), 
               (test_tfidf_matrix, test_data_split_df["target"]))


MODEL:  LogisticRegression(random_state=42)


Train Accuracy Score:  0.8565
Validation Accuracy Score:  0.78875
Test Accuracy  Score:  0.782716049382716


Train F1 Score:  0.8154340836012862
Validation F1 Score:  0.7287319422150883
Test F1 Score:  0.7349397590361445


### Saving baseline pipeline

In [13]:
baseline_pipeline = Pipeline([
    ('TF-IDF', tfidf_vectorizer),
    ('Logistic Regression', logreg)
])


joblib.dump(baseline_pipeline, "tfidf_logreg_baseline.joblib")



['tfidf_logreg_baseline.joblib']

### Baseline 2: TF-iDf + Simple Neural Net

In [14]:

baseline_nn_model = tf.keras.Sequential([
        
        tf.keras.layers.Input(shape=(TF_MAX_FEATURES,)),
         tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid"),
       
], name="baseline_shallow_neuralnet")


baseline_nn_model.summary()

In [15]:



baseline_nn_model.compile(optimizer=tf.keras.optimizers.Adam(0.0003), 
                          metrics=["accuracy"],
                          loss="binary_crossentropy")

baseline_nn_history = baseline_nn_model.fit(train_tfidf_matrix, train_data_split_df["target"].values.reshape(-1, 1),
                       batch_size=256,
                       epochs=30,
                       validation_data=(val_tfidf_matrix, val_data_split_df["target"].values.reshape(-1, 1)),
                      )

Epoch 1/30
[1m19/24[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.4738 - loss: 0.6947 

I0000 00:00:1711923798.214196      84 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 140ms/step - accuracy: 0.4803 - loss: 0.6945 - val_accuracy: 0.5675 - val_loss: 0.6915
Epoch 2/30
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5616 - loss: 0.6917 - val_accuracy: 0.5738 - val_loss: 0.6892
Epoch 3/30
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5885 - loss: 0.6895 - val_accuracy: 0.5750 - val_loss: 0.6869
Epoch 4/30
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5850 - loss: 0.6874 - val_accuracy: 0.5750 - val_loss: 0.6843
Epoch 5/30
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5817 - loss: 0.6854 - val_accuracy: 0.5750 - val_loss: 0.6813
Epoch 6/30
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5800 - loss: 0.6820 - val_accuracy: 0.5750 - val_loss: 0.6773
Epoch 7/30
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━

In [16]:

evaluate_model(baseline_nn_model, 
               (train_tfidf_matrix, train_data_split_df["target"]), 
               (val_tfidf_matrix, val_data_split_df["target"]), 
               (test_tfidf_matrix, test_data_split_df["target"]))


[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
MODEL:  <Sequential name=baseline_shallow_neuralnet, built=True>


Train Accuracy Score:  0.8558333333333333
Validation Accuracy Score:  0.7775
Test Accuracy  Score:  0.782716049382716


Train F1 Score:  0.8133764832793959
Validation F1 Score:  0.7129032258064516
Test F1 Score:  0.7325227963525835


In [17]:
baseline_nn_model.save("baseline_nn_model.keras")

### Training LSTM 

#### TODO:
     1) train LSTM/GRU models for a while to achive slightly better performance
     2) Train distilBERT a few times and try different versions of BERT
     3) Save results
     4) Merge it all in one notebook!


### DistilledBERT finetuning

In [18]:
X_train, y_train = train_data_split_df["text_cleaned"].values, train_data_split_df["target"].values
X_val, y_val = val_data_split_df["text_cleaned"].values, val_data_split_df["target"].values
X_test, y_test = test_data_split_df["text_cleaned"].values, test_data_split_df["target"].values

In [19]:
PRETRAINED_MODEL =  "distil_bert_base_en_uncased"



preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(PRETRAINED_MODEL,
                                                             sequence_length=150,
                                                             name="bert_preprocessor")



# #output - logits
bert_classifier = keras_nlp.models.DistilBertClassifier.from_preset(PRETRAINED_MODEL,
                                                                    preprocessor=preprocessor,
                                                                      num_classes=1)


Attaching 'tokenizer.json' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'tokenizer.json' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'assets/tokenizer/vocabulary.txt' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'model.weights.h5' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...


In [20]:
bert_classifier.summary()

In [21]:

LOSS = tf.keras.losses.BinaryCrossentropy(from_logits=True)
OPTIMIZER = tf.keras.optimizers.Adam(2e-6)
METRICS = ["accuracy"]


bert_classifier.compile(
    loss=LOSS,
    optimizer=OPTIMIZER,
    metrics=METRICS
)

In [22]:
backup_restore_callback = keras.callbacks.BackupAndRestore(backup_dir="train_backups/")


checkpoint_callback = keras.callbacks.ModelCheckpoint(filepath="train_checkpoints/bert_best_checkpoint.keras",
                                      save_weights_only=False,
                                      save_best_only=True,
                                      monitor='val_accuracy',
                                      mode='max',
                                      verbose=1)

early_stopping = keras.callbacks.EarlyStopping(monitor="val_loss",
                                               patience=2,
                                                verbose=1)

In [23]:
bert_classifier.fit(X_train, y_train, 
               validation_data=(X_val, y_val),
               batch_size=32, epochs=20,
                callbacks=[checkpoint_callback, backup_restore_callback, early_stopping])




Epoch 1/20


W0000 00:00:1711923922.184962      82 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 413ms/step - accuracy: 0.5742 - loss: 0.6767

W0000 00:00:1711923999.564179      81 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1711924002.853322      84 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update



Epoch 1: val_accuracy improved from -inf to 0.75000, saving model to train_checkpoints/bert_best_checkpoint.keras
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 461ms/step - accuracy: 0.5743 - loss: 0.6765 - val_accuracy: 0.7500 - val_loss: 0.5486
Epoch 2/20
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 214ms/step - accuracy: 0.7757 - loss: 0.5129
Epoch 2: val_accuracy improved from 0.75000 to 0.78250, saving model to train_checkpoints/bert_best_checkpoint.keras
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 286ms/step - accuracy: 0.7758 - loss: 0.5128 - val_accuracy: 0.7825 - val_loss: 0.4824
Epoch 3/20
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 214ms/step - accuracy: 0.8150 - loss: 0.4399
Epoch 3: val_accuracy improved from 0.78250 to 0.78500, saving model to train_checkpoints/bert_best_checkpoint.keras
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 255ms/step - accuracy: 0.8150 -

<keras.src.callbacks.history.History at 0x7e29b0472ec0>

In [25]:
bert_classifier.save("distil_bert_tuned.keras")


In [35]:
bert_test_predictions = tf.round(tf.sigmoid(bert_classifier.predict(X_test)))
val_test_predictions = tf.round(tf.sigmoid(bert_classifier.predict(X_val)))


bert_acc_test = accuracy_score(y_test, bert_test_predictions)
bert_f1_test = f1_score(y_test, bert_test_predictions)


print("Distilled BERT finetuned:")
print("TEST ACCURACY: ", bert_acc_test)
print("TEST F1-SCORE: ", bert_f1_test)


[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 63ms/step
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 62ms/step
Distilled BERT finetuned:
TEST ACCURACY:  0.8222222222222222
TEST F1-SCORE:  0.8


### Submit predictions

In [38]:
test_df = pd.read_csv("/kaggle/input/cleaned-nlp-tweets/cleaned_test.csv").fillna("missing")

test_df.head()

Unnamed: 0,id,text_cleaned
0,0,happened terrible car crash
1,2,heard different cities stay safe everyone
2,3,forest fire spot pond geese fleeing across str...
3,9,apocalypse lighting
4,11,typhoon soudelor kills china taiwan


In [39]:
test_df["target"] = tf.round(tf.sigmoid(bert_classifier.predict(test_df["text_cleaned"])))


[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 79ms/step


In [40]:
test_df.head()

Unnamed: 0,id,text_cleaned,target
0,0,happened terrible car crash,1.0
1,2,heard different cities stay safe everyone,0.0
2,3,forest fire spot pond geese fleeing across str...,1.0
3,9,apocalypse lighting,0.0
4,11,typhoon soudelor kills china taiwan,1.0


In [41]:
test_df.drop("text_cleaned", axis=1, inplace=True)

test_df.to_csv("distillbert_submission.csv", index=False)