In [2]:
# Download the competition archive with the Kaggle CLI and unpack it into /content/.
# NOTE(review): the CLI needs API credentials (kaggle.json in /root/.kaggle or the
# environment-variable method). Without them the download aborts and the unzip
# step then has no archive to open, as the recorded output of this cell shows.
!kaggle competitions download -c quora-insincere-questions-classification -p /content/
!unzip -q /content/quora-insincere-questions-classification.zip -d /content/


Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 4, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.12/dist-packages/kaggle/__init__.py", line 6, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.12/dist-packages/kaggle/api/kaggle_api_extended.py", line 434, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method. See setup instructions at https://github.com/Kaggle/kaggle-api/
unzip:  cannot find or open /content/quora-insincere-questions-classification.zip, /content/quora-insincere-questions-classification.zip.zip or /content/quora-insincere-questions-classification.zip.ZIP.


In [1]:
from google.colab import files
# Open Colab's interactive upload widget; the recorded run saved test.csv and train.csv.
# NOTE(review): `uploaded` and the saved CSVs are not read by any code visible in this
# part of the notebook -- confirm whether they are still needed.
uploaded = files.upload()


Saving test.csv to test.csv
Saving train.csv to train.csv


In [3]:
!pip install -q transformers datasets tensorflow scikit-learn

import numpy as np
import tensorflow as tf
from transformers import BertTokenizerFast, TFBertModel
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# ---------------- CONFIG ----------------
# Tunable settings for the whole run; everything below reads these names.
MODEL_NAME = "bert-base-uncased"  # checkpoint passed to both the tokenizer and the encoder
MAX_LEN = 128  # tokenizer max_length and the length of every Keras Input
BATCH_SIZE = 16  # batch size of the train / val / test tf.data pipelines
EPOCHS = 1     # quick test
SEED = 42  # reused for TF, NumPy, train_test_split and dataset shuffling

# Seed the global TensorFlow and NumPy generators before any data or model work.
tf.random.set_seed(SEED)
np.random.seed(SEED)

# ---------------- LOAD DATA ----------------
dataset = load_dataset("imdb")

# Take smaller subset for speed.
# Rows are drawn after a seeded shuffle instead of with a head slice: a head slice
# inherits the stored row order of the split, and if that order is grouped by label
# the subset contains a single class. That would make the stratified split
# meaningless and leave ROC-AUC undefined at evaluation time.
N_FROM_TRAIN = 5000
N_FROM_TEST = 2000


def _sample_rows(split, n_rows):
    """Return up to `n_rows` rows of `split`, chosen by a shuffle seeded with SEED."""
    return split.shuffle(seed=SEED).select(range(min(n_rows, len(split))))


train_subset = _sample_rows(dataset["train"], N_FROM_TRAIN)
test_subset = _sample_rows(dataset["test"], N_FROM_TEST)

# list(...) keeps the pooled columns as plain Python lists so `+` concatenates them.
texts = list(train_subset["text"]) + list(test_subset["text"])
labels = list(train_subset["label"]) + list(test_subset["label"])

# Fail fast here rather than with a late metric error after a full training run.
assert len(set(labels)) == 2, "subset must contain both classes"

print("Total examples:", len(texts))

# Stratified train/val/test split: 20% held out for test, then 10% of the
# remainder for validation; both splits reuse SEED for reproducibility.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=SEED, stratify=labels
)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, random_state=SEED, stratify=train_labels
)

print("Train:", len(train_texts), "Val:", len(val_texts), "Test:", len(test_texts))

# ---------------- TOKENIZER ----------------
# Tokenizer loaded for the MODEL_NAME checkpoint; tokenize() below calls it as a global.
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

def tokenize(texts, max_len=128):
    """Encode `texts` with the module-level `tokenizer`.

    Args:
        texts: The text input handed straight to the tokenizer.
        max_len: Value passed as `max_length`; padding uses the "max_length"
            strategy and truncation is enabled.

    Returns:
        The tokenizer's output, requested as NumPy tensors.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_len,
        "return_tensors": "np",
    }
    encoded = tokenizer(texts, **encode_options)
    return encoded

# Encode the three splits in train / val / test order with the shared MAX_LEN.
train_enc, val_enc, test_enc = [
    tokenize(split_texts, MAX_LEN)
    for split_texts in (train_texts, val_texts, test_texts)
]

# ---------------- DATASETS ----------------
def make_dataset(encodings, labels, batch_size=32, shuffle=False):
    """Build a batched, prefetching tf.data pipeline of (features, label) pairs.

    Args:
        encodings: Mapping-like tokenizer output; converted to a plain dict of features.
        labels: Labels sliced alongside the features; `len(labels)` sizes the
            shuffle buffer.
        batch_size: Number of examples per batch.
        shuffle: When true, shuffle with a buffer of `len(labels)` seeded by SEED.

    Returns:
        The resulting `tf.data.Dataset`.
    """
    features = dict(encodings)
    pipeline = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        pipeline = pipeline.shuffle(len(labels), seed=SEED)
    pipeline = pipeline.batch(batch_size)
    return pipeline.prefetch(tf.data.AUTOTUNE)

# Only the training pipeline is shuffled; validation and test keep their input order,
# which the later comparison of predictions against test_labels relies on.
train_ds = make_dataset(train_enc, train_labels, batch_size=BATCH_SIZE, shuffle=True)
val_ds, test_ds = [
    make_dataset(split_enc, split_labels, batch_size=BATCH_SIZE)
    for split_enc, split_labels in ((val_enc, val_labels), (test_enc, test_labels))
]

# ---------------- BUILD MODEL ----------------
# Pretrained encoder for MODEL_NAME; from_pt=True requests loading from the PyTorch
# checkpoint (the recorded log below confirms the weights came from the PyTorch model).
encoder = TFBertModel.from_pretrained(MODEL_NAME, from_pt=True)

# One int32 Keras input of length MAX_LEN per encoder field.
# NOTE(review): the input names presumably match the keys the tokenizer emits, so the
# feature dicts produced by make_dataset() can be routed by name -- confirm.
input_ids = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")
attention_mask = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name="attention_mask")
token_type_ids = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name="token_type_ids")

# Run the encoder on the symbolic inputs and keep only its pooled output.
outputs = encoder({"input_ids": input_ids,
                   "attention_mask": attention_mask,
                   "token_type_ids": token_type_ids})
pooled_output = outputs.pooler_output

# Classification head: dropout -> 128-unit ReLU layer -> dropout -> one sigmoid unit.
x = tf.keras.layers.Dropout(0.1)(pooled_output)
x = tf.keras.layers.Dense(128, activation="relu")(x)
x = tf.keras.layers.Dropout(0.1)(x)
out = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model = tf.keras.Model(inputs=[input_ids, attention_mask, token_type_ids], outputs=out)

# Adam at 2e-5 with binary cross-entropy; accuracy and AUC are tracked under the
# names "accuracy" and "auc".
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy"),
                       tf.keras.metrics.AUC(name="auc")])

model.summary()

# ---------------- TRAIN ----------------
# Fit for EPOCHS passes over train_ds, validating on val_ds; the object returned by
# fit() is kept in `history`.
history = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)

# ---------------- EVALUATE ----------------
print("Evaluating...")
results = model.evaluate(test_ds)
# Pair each compiled metric name with the value evaluate() returned for it.
print({metric_name: value for metric_name, value in zip(model.metrics_names, results)})

# Flatten the model's predictions to 1-D and threshold them at 0.5 for hard labels.
y_pred_prob = model.predict(test_ds).ravel()
y_pred = np.greater_equal(y_pred_prob, 0.5).astype(int)

# Hard labels feed accuracy and the report; the raw probabilities feed ROC-AUC.
test_accuracy = accuracy_score(test_labels, y_pred)
print("Accuracy:", test_accuracy)
test_auc = roc_auc_score(test_labels, y_pred_prob)
print("AUC:", test_auc)
report_text = classification_report(test_labels, y_pred, digits=4)
print(report_text)


Total examples: 7000
Train: 5040 Val: 560 Test: 1400


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 attention_mask (InputLayer  [(None, 128)]                0         []                            
 )                                                                                                
                                                                                                  
 input_ids (InputLayer)      [(None, 128)]                0         []                            
                                                                                                  
 token_type_ids (InputLayer  [(None, 128)]                0         []                            
 )                                                                                                
                                                                                            

