In [1]:
import pandas as pd

train_url = "/kaggle/input/custom/train.csv"
test_url = "/kaggle/input/custom/test.csv"
train = pd.read_csv(train_url)
test = pd.read_csv(test_url)

train['target'].unique()

array(['academic interests', 'arts and culture', 'automotives',
       'books and literature', 'business and finance', 'careers',
       'family and relationships', 'food and drinks', 'health',
       'healthy living', 'hobbies and interests', 'home and garden',
       'movies', 'music and audio', 'news and politics',
       'personal finance', 'pets',
       'pharmaceuticals, conditions, and symptoms', 'real estate',
       'shopping', 'sports', 'style and fashion',
       'technology and computing', 'television', 'travel', 'video gaming'],
      dtype=object)

In [2]:
train = train.sample(frac=1).reset_index(drop=True)
test = test.sample(frac=1).reset_index(drop=True)

In [3]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the target column
train['target_label'] = label_encoder.fit_transform(train['target'])
test['target_label'] = label_encoder.fit_transform(test['target'])

In [4]:
train = train[['text', 'target_label']]
test = test[['text', 'target_label']]

In [5]:
!pip install tensorflow transformers accelerate datasets



In [6]:
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
from datasets import load_dataset

In [7]:
model_name = "bert-base-uncased"
model = TFAutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [8]:
from datasets import Dataset, DatasetDict

# Convert pandas DataFrames to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)

# Create a DatasetDict
final_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

print(final_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'target_label'],
        num_rows: 26000
    })
    test: Dataset({
        features: ['text', 'target_label'],
        num_rows: 2600
    })
})


In [9]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)

In [10]:
final_dataset_encoded = final_dataset.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/26000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2600 [00:00<?, ? examples/s]

In [11]:
final_dataset_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'target_label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 26000
    })
    test: Dataset({
        features: ['text', 'target_label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2600
    })
})

In [12]:
final_dataset_encoded.set_format('tf', 
                            columns=['input_ids', 'attention_mask', 'token_type_ids', 'target_label'])

# setting BATCH_SIZE to 64.
BATCH_SIZE = 8

def order(inp):
    '''
    This function will group all the inputs of BERT
    into a single dictionary and then output it with
    labels.
    '''
    data = list(inp.values())
    return {
        'input_ids': data[1],
        'attention_mask': data[2],
        'token_type_ids': data[3]
    }, data[0]

# converting train split of `emotions_encoded` to tensorflow format
train_dataset = tf.data.Dataset.from_tensor_slices(final_dataset_encoded['train'][:])
# set batch_size and shuffle
train_dataset = train_dataset.batch(BATCH_SIZE).shuffle(1000)
# map the `order` function
train_dataset = train_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

# ... doing the same for test set ...
test_dataset = tf.data.Dataset.from_tensor_slices(final_dataset_encoded['test'][:])
test_dataset = test_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)


In [13]:
inp, out = next(iter(train_dataset)) # a batch from train_dataset
print(inp, '\n\n', out)

{'input_ids': <tf.Tensor: shape=(8, 512), dtype=int64, numpy=
array([[  101,  1037,  2450, ...,     0,     0,     0],
       [  101,  5003, 15689, ...,     0,     0,     0],
       [  101,  6816, 14021, ...,     0,     0,     0],
       ...,
       [  101,  8398,  7126, ...,  2335,  2015,   102],
       [  101,  3604,  9117, ...,  4008, 17788,   102],
       [  101,  2944,  1998, ...,     0,     0,     0]])>, 'attention_mask': <tf.Tensor: shape=(8, 512), dtype=int64, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>, 'token_type_ids': <tf.Tensor: shape=(8, 512), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0]])>} 

 tf.Tensor([10 21  6  8  9 14  2 12], s

In [14]:
class BERTForClassification(tf.keras.Model):
    
    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        self.fc = tf.keras.layers.Dense(num_classes, activation='softmax')
        
    def call(self, inputs):
        x = self.bert(inputs)[0]
        x = tf.keras.layers.GlobalAveragePooling1D()(x)
        return self.fc(x)

In [15]:
classifier = BERTForClassification(model, num_classes=26)

classifier.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

In [16]:
history = classifier.fit(train_dataset, epochs=5)

Epoch 1/5


I0000 00:00:1727485729.466543     109 service.cc:145] XLA service 0x7a320bf0be60 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1727485729.466596     109 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1727485729.466600     109 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1727485729.650041     109 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
classifier.evaluate(test_dataset)



[0.06902214884757996, 0.9796153903007507]

In [118]:
# Loading the submission csv file
submission_df = pd.read_csv("/kaggle/input/fibe-test/test.csv", encoding='ISO-8859-1')

In [119]:
text_submission_list = submission_df["text"].tolist()

In [120]:
def predict_label(text):
    tokens = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="tf")
    prediction = classifier(tokens)
    predicted_class = tf.argmax(prediction, axis=1).numpy()[0]
    original_label = label_encoder.inverse_transform([predicted_class])[0]
    return original_label

In [121]:
predict_label(text_submission_list[7])

'news and politics'

In [122]:
len(text_submission_list)

174382

In [130]:
import numpy as np
from tqdm import tqdm

# Set batch size
BATCH_SIZE = 32  # Adjust this based on your GPU memory

# Function to create batches
def batch_texts(texts, batch_size):
    for i in range(0, len(texts), batch_size):
        yield texts[i:i + batch_size]

# Optimized prediction function
def predict_labels_batch(texts):
    # Tokenize the batch
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="tf")
    
    # Make predictions
    predictions = classifier(inputs)
    predicted_classes = tf.argmax(predictions, axis=1).numpy()
    
    # Convert to original labels
    original_labels = label_encoder.inverse_transform(predicted_classes)
    
    return original_labels.tolist()

# Create empty list for predictions
target_predictions = []

# Batch prediction loop
for batch in tqdm(batch_texts(text_submission_list, BATCH_SIZE), total=len(text_submission_list)//BATCH_SIZE + 1, desc="Generating predictions"):
    batch_predictions = predict_labels_batch(batch)
    target_predictions.extend(batch_predictions)

Generating predictions: 100%|██████████| 5450/5450 [25:50<00:00,  3.51it/s]


In [131]:
target_predictions[:10]

['technology and computing',
 'hobbies and interests',
 'hobbies and interests',
 'academic interests',
 'academic interests',
 'hobbies and interests',
 'academic interests',
 'news and politics',
 'academic interests',
 'academic interests']

In [133]:
new_df = pd.DataFrame({
    'target': target_predictions,
    'Index': submission_df['Index']
})

# Ensure the columns are in the correct order
new_df = new_df[['target', 'Index']]

In [134]:
new_df

Unnamed: 0,target,Index
0,technology and computing,Article_0
1,hobbies and interests,Article_1
2,hobbies and interests,Article_2
3,academic interests,Article_3
4,academic interests,Article_4
...,...,...
174377,video gaming,Article_174377
174378,arts and culture,Article_174378
174379,video gaming,Article_174379
174380,video gaming,Article_174380


In [135]:
# Save the new DataFrame as a CSV file in the Kaggle output directory
output_path = '/kaggle/working/submission_results.csv'
new_df.to_csv(output_path, index=False)

print(f"File saved to {output_path}")

File saved to /kaggle/working/submission_results.csv


In [None]:
model_save_path = '/kaggle/working/bert_classifier'

# Save the entire model (including the custom layers)
classifier.save(model_save_path)

# Save the tokenizer separately
tokenizer.save_pretrained(model_save_path)

print(f"Classifier model saved to {model_save_path}")
print(f"Tokenizer saved to {model_save_path}")