In [None]:
!pip install transformers



In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import BertConfig
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

In [None]:
import pandas as pd
# Read the train / validation / test CSV files, in that order, into three
# DataFrames.
training_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/train_data_text_ver1.csv',
        '/content/val_data_text_ver1.csv',
        '/content/test_data_text_ver1.csv',
    )
)

In [None]:
def _column_as_list(frame, column):
    """Return the values of ``frame[column]`` as a plain Python list."""
    return list(frame[column])


# 'prompt_text' holds the text fed to the model; 'approved' holds its label.
train_texts = _column_as_list(training_data, 'prompt_text')
train_labels = _column_as_list(training_data, 'approved')
val_texts = _column_as_list(val_data, 'prompt_text')
val_labels = _column_as_list(val_data, 'approved')

In [None]:
# Combine the training and validation splits into a single set of examples.
# Both lists are concatenated in the same order, so texts[i] pairs with labels[i].
texts = train_texts + val_texts
labels = train_labels + val_labels

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize all texts in one batched call instead of looping over encode_plus()
# and concatenating the per-sample tensors afterwards.
# padding="max_length" replaces the deprecated pad_to_max_length=True flag:
# every sequence is padded or truncated to exactly max_length tokens.
encodings = tokenizer(
    texts,
    add_special_tokens=True,
    max_length=128,  # You can adjust this to your desired sequence length
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors="tf",
)
input_ids = encodings["input_ids"]
attention_masks = encodings["attention_mask"]

# Create a TensorFlow dataset of (features, label) pairs; the feature dict keys
# match the keyword inputs the model is called with.
dataset = tf.data.Dataset.from_tensor_slices(
    ({"input_ids": input_ids, "attention_mask": attention_masks}, labels)
)




In [None]:
# No hold-out split is carved out here: the training size equals the number of
# tokenized texts, so take() keeps every example of `dataset`.
dataset_size = train_size = len(texts)
train_dataset = dataset.take(train_size)

In [None]:
# Pre-trained BERT with a sequence-classification head configured for 2 labels.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

# Training configuration. The loss is created with from_logits=True, i.e. it is
# applied to the raw model outputs without a softmax in between.
metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)

# Input pipeline: shuffle with a 1000-element buffer, then form batches of 16.
train_batches = train_dataset.shuffle(1000).batch(16)

# Fine-tune for 5 epochs and keep the Keras History object for inspection.
history = model.fit(train_batches, epochs=5)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Text samples of the test split, taken from the same 'prompt_text' column that
# was used for training.
test_texts = list(test_data['prompt_text'])

# Tokenize the whole test set in one batched call instead of looping over
# encode_plus() and concatenating the per-sample tensors afterwards.
# padding="max_length" replaces the deprecated pad_to_max_length=True flag.
# NOTE: input_ids / attention_masks are rebound here and now refer to the test
# set; the prediction cell reads these names.
test_encodings = tokenizer(
    test_texts,
    add_special_tokens=True,
    max_length=128,  # Keep identical to the value used when tokenizing the training data
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors="tf",
)
input_ids = test_encodings["input_ids"]
attention_masks = test_encodings["attention_mask"]


In [None]:
# Run the fine-tuned model over the tokenized test inputs.
test_inputs = {"input_ids": input_ids, "attention_mask": attention_masks}
predictions = model.predict(test_inputs)

# Predicted class index per sample: position of the largest logit along the
# last axis, converted to a NumPy array.
logits = predictions.logits
predicted_labels = tf.math.argmax(logits, axis=-1).numpy()




In [12]:
# Store the predicted class indices on the test frame as the 'approved' column.
# This mutates test_data in place (an existing 'approved' column is replaced).
test_data['approved'] = predicted_labels

In [13]:
# Export only the id and predicted-label columns as the submission file,
# leaving the DataFrame index out of the CSV.
submission = test_data[['id', 'approved']]
submission.to_csv('submission_bert.csv', index=False)