#Training

### Import necessary libraries

### Load the training dataset

### Preprocess the data
- Initialize a BERT tokenizer to convert text into tokens (small pieces of text) that the model can understand.

### Prepare data for training

- Create a DataLoader for the training dataset, using a RandomSampler to shuffle the data (helps in learning and generalization).

### Set up the model for sequence classification
- Load a pre-trained BERT model configured for sequence classification tasks.
- Specify the number of labels for the classification (2 for binary classification).to determine the class probabilities.

### Prepare the model for training
- Set up an optimizer (AdamW) for adjusting model parameters and a learning rate scheduler to adjust the learning rate over time.

### Define a function to train the model for one epoch
    - Switch the BERT model to training mode.
    - Initialize a variable to keep track of the total training loss.
    - For each batch of data:
        - Move the batch to the appropriate device.
        - Clear any previously calculated gradients.
        - Perform a forward pass through the model with the current batch of data.
        -Extract embeddings from the BERT model's last hidden layer
        - Calculate the loss (difference between model predictions and actual labels).
        - Accumulate this batch's loss.
        - Perform backpropagation to calculate gradients.
        - Clip gradients to prevent excessively large updates.
        - Update model parameters using the optimizer.
        - Update the learning rate using the scheduler.
    - Calculate the average loss over all batches.

Train the Logistic Regression classifier using the scaled embeddings and corresponding labels
 on the logits from the BERT model.

  

In [11]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import numpy as np
import joblib

#loading the datasets
train_df = pd.read_csv('train.csv')

#preprocess the data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 256

def encode_data(df):
    input_ids = []
    attention_masks = []
    labels = []
    for _, row in df.iterrows():
        encoded_dict = tokenizer.encode_plus(
            row['Claim'] + ' [SEP] ' + row['Evidence'],
            add_special_tokens=True,
            max_length=max_length,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
        labels.append(row['label'])
    labels = torch.tensor(labels)
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    return TensorDataset(input_ids, attention_masks, labels)

#create DataLoader
batch_size = 16
train_dataset = encode_data(train_df)
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)

#setup the BERT model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,  # Binary classification
    output_attentions=False,
    output_hidden_states=True  # Enable output of hidden states
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

#training setup
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 1
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

def train_epoch(model, dataloader):
    model.train()
    total_train_loss = 0
    all_embeddings = []
    all_labels = []
    for batch in tqdm(dataloader, desc="Training"):
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        model.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        embeddings = outputs.hidden_states[-1][:, 0, :].detach().cpu().numpy()  # Extract [CLS] token embeddings
        all_embeddings.extend(embeddings)
        all_labels.extend(b_labels.detach().cpu().numpy())
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    return total_train_loss / len(dataloader), np.array(all_embeddings), np.array(all_labels)

#extract features and train Logistic Regression
for epoch in range(epochs):
    print(f'Epoch {epoch+1}/{epochs}')
    train_loss, embeddings, labels = train_epoch(model, train_dataloader)
    print(f'Train loss: {train_loss}')
torch.save(model.state_dict(), "bert_for_sequence_classification.pth")

#initialize and train the Logistic Regression classifier
scaler = StandardScaler()
embeddings_scaled = scaler.fit_transform(embeddings)
lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(embeddings_scaled, labels)

#save the trained BERT, Logistic Regression, and scaler

joblib.dump(lr_classifier, 'lr_classifier.joblib')
joblib.dump(scaler, 'scaler.joblib')

print("Training complete. All models saved.")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1


Training: 100%|██████████| 1482/1482 [08:16<00:00,  2.98it/s]


Train loss: 0.34335434407890647
Training complete. All models saved.


#Evaluation
####Load the evaluation (development) dataset
####Preprocess the evaluation data
- Use the previously defined `encode_data` function to:
    - Tokenize and encode each data point (a pair of text sequences) in the evaluation dataset.
    - Generate attention masks for the sequences.
    - Collect these into a TensorDataset object for efficient handling.

#### Prepare the evaluation dataset for processing

#### Define a function to evaluate the model on the dataset
- The function takes a model and a DataLoader as inputs.
- Switch the model to evaluation mode using `model.eval()` to inform the model that it is being evaluated, not trained. This disables dropout and batch normalization.

- Perform a forward pass through the model to get the output logits for the batch.
   
- Convert these logits to actual predictions using `torch.argmax()`, which selects the index (class label) with the highest logit value.



In [12]:
#evaluate
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
import numpy as np
import joblib

# Initialize device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define function to encode data
def encode_data(df, tokenizer, max_length):
    input_ids = []
    attention_masks = []
    labels = []
    for _, row in df.iterrows():
        encoded_dict = tokenizer.encode_plus(
            row['Claim'] + ' [SEP] ' + row['Evidence'],
            add_special_tokens=True,
            max_length=max_length,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
        labels.append(row['label'])
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)
    return TensorDataset(input_ids, attention_masks, labels)

# Evaluation function that incorporates the Logistic Regression classifier
def evaluate_model(model, lr_classifier, scaler, dataloader):
    model.eval()
    predictions, true_labels = [], []

    for batch in tqdm(dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, output_hidden_states=True)

        # Extract the [CLS] token embeddings from the last hidden layer
        embeddings = outputs.hidden_states[-1][:, 0, :].detach().cpu().numpy()
        # Scale embeddings
        scaled_embeddings = scaler.transform(embeddings)
        # Logistic Regression predictions
        batch_predictions = lr_classifier.predict(scaled_embeddings)

        predictions.extend(batch_predictions)
        true_labels.extend(b_labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions, target_names=['Class 0', 'Class 1'])

    return accuracy, report

# Load saved models and scaler
lr_classifier = joblib.load('lr_classifier.joblib')
scaler = joblib.load('scaler.joblib')

# Load data
eval_df = pd.read_csv('dev.csv')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 256
eval_dataset = encode_data(eval_df, tokenizer, max_length)
eval_dataloader = DataLoader(
    eval_dataset,
    sampler=SequentialSampler(eval_dataset),
    batch_size=16
)

# Load the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.load_state_dict(torch.load('bert_for_sequence_classification.pth', map_location=device))
model.to(device)

# Evaluate the model using Logistic Regression
eval_accuracy, eval_report = evaluate_model(model, lr_classifier, scaler, eval_dataloader)
print(f"Validation Accuracy: {eval_accuracy}")
print(eval_report)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 371/371 [00:43<00:00,  8.49it/s]

Validation Accuracy: 0.8655079311508607
              precision    recall  f1-score   support

     Class 0       0.92      0.89      0.91      4327
     Class 1       0.73      0.79      0.76      1599

    accuracy                           0.87      5926
   macro avg       0.83      0.84      0.83      5926
weighted avg       0.87      0.87      0.87      5926






In [14]:
#testing
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
import numpy as np
import joblib

#initialize device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#define function to encode data
def encode_data(df, tokenizer, max_length):
    input_ids = []
    attention_masks = []
    labels = []
    for _, row in df.iterrows():
        encoded_dict = tokenizer.encode_plus(
            row['Claim'] + ' [SEP] ' + row['Evidence'],
            add_special_tokens=True,
            max_length=max_length,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
        labels.append(row['label'])
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)
    return TensorDataset(input_ids, attention_masks, labels)

#evaluation function that incorporates the Logistic Regression classifier
def evaluate_model(model, lr_classifier, scaler, dataloader):
    model.eval()
    predictions, true_labels = [], []

    for batch in tqdm(dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, output_hidden_states=True)

        # Extract the [CLS] token embeddings from the last hidden layer
        embeddings = outputs.hidden_states[-1][:, 0, :].detach().cpu().numpy()
        # Scale embeddings
        scaled_embeddings = scaler.transform(embeddings)
        # Logistic Regression predictions
        batch_predictions = lr_classifier.predict(scaled_embeddings)

        predictions.extend(batch_predictions)
        true_labels.extend(b_labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions, target_names=['Class 0', 'Class 1'])

    return accuracy, report

#load saved models and scaler
lr_classifier = joblib.load('lr_classifier.joblib')
scaler = joblib.load('scaler.joblib')

#load data
eval_df = pd.read_csv('ED_trial.csv')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 256
eval_dataset = encode_data(eval_df, tokenizer, max_length)
eval_dataloader = DataLoader(
    eval_dataset,
    sampler=SequentialSampler(eval_dataset),
    batch_size=16
)

# Load the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.load_state_dict(torch.load('bert_for_sequence_classification.pth', map_location=device))
model.to(device)

# Evaluate the model using Logistic Regression
eval_accuracy, eval_report = evaluate_model(model, lr_classifier, scaler, eval_dataloader)
print(f"Testing Accuracy: {eval_accuracy}")
print(eval_report)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 4/4 [00:00<00:00, 11.62it/s]

Testing Accuracy: 0.9
              precision    recall  f1-score   support

     Class 0       0.97      0.89      0.93        35
     Class 1       0.78      0.93      0.85        15

    accuracy                           0.90        50
   macro avg       0.87      0.91      0.89        50
weighted avg       0.91      0.90      0.90        50






#Demo code
Replace the input file
###Ensure necessary packages installed:
 pip install transformers sklearn pandas torch

In [15]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import joblib

# Part 1:data preparation, preprocessing
def load_and_process_test_data(test_file, tokenizer, max_length):
    test_df = pd.read_csv(test_file)
    input_ids = []
    attention_masks = []
    for _, row in test_df.iterrows():
        encoded_dict = tokenizer.encode_plus(
            str(row['Claim']) + ' [SEP] ' + str(row['Evidence']),
            add_special_tokens=True,
            max_length=max_length,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    return TensorDataset(input_ids, attention_masks)

# Part 2:model loading
def load_model(model_path, scaler_path, lr_path):
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=2,  # binary classification
        output_attentions=False,
        output_hidden_states=True  # Enable output of hidden states for feature extraction
    )
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    scaler = joblib.load(scaler_path)
    lr_classifier = joblib.load(lr_path)
    return model, device, scaler, lr_classifier

# Part 3: generating predictions using Logistic Regression
def predict_model(model, dataloader, device, scaler, lr_classifier):
    model.eval()
    predictions = []
    for batch in dataloader:
        b_input_ids, b_input_mask = batch
        b_input_ids = b_input_ids.to(device)
        b_input_mask = b_input_mask.to(device)
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, output_hidden_states=True)
        embeddings = outputs.hidden_states[-1][:, 0, :].detach().cpu().numpy()
        scaled_embeddings = scaler.transform(embeddings)
        preds = lr_classifier.predict(scaled_embeddings)
        predictions.extend(preds)
    return predictions

# Part 4: save output to file
def save_predictions(predictions, output_file):
    pred_df = pd.DataFrame(predictions, columns=['prediction'])
    pred_df.to_csv(output_file, index=False)

# Main execution
def main(model_path, test_data_path, predictions_path, scaler_path, lr_path):
    # setup tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    max_length = 256

    # loading the model, scaler, and Logistic Regression classifier
    model, device, scaler, lr_classifier = load_model(model_path, scaler_path, lr_path)

    # load and process test data
    test_dataset = load_and_process_test_data(test_data_path, tokenizer, max_length)
    test_dataloader = DataLoader(
        test_dataset,
        sampler=SequentialSampler(test_dataset),
        batch_size=16
    )

    # generate predictions using Logistic Regression
    predictions = predict_model(model, test_dataloader, device, scaler, lr_classifier)

    # save predictions to a file
    save_predictions(predictions, predictions_path)
    print(f"Predictions saved to {predictions_path}")

if __name__ == '__main__':
    model_path = 'bert_for_sequence_classification.pth'
    test_data_path = 'test.csv'  # update with the path to test data
    predictions_path = 'Group_56_C.csv'
    scaler_path = 'scaler.joblib'
    lr_path = 'lr_classifier.joblib'  # path to the Logistic Regression model
    main(model_path, test_data_path, predictions_path, scaler_path, lr_path)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predictions saved to Group_56_C.csv
