In [5]:
pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.5.1-cp312-cp312-win_amd64.whl.metadata (12 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.14.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Using cached scikit_learn-1.5.1-cp312-cp312-win_amd64.whl (10.9 MB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached scipy-1.14.0-cp312-cp312-win_amd64.whl (44.5 MB)
Installing collected packages: scipy, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.5.1 scipy-1.14.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
from transformers import BertTokenizer, BertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification
import torch
import os
import csv
import numpy as np
from sklearn.model_selection import train_test_split

# Initial training data (manually labeled).
# Each record is a dict with a "text" passage and a "label" string; the list
# holds one "Human" example and one "AI" example. get_initial_dataset() turns
# the label "AI" into class 1 and every other label into class 0.
initial_data = [
    {"text": "Bownes ideas had many predecessors, from Latin Christianity through Immanuel Kant, using many different theories and concepts, about what a human being is and about the personhood of God in its relation to our own personhood. His forceful argumentation influenced James, who helped found the American philosophical tradition of pragmatism shortly after Bowne’s first books were published and who drew increasingly close to personalism, as did the idealist philosopher Josiah Royce. Bowne was at the centre of this troika of canonical American philosophers at the turn of the 20th century. His teaching rippled out through personalist philosophers on the West Coast and through his students at Boston, notably Edgar S Brightman and Harold DeWulf, both of whom later became teachers of King.", "label": "Human"},
    {"text": "Spanish political parties mobilize against the ETA, planning a meeting to unite all democratic forces in their anti-terrorism efforts. This move comes amid a series of violent events involving the Basque separatist group ETA, which has been responsible for nearly 800 deaths since 1968 in its fight for an independent Basque state. Recent incidents include three non-fatal bombs in Malaga, the murders of a town councilor and a retired Civil Guard officer, and a fatal bombing targeting another town councilor. There are demonstrations demanding a negotiated solution to the violence, while Spain's Socialist party exits the Basque coalition government due to its connections with radical groups. Meanwhile, violent clashes with police and accusations from ETA against Spain and France of attempting to eliminate the Basque language highlight ongoing tensions. Elections in Spain's Basque region and a cease-fire announcement by ETA introduce a potential turn towards peace, supported internationally by figures like Gerry Adams, who urges US involvement in facilitating a resolution. However, Spain's government refuses to negotiate Basque independence in peace talks, considering compensation for victims of anti-Basque death squads from the 1980s and initiating steps towards peace talks with ETA. Basque separatists reaffirm their willingness to negotiate without fully committing to disarmament. The articles reflect a complex web of political maneuvers, violence, cultural struggle, and international attention surrounding the Basque separatist movement, marking a crucial phase in Spain's efforts to address regional unrest and move toward resolution.", "label": "AI"}
]

# Checkpoint identifiers for the two pre-trained encoders used by this notebook.
bert_model_name, roberta_model_name = "bert-base-uncased", "roberta-base"

# Tokenizers first, then the sequence-classification models, each loaded from
# its checkpoint name (BERT before RoBERTa in both groups).
bert_tokenizer, roberta_tokenizer = (
    BertTokenizer.from_pretrained(bert_model_name),
    RobertaTokenizer.from_pretrained(roberta_model_name),
)
bert_model, roberta_model = (
    BertForSequenceClassification.from_pretrained(bert_model_name),
    RobertaForSequenceClassification.from_pretrained(roberta_model_name),
)

# Tokenize and encode the texts
def encode_texts(texts, tokenizer):
    """Tokenize ``texts`` with ``tokenizer`` and return whatever it produces.

    The tokenizer is called with padding and truncation switched on and is
    asked for PyTorch tensors (``return_tensors="pt"``).

    Args:
        texts: The text(s) handed to the tokenizer as its first argument.
        tokenizer: A callable accepting ``padding``, ``truncation`` and
            ``return_tensors`` keyword arguments.

    Returns:
        The tokenizer's return value, unchanged.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "return_tensors": "pt",
    }
    batch_encoding = tokenizer(texts, **tokenizer_options)
    return batch_encoding

# Prepare initial dataset
def get_initial_dataset(data=None):
    """Split labeled records into parallel lists of texts and integer labels.

    Args:
        data: Optional iterable of dicts, each with a ``"text"`` and a
            ``"label"`` key. When omitted, the module-level ``initial_data``
            is used, which is the original behaviour.

    Returns:
        A ``(texts, labels)`` tuple of two new lists. A label is ``1`` when the
        record's ``"label"`` equals ``"AI"`` exactly and ``0`` for any other
        value.
    """
    if data is None:
        data = initial_data
    # Materialise once so a one-shot iterable can feed both comprehensions.
    records = list(data)
    texts = [item["text"] for item in records]
    labels = [1 if item["label"] == "AI" else 0 for item in records]
    return texts, labels

# Initial training of the models
def initial_training(epochs=3, learning_rate=1e-5):
    """Fine-tune the BERT and RoBERTa classifiers on the initial labeled data.

    Both module-level models are updated in place. Each one is trained with its
    own Adam optimizer, taking one gradient step per epoch on the whole initial
    dataset as a single batch, and is switched back to evaluation mode
    afterwards.

    Args:
        epochs: Number of full-batch gradient steps per model (default 3).
        learning_rate: Adam learning rate (default 1e-5).
    """
    texts, labels = get_initial_dataset()
    y_train = torch.tensor(labels)

    model_tokenizer_pairs = (
        (bert_model, bert_tokenizer),
        (roberta_model, roberta_tokenizer),
    )
    for model, tokenizer in model_tokenizer_pairs:
        encoded_texts = encode_texts(texts, tokenizer)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        model.train()
        try:
            for _ in range(epochs):
                optimizer.zero_grad()
                loss = model(**encoded_texts, labels=y_train).loss
                loss.backward()
                optimizer.step()
        finally:
            # Kernel state outlives a failed cell: never leave the model in
            # training mode (dropout active) for later inference.
            model.eval()

# Evaluate the models
def evaluate_model(model, tokenizer, texts, labels):
    """Return the fraction of ``texts`` whose predicted class equals its label.

    All texts are encoded and scored in a single forward pass with gradients
    disabled; the predicted class is the argmax of the model's logits.

    Args:
        model: Classifier called as ``model(**encoded)`` whose result exposes
            ``.logits``.
        tokenizer: Tokenizer passed through to ``encode_texts``.
        texts: Sequence of input texts.
        labels: Sequence of integer class labels, one per text.

    Returns:
        Accuracy as a Python float.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length. Without this
            check a single label would silently broadcast against every
            prediction and yield a meaningless accuracy.
    """
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length "
            f"(got {len(texts)} texts and {len(labels)} labels)"
        )

    encoded_texts = encode_texts(texts, tokenizer)
    y_true = torch.as_tensor(labels)
    model.eval()
    with torch.no_grad():
        logits = model(**encoded_texts).logits
    preds = torch.argmax(logits, dim=1)
    return (preds == y_true).float().mean().item()

# Active Learning Loop with BERT and RoBERTa
def active_learning_loop_bert_roberta(input_directory, output_directory, results_file, iterations=10,
                                      confidence_threshold=0.7, epochs=3, learning_rate=1e-5):
    """Iteratively pseudo-label ``.txt`` files and re-train BERT and RoBERTa.

    On every iteration each ``.txt`` file in ``input_directory`` is scored by
    both module-level models. The two softmax distributions are averaged, and
    the argmax and max of that average give the predicted class and its
    confidence. A file whose confidence reaches ``confidence_threshold`` is
    added to the training pool once, with its predicted class as the label.
    If the pool grew, both models are fine-tuned on the whole pool and their
    accuracy on that pool is printed.

    After the last iteration the predictions from that iteration are written to
    ``results_file`` as CSV, one row per file.

    Args:
        input_directory: Directory containing the ``.txt`` files to classify.
        output_directory: Created if missing; this function writes nothing
            into it.
        results_file: Path of the CSV file to write.
        iterations: Number of predict/re-train rounds (default 10).
        confidence_threshold: Minimum ensemble confidence for a prediction to
            be used as a pseudo-label (default 0.7).
        epochs: Full-batch gradient steps per model per round (default 3).
        learning_rate: Adam learning rate for fine-tuning (default 1e-5).
    """
    results = []
    texts, labels = get_initial_dataset()

    os.makedirs(output_directory, exist_ok=True)

    # Files already in the training pool; prevents the same document from
    # being appended again on every later iteration.
    pseudo_labeled_files = set()

    for iteration in range(iterations):
        print(f"Iteration {iteration+1}/{iterations}")
        # Keep only the current round's predictions: the CSV has no iteration
        # column, so accumulating rounds would only produce duplicate rows.
        results = []
        new_texts = []
        new_labels = []

        # Sorted for a deterministic row order in the CSV.
        for filename in sorted(os.listdir(input_directory)):
            if not filename.endswith(".txt"):
                continue
            summary_path = os.path.join(input_directory, filename)
            with open(summary_path, 'r', encoding='utf-8') as file:
                summary_content = file.read()

            predicted_class, confidence = _ensemble_predict(summary_content)
            results.append([filename, "AI" if predicted_class == 1 else "Human", confidence])

            if confidence >= confidence_threshold and filename not in pseudo_labeled_files:
                pseudo_labeled_files.add(filename)
                new_texts.append(summary_content)
                new_labels.append(predicted_class)

        if not new_texts:
            continue

        # Grow texts and labels together so training and evaluation always see
        # lists of the same length.
        texts.extend(new_texts)
        labels.extend(new_labels)
        y_train = torch.tensor(labels)

        _fine_tune_on_pool(bert_model, bert_tokenizer, texts, y_train, epochs, learning_rate)
        _fine_tune_on_pool(roberta_model, roberta_tokenizer, texts, y_train, epochs, learning_rate)

        # Accuracy on the training pool itself (includes pseudo-labels).
        accuracy_bert = evaluate_model(bert_model, bert_tokenizer, texts, labels)
        accuracy_roberta = evaluate_model(roberta_model, roberta_tokenizer, texts, labels)
        print(f"Iteration {iteration+1} - BERT Accuracy: {accuracy_bert:.4f}, RoBERTa Accuracy: {accuracy_roberta:.4f}")

    # Save results to CSV
    with open(results_file, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Filename', 'Prediction', 'Confidence'])
        writer.writerows(results)


def _ensemble_predict(text):
    """Return ``(predicted_class, confidence)`` from the averaged BERT/RoBERTa softmax."""
    with torch.no_grad():
        logits_bert = bert_model(**encode_texts([text], bert_tokenizer)).logits
        logits_roberta = roberta_model(**encode_texts([text], roberta_tokenizer)).logits
    probs_bert = torch.nn.functional.softmax(logits_bert, dim=-1)
    probs_roberta = torch.nn.functional.softmax(logits_roberta, dim=-1)
    # Average the distributions, not each model's max probability: when the
    # models disagree, their max probabilities belong to different classes.
    avg_probs = (probs_bert + probs_roberta) / 2
    confidence, predicted_class = torch.max(avg_probs, dim=1)
    return predicted_class.item(), confidence.item()


def _fine_tune_on_pool(model, tokenizer, pool_texts, pool_labels, epochs, learning_rate):
    """Take ``epochs`` full-batch Adam steps on the pool, then restore eval mode."""
    encoded_pool = encode_texts(pool_texts, tokenizer)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    model.train()
    try:
        for _ in range(epochs):
            optimizer.zero_grad()
            loss = model(**encoded_pool, labels=pool_labels).loss
            loss.backward()
            optimizer.step()
    finally:
        # Inference in the next round must not run with dropout enabled.
        model.eval()

# Main execution
# Dataset root. Set the GENAI_T2T_DATA_DIR environment variable to run on
# another machine; the default is the original location.
base_directory = os.environ.get(
    "GENAI_T2T_DATA_DIR",
    "C:/Users/shouv/Desktop/Research/NIST/GenAI24-NIST-pilot-T2T-D-set-1/GenAI24-NIST-pilot-T2T-D-set-1",
).rstrip("/\\")
input_directory = f"{base_directory}/files/"
output_directory = f"{base_directory}/results/"
results_file = f"{base_directory}/results.csv"

# Fail before the expensive training step if the input files are not there.
if not os.path.isdir(input_directory):
    raise FileNotFoundError(f"Input directory not found: {input_directory}")

# Initial training
initial_training()

# Active learning loop
active_learning_loop_bert_roberta(input_directory, output_directory, results_file)

print("Results saved.")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Iteration 1/10
Iteration 2/10
Iteration 3/10
Iteration 4/10
Iteration 5/10
Iteration 6/10
Iteration 7/10
Iteration 8/10
Iteration 9/10
Iteration 10/10
Results saved.
