In [13]:
# Mount Google Drive; the dataset and saved models are read from / written to
# paths under /content/drive/MyDrive.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EvalPrediction
)
import torch

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/sentence_contexts_with_missing_3_sentences.csv") # Replace with your CSV path

# Map labels to integers
label_map = {"Primary": 0, "Secondary": 1, "Missing": 2}
df['label'] = df['type'].map(label_map)

# Series.map leaves NaN for any 'type' value that is not a key of label_map,
# which would silently turn the whole label column into floats. Fail early
# with an explicit message instead of a confusing error during training.
unmapped_types = df.loc[df['label'].isna(), 'type'].unique().tolist()
if unmapped_types:
    raise ValueError(f"Unmapped values in 'type' column: {unmapped_types}")
df['label'] = df['label'].astype(int)

# Replace NaN context_text with empty string.
# Done on the full frame *before* splitting so that no column is assigned into
# the frames returned by train_test_split (pandas may flag such assignments
# with SettingWithCopyWarning). Both splits inherit the filled column.
df['context_text'] = df['context_text'].fillna("")

# Split dataset (80% train / 20% eval, fixed seed for reproducibility)
train_df, eval_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42
)


# Load tokenizer and model
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_map)  # one output per entry in label_map (3)
)

# Tokenization function
def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over ``examples``.

    The tokenizer is asked to pad to ``max_length`` and to truncate, with a
    ``max_length`` of 512. Returns whatever the tokenizer call returns.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,  # Adjusted for your context length
    }
    return tokenizer(examples, **tokenizer_options)

# Prepare datasets: pull the raw context strings out of each split, then tokenize them
train_texts = train_df['context_text'].tolist()
eval_texts = eval_df['context_text'].tolist()

train_encodings = tokenize_function(train_texts)
eval_encodings = tokenize_function(eval_texts)

class CustomDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable collection of
    per-example values; ``labels`` is an indexable collection with one entry
    per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label collection.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field plus a 'labels' entry, all as tensors.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the encodings and integer labels of each split in a torch dataset
train_labels = train_df['label'].values
eval_labels = eval_df['label'].values

train_dataset = CustomDataset(train_encodings, train_labels)
eval_dataset = CustomDataset(eval_encodings, eval_labels)

# Metrics computation
def compute_metrics(p: EvalPrediction):
    """Compute accuracy plus weighted F1, precision and recall.

    Predicted classes are the argmax of ``p.predictions`` along axis 1 and are
    compared against ``p.label_ids``.

    Returns:
        dict with keys "accuracy", "f1", "precision" and "recall".
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)

    # Scorers that share the same 'weighted' averaging mode.
    weighted_scorers = {
        "f1": f1_score,
        "precision": precision_score,
        "recall": recall_score,
    }

    metrics = {"accuracy": accuracy_score(y_true, y_pred)}
    for metric_name, scorer in weighted_scorers.items():
        metrics[metric_name] = scorer(y_true, y_pred, average='weighted')
    return metrics

# Training arguments
# Evaluation and checkpointing both happen once per epoch so that the
# checkpoint with the best eval F1 can be restored when training ends.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    learning_rate= 2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='f1',  # You can change this to any metric you prefer
    greater_is_better=True,
    report_to="none"
)

# Initialize Trainer
# `processing_class` replaces the deprecated `tokenizer` keyword of Trainer,
# which emits a FutureWarning on recent transformers releases.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Start training
trainer.train()

# Print final evaluation metrics
# Trainer.evaluate() prefixes the keys returned by compute_metrics with "eval_".
eval_results = trainer.evaluate()
print("Final evaluation results:")
for display_name, metric_key in (
    ("Accuracy", "eval_accuracy"),
    ("F1 Score", "eval_f1"),
    ("Precision", "eval_precision"),
    ("Recall", "eval_recall"),
):
    print(f"{display_name}: {eval_results[metric_key]:.4f}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6721,0.472351,0.836449,0.824471,0.840451,0.836449


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6721,0.472351,0.836449,0.824471,0.840451,0.836449
2,0.5583,0.346666,0.892523,0.888419,0.905634,0.892523
3,0.2513,0.298656,0.911215,0.909971,0.923422,0.911215


Final evaluation results:
Accuracy: 0.9112
F1 Score: 0.9100
Precision: 0.9234
Recall: 0.9112


In [None]:
# Save the model and its tokenizer side by side in one Drive folder
save_dir = "/content/drive/MyDrive/FT-classification_1066_samples"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('/content/drive/MyDrive/FT-classification_1066_samples/tokenizer_config.json',
 '/content/drive/MyDrive/FT-classification_1066_samples/special_tokens_map.json',
 '/content/drive/MyDrive/FT-classification_1066_samples/vocab.txt',
 '/content/drive/MyDrive/FT-classification_1066_samples/added_tokens.json',
 '/content/drive/MyDrive/FT-classification_1066_samples/tokenizer.json')

In [15]:
# Prepare full dataset
df = pd.read_csv("/content/drive/MyDrive/sentence_contexts_with_missing_3_sentences.csv")

# Map labels to integers
label_map = {"Primary": 0, "Secondary": 1, "Missing": 2}
df['label'] = df['type'].map(label_map)

# Series.map leaves NaN for any 'type' value that is not a key of label_map;
# fail early rather than feeding float/NaN labels into training.
unmapped_types = df.loc[df['label'].isna(), 'type'].unique().tolist()
if unmapped_types:
    raise ValueError(f"Unmapped values in 'type' column: {unmapped_types}")
df['label'] = df['label'].astype(int)

# Replace NaN context_text with empty string
df['context_text'] = df['context_text'].fillna("")

# NOTE: the texts are tokenized further down in this cell, once the saved
# tokenizer has been reloaded, so no tokenization is done at this point.

class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves tokenizer encodings together with labels."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> indexable per-example values
        # labels: indexable collection with one label per example
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert each encoding field for this example to a tensor, then attach its label.
        example = dict(
            (name, torch.tensor(column[idx]))
            for name, column in self.encodings.items()
        )
        example['labels'] = torch.tensor(self.labels[idx])
        return example
# Tokenization function
def tokenize_function(examples):
    """Encode ``examples`` with the module-level ``tokenizer``.

    Requests padding to ``max_length`` and truncation, both at 512, and
    returns the tokenizer's result unchanged.
    """
    max_tokens = 512  # Adjusted for your context length
    encoded = tokenizer(
        examples,
        padding="max_length",
        truncation=True,
        max_length=max_tokens,
    )
    return encoded

# Reload the fine-tuned tokenizer and classifier from the Drive checkpoint folder
best_model_path = "/content/drive/MyDrive/FT-classification_1066_samples"
tokenizer = AutoTokenizer.from_pretrained(best_model_path)
model = AutoModelForSequenceClassification.from_pretrained(best_model_path, num_labels=3)

# Tokenize every context string, then pair the encodings with the integer labels
all_texts = df['context_text'].tolist()
all_labels = df['label'].values
full_encodings = tokenize_function(all_texts)
full_dataset = CustomDataset(full_encodings, all_labels)



# Directory on Drive that receives the final model and tokenizer.
final_model_dir = "/content/drive/MyDrive/final_data_citation_classification_model"

# Training arguments for full training
# No eval strategy is configured here: the Trainer below is given no eval dataset.
training_args = TrainingArguments(
    output_dir='./full_model_results',
    learning_rate= 2e-5,
    num_train_epochs=4,  # You might want to increase epochs slightly
    per_device_train_batch_size=8,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",
    report_to="none"
)

# Initialize Trainer with full dataset
# `processing_class` replaces the deprecated `tokenizer` keyword of Trainer,
# which emits a FutureWarning on recent transformers releases.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_dataset,
    processing_class=tokenizer
)

# Start training on full dataset
trainer.train()

# Save the final model
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

# Report the directory that was actually written.
print(f"Training completed! Model saved to '{final_model_dir}'")

  trainer = Trainer(


Step,Training Loss
10,0.2442
20,0.3602
30,0.3065
40,0.243
50,0.294
60,0.3651
70,0.3095
80,0.2071
90,0.4049
100,0.4564


Training completed! Model saved to './final_scibert_data_citation_model'
