# In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import time
from datetime import datetime

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import KFold

# Names of the CSV columns that hold the per-article multi-label targets.
# The list is used both to select the label columns from each DataFrame and,
# through len(category_columns), to size the classifier head, so the order
# here fixes which output position corresponds to which category.
category_columns = [
    "Unlawful detention",
    "Human trafficking",
    "Enslavement",
    "Willful killing of civilians",
    "Mass execution",
    "Kidnapping",
    "Extrajudicial killing",
    "Forced disappearance",
    "Damage or destruction of civilian critical infrastructure",
    "Damage or destruction, looting, or theft of cultural heritage",
    "Military operations (battle, shelling)",
    "Gender-based or other conflict-related sexual violence",
    "Violent crackdowns on protesters/opponents/civil rights abuse",
    "Indiscriminate use of weapons",
    "Torture or indications of torture",
    "Persecution based on political, racial, ethnic, gender, or sexual orientation",
    "Movement of military, paramilitary, or other troops and equipment"
]

# 2) Custom Dataset class for articles
class ArticleDataset(Dataset):
    """Map-style dataset pairing tokenizer output with multi-label targets.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to a
            sequence indexable by sample position.
        labels: Sequence of per-sample label vectors; its length defines the
            dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        Every encoding field is converted to a tensor under its own name and
        the label vector is added under ``'labels'`` as a float tensor.
        """
        sample = {}
        for feature_name, column in self.encodings.items():
            sample[feature_name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

     

# Read the three pre-split CSV files (train, validation, test), in that order.
train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "val.csv", "test.csv")
)

# Combine all datasets for k-fold cross-validation
all_df = pd.concat([train_df, val_df, test_df], ignore_index=True)

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def _encode_narratives(frame):
    """Tokenize the "Incident Narrative" column of ``frame``.

    Truncation and padding are both enabled, exactly as for every split.
    """
    narratives = list(frame["Incident Narrative"].values)
    return tokenizer(narratives, truncation=True, padding=True)


# Tokenize the text column of each split
train_encodings = _encode_narratives(train_df)
val_encodings = _encode_narratives(val_df)
test_encodings = _encode_narratives(test_df)

# Extract labels (multi-label targets in the category columns)
train_labels = train_df[category_columns].values
val_labels = val_df[category_columns].values
test_labels = test_df[category_columns].values

# Wrap encodings and labels into Dataset objects
train_dataset = ArticleDataset(train_encodings, train_labels)
val_dataset = ArticleDataset(val_encodings, val_labels)
test_dataset = ArticleDataset(test_encodings, test_labels)

     

# Note: num_labels = number of category columns for multi-label classification.
# problem_type is passed explicitly so the multi-label objective is selected by
# configuration instead of being inferred by the library from the dtype of the
# labels supplied at training time.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(category_columns),
    problem_type="multi_label_classification",
)

# Define compute_metrics for multi-label classification
def compute_metrics(p):
    """Compute multi-label evaluation metrics from logits and targets.

    Args:
        p: Evaluation result exposing ``predictions`` (raw logits) and
            ``label_ids`` (ground-truth label matrix).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        Precision, recall and F1 are support-weighted averages over labels.
        ``accuracy`` comes from ``accuracy_score`` on the full label matrices,
        so a sample only counts as correct when every label matches.
    """
    # Logits -> probabilities -> hard 0/1 decisions at a fixed 0.5 threshold.
    probabilities = torch.sigmoid(torch.tensor(p.predictions))
    preds = (probabilities > 0.5).int().cpu().numpy()
    labels = np.asarray(p.label_ids)

    # zero_division=0 yields the same values as the default but without a
    # warning each time a label has no predicted or no true positives.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels,
        preds,
        average='weighted',
        zero_division=0,
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Define training arguments
training_args = TrainingArguments(
    output_dir='D:/Yahya/classification/results',          # Output directory
    eval_strategy="epoch",           # Evaluate at the end of each epoch
    save_strategy="epoch",           # Save model at the end of each epoch
    # Without a limit, one full checkpoint per epoch (30 in total) stays on
    # disk. Per the TrainingArguments documentation the best checkpoint is
    # still retained when load_best_model_at_end is enabled.
    save_total_limit=2,
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=30,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",      # Use F1 score for best model
    greater_is_better=True,          # Higher F1 is better (made explicit)
    logging_dir='D:/Yahya/classification/logs'
)

# Initialize Trainer with training and validation sets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the model, timing the whole call
print(f"\n{'='*60}")
print("STARTING INITIAL TRAINING")
print(f"Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"{'='*60}")

training_start_time = time.time()
trainer.train()
training_end_time = time.time()
training_duration = training_end_time - training_start_time

print(f"\n{'='*60}")
print("INITIAL TRAINING COMPLETED")
print(f"End time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Training duration: {training_duration:.2f} seconds ({training_duration/60:.2f} minutes)")
print(f"{'='*60}")

# Save the model
trainer.save_model("D:/Yahya/classification/bert-multiclass-model")

     

def evaluate_model(dataset, threshold=0.5, batch_size=16):
    """Evaluate the module-level ``model`` on ``dataset``.

    Args:
        dataset: Dataset whose items are dicts of tensors containing the model
            inputs plus a ``'labels'`` entry.
        threshold: Probability above which a label is predicted as present.
        batch_size: Number of samples per inference batch (default 16, the
            value that was previously hard-coded).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        Precision, recall and F1 are support-weighted averages over labels.
    """
    loader = DataLoader(dataset, batch_size=batch_size)
    model.eval()  # Set model to eval mode

    preds_list = []
    labels_list = []

    with torch.no_grad():
        for batch in loader:
            # Everything except the targets is a model input; move it to the
            # device the model lives on.
            inputs = {
                key: val.to(model.device) for key, val in batch.items() if key != 'labels'
            }
            logits = model(**inputs).logits
            # Convert logits to probabilities, then apply the threshold
            probs = torch.sigmoid(logits).cpu().numpy()
            preds_list.extend((probs > threshold).astype(int))
            labels_list.extend(batch['labels'].cpu().numpy())

    preds_array = np.array(preds_list)
    labels_array = np.array(labels_list)

    # zero_division=0 yields the same values as the default but without a
    # warning when a label has no predicted or no true positives.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels_array,
        preds_array,
        average='weighted',
        zero_division=0,
    )
    acc = accuracy_score(labels_array, preds_array)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Evaluate the trained model on the held-out test dataset and time the run.
_rule = '=' * 60
print(f"\n{_rule}")
print("STARTING INITIAL TEST EVALUATION")
print(f"Start time: {datetime.now():%Y-%m-%d %H:%M:%S}")
print(_rule)

test_inference_start_time = time.time()
test_results = evaluate_model(test_dataset)
test_inference_end_time = time.time()
test_inference_duration = test_inference_end_time - test_inference_start_time

# Report the metrics together with simple throughput figures.
print("Final Test Set Evaluation Results:", test_results)
print(f"Test inference duration: {test_inference_duration:.2f} seconds")
print(f"Test set size: {len(test_dataset)} articles")
print(f"Inference speed: {len(test_dataset)/test_inference_duration:.2f} articles/second")

# 5-Fold Cross-Validation
#
# Every fold trains a fresh model. The fold's training portion is further
# split into a fitting subset and a small validation subset; the validation
# subset drives per-epoch evaluation and best-checkpoint selection, and the
# fold's test portion is used only for the final measurement. Selecting the
# checkpoint on the test portion itself would make the reported metrics the
# maximum over epochs on the very data being reported.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_results = []

# Share of each fold's training portion held out for checkpoint selection.
cv_validation_fraction = 0.1

print(f"\n{'='*60}")
print("STARTING 5-FOLD CROSS-VALIDATION")
print(f"Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Total dataset size: {len(all_df)} articles")
print(f"{'='*60}")

cv_start_time = time.time()
fold_times = []

for fold, (train_idx, test_idx) in enumerate(kfold.split(all_df)):
    fold_start_time = time.time()

    # Deterministically carve the validation subset out of the training
    # indices (seeded per fold so runs are repeatable).
    fold_rng = np.random.RandomState(42 + fold)
    shuffled_train_idx = fold_rng.permutation(train_idx)
    n_val = max(1, int(round(cv_validation_fraction * len(shuffled_train_idx))))
    val_idx = shuffled_train_idx[:n_val]
    fit_idx = shuffled_train_idx[n_val:]

    print(f"\nFold {fold + 1}/5 - Start time: {datetime.now().strftime('%H:%M:%S')}")
    print(f"Train size: {len(fit_idx)}, Validation size: {len(val_idx)}, Test size: {len(test_idx)}")

    # Split data for this fold
    fold_train_df = all_df.iloc[fit_idx].reset_index(drop=True)
    fold_val_df = all_df.iloc[val_idx].reset_index(drop=True)
    fold_test_df = all_df.iloc[test_idx].reset_index(drop=True)

    # Tokenize the text columns for this fold
    fold_train_encodings = tokenizer(
        list(fold_train_df["Incident Narrative"].values),
        truncation=True,
        padding=True
    )
    fold_val_encodings = tokenizer(
        list(fold_val_df["Incident Narrative"].values),
        truncation=True,
        padding=True
    )
    fold_test_encodings = tokenizer(
        list(fold_test_df["Incident Narrative"].values),
        truncation=True,
        padding=True
    )

    # Extract labels for this fold
    fold_train_labels = fold_train_df[category_columns].values
    fold_val_labels = fold_val_df[category_columns].values
    fold_test_labels = fold_test_df[category_columns].values

    # Create Dataset objects for this fold
    fold_train_dataset = ArticleDataset(fold_train_encodings, fold_train_labels)
    fold_val_dataset = ArticleDataset(fold_val_encodings, fold_val_labels)
    fold_test_dataset = ArticleDataset(fold_test_encodings, fold_test_labels)

    # Fresh model for this fold; the multi-label objective is requested
    # explicitly rather than inferred from the label dtype.
    fold_model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=len(category_columns),
        problem_type="multi_label_classification",
    )

    fold_training_args = TrainingArguments(
        output_dir=f'D:/Yahya/classification/fold_{fold}_results',
        eval_strategy="epoch",
        save_strategy="epoch",
        # Cap retained checkpoints; otherwise every fold leaves one full
        # checkpoint per epoch on disk.
        save_total_limit=2,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=30,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        logging_dir=f'D:/Yahya/classification/fold_{fold}_logs',
        disable_tqdm=True
    )

    fold_trainer = Trainer(
        model=fold_model,
        args=fold_training_args,
        train_dataset=fold_train_dataset,
        # Checkpoint selection uses the validation subset, never the test fold.
        eval_dataset=fold_val_dataset,
        compute_metrics=compute_metrics
    )

    # Train the model for this fold
    fold_training_start = time.time()
    fold_trainer.train()
    fold_training_end = time.time()
    fold_training_duration = fold_training_end - fold_training_start

    # Evaluate the selected checkpoint on the untouched test portion
    fold_inference_start = time.time()
    fold_results = fold_trainer.evaluate(fold_test_dataset)
    fold_inference_end = time.time()
    fold_inference_duration = fold_inference_end - fold_inference_start

    fold_end_time = time.time()
    fold_total_duration = fold_end_time - fold_start_time
    fold_times.append(fold_total_duration)

    cv_results.append(fold_results)

    print(f"Fold {fold + 1} Results:")
    print(f"  Accuracy: {fold_results['eval_accuracy']:.4f}")
    print(f"  F1: {fold_results['eval_f1']:.4f}")
    print(f"  Precision: {fold_results['eval_precision']:.4f}")
    print(f"  Recall: {fold_results['eval_recall']:.4f}")
    print(f"  Training time: {fold_training_duration:.2f} seconds ({fold_training_duration/60:.2f} minutes)")
    print(f"  Inference time: {fold_inference_duration:.2f} seconds")
    print(f"  Total fold time: {fold_total_duration:.2f} seconds ({fold_total_duration/60:.2f} minutes)")
    print(f"  Inference speed: {len(fold_test_dataset)/fold_inference_duration:.2f} articles/second")

cv_end_time = time.time()
cv_total_duration = cv_end_time - cv_start_time

# Report overall timing for the cross-validation run.
_rule = '=' * 60
_mean_fold_seconds = np.mean(fold_times)
print(f"\n{_rule}")
print("5-FOLD CROSS-VALIDATION RESULTS")
print(f"End time: {datetime.now():%Y-%m-%d %H:%M:%S}")
print(f"Total CV duration: {cv_total_duration:.2f} seconds ({cv_total_duration/60:.2f} minutes)")
print(f"Average time per fold: {_mean_fold_seconds:.2f} seconds ({_mean_fold_seconds/60:.2f} minutes)")
print(_rule)

# Collect each metric across folds, one list per metric.
cv_accuracy, cv_f1, cv_precision, cv_recall = (
    [fold_metrics[metric_key] for fold_metrics in cv_results]
    for metric_key in ('eval_accuracy', 'eval_f1', 'eval_precision', 'eval_recall')
)

# Mean and standard deviation of each metric over the folds.
for _metric_title, _metric_values in (
    ("Accuracy", cv_accuracy),
    ("F1 Score", cv_f1),
    ("Precision", cv_precision),
    ("Recall", cv_recall),
):
    print(f"{_metric_title}: {np.mean(_metric_values):.4f} ± {np.std(_metric_values):.4f}")

# Per-fold breakdown, including each fold's wall-clock time in minutes.
print("\nIndividual Fold Results:")
for fold_no, fold_metrics in enumerate(cv_results, start=1):
    fold_minutes = fold_times[fold_no - 1] / 60
    print(f"Fold {fold_no}: Acc={fold_metrics['eval_accuracy']:.4f}, F1={fold_metrics['eval_f1']:.4f}, "
          f"Prec={fold_metrics['eval_precision']:.4f}, Rec={fold_metrics['eval_recall']:.4f}, Time={fold_minutes:.2f}min")

print("\nNote: Cross-validation addresses the limitation of the original 90%/10% split")
print(f"which resulted in only 43 test articles. This approach uses all {len(all_df)} articles")
print("for evaluation across 5 folds, providing more robust statistical significance.")

# Assemble a one-row summary of the experiment and write it to CSV.
# Insertion order below determines the CSV column order.
cv_summary = {}
for _metric_label, _metric_scores in (
    ("Accuracy", cv_accuracy),
    ("F1", cv_f1),
    ("Precision", cv_precision),
    ("Recall", cv_recall),
):
    cv_summary[f'Mean_{_metric_label}'] = np.mean(_metric_scores)
    cv_summary[f'Std_{_metric_label}'] = np.std(_metric_scores)

# Timing of the initial train/test run.
cv_summary['Initial_Training_Time_Seconds'] = training_duration
cv_summary['Initial_Training_Time_Minutes'] = training_duration/60
cv_summary['Test_Inference_Time_Seconds'] = test_inference_duration
cv_summary['Test_Inference_Speed_Articles_Per_Second'] = len(test_dataset)/test_inference_duration

# Timing of the cross-validation run.
cv_summary['CV_Total_Time_Seconds'] = cv_total_duration
cv_summary['CV_Total_Time_Minutes'] = cv_total_duration/60
cv_summary['CV_Average_Fold_Time_Seconds'] = np.mean(fold_times)
cv_summary['CV_Average_Fold_Time_Minutes'] = np.mean(fold_times)/60

# Dataset sizes and the timestamp of this summary.
cv_summary['Total_Articles'] = len(all_df)
cv_summary['Test_Set_Size'] = len(test_dataset)
cv_summary['Experiment_Date'] = datetime.now().strftime('%Y-%m-%d')
cv_summary['Experiment_Time'] = datetime.now().strftime('%H:%M:%S')

_summary_path = "D:/Yahya/classification/cv_results_summary.csv"
cv_summary_df = pd.DataFrame([cv_summary])
cv_summary_df.to_csv(_summary_path, index=False)

print(f"\nCross-validation results saved to: {_summary_path}")

# Final timing summary: initial training, initial test inference, the full
# cross-validation run, and their sum.
_rule = '=' * 60
_experiment_seconds = training_duration + test_inference_duration + cv_total_duration
print(f"\n{_rule}")
print("FINAL TIMING SUMMARY")
print(_rule)
print(f"Initial Training Time: {training_duration/60:.2f} minutes")
print(f"Initial Test Inference Time: {test_inference_duration:.2f} seconds")
print(f"5-Fold CV Total Time: {cv_total_duration/60:.2f} minutes")
print(f"Total Experiment Time: {_experiment_seconds/60:.2f} minutes")
print(_rule)