In [2]:
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m897.5/897.5 kB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [4]:

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, BioGptTokenizer, BioGptForCausalLM
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Load data
train_variants = pd.read_csv("training_variants.csv")
train_text = pd.read_csv("training_text.csv", sep='\|\|', engine='python', header=None, skiprows=1, names=['ID', 'Text'])
train = pd.merge(train_variants, train_text, on='ID', how='inner')


# Remove empty texts
train = train[train['Text'] != ""]

# Ensure labels are between 1 and 9
assert train['Class'].min() == 1 and train['Class'].max() == 9, "Class labels are not in the range 1-9"

# Ensure all text values are strings
train['Text'] = train['Text'].astype(str).fillna("")

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")





In [None]:
def basic_stats(df):
    print("Dataset Shape", df.shape)
    print(f"\nMissing Vlaues: \n{df.isnull().sum()} ")
    print(f"\nClass Distribution:{df['Class'].value_counts().sort_index()}")

    #Gene statistics
    print("\nNumber of unique genes: ", df['Gene'].nunique())
    print(f"\nTop 10 most common genes:{df['Gene'].value_counts().head(10)} ")

    #Variation statistics
    print(f"\nNumber of unique variations:{df['Variation'].nunique()}")
    print(f"\nTop 10 most common variations: {df['Variation'].value_counts().head(10)}")

print("=== Basic Statistics ===")
basic_stats(train)

In [5]:


# Count the occurrences of each class label
class_counts = train['Class'].value_counts()

class_counts

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
7,953
4,686
1,568
2,452
6,275
5,242
3,89
9,37
8,19


In [None]:
train

Unnamed: 0,ID,Gene,Variation,Class,Text
0,0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...
...,...,...,...,...,...
3316,3316,RUNX1,D171N,4,Introduction Myelodysplastic syndromes (MDS) ...
3317,3317,RUNX1,A122*,1,Introduction Myelodysplastic syndromes (MDS) ...
3318,3318,RUNX1,Fusions,1,The Runt-related transcription factor 1 gene (...
3319,3319,RUNX1,R80C,4,The RUNX1/AML1 gene is the most frequent targe...


In [None]:
Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load BioGPT tokenizer and model for text generation
biogpt_model_name = "microsoft/biogpt"
biogpt_tokenizer = BioGptTokenizer.from_pretrained(biogpt_model_name)
biogpt_model = BioGptForCausalLM.from_pretrained(biogpt_model_name).to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/927k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/696k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/595 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

In [8]:

# Update generate_text with additional parameters
def generate_text(prompt, max_new_tokens=150, temperature=0.7, top_p=0.95):
    inputs = biogpt_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).input_ids.to(device)

    try:
        outputs = biogpt_model.generate(
            inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            num_return_sequences=1,
            do_sample=True
        )
        return biogpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Generation error: {str(e)}")
        return prompt


In [9]:
def augment_dataset(df, start_id=3221):
    augmented_data = []
    class_counts = df['Class'].value_counts().sort_values(ascending=True)
    print("Original class distribution:")
    print(class_counts)

    target_multipliers = {
        8: 20, 9: 10, 3: 5, 5: 3, 6: 2,
        2: 1.5, 1: 1.2, 4: 1.1, 7: 1
    }

    current_generated_id = start_id

    for class_id, multiplier in target_multipliers.items():
        # Get original samples for this class
        class_df = df[df['Class'] == class_id]
        original_count = len(class_df)

        # Always keep original samples
        augmented_data.append(class_df)

        # Calculate target including original samples
        target_count = min(int(original_count * multiplier), 1000)
        needed = max(0, target_count - original_count)

        if needed == 0:
            print(f"Class {class_id} already sufficient: {original_count} samples")
            continue

        print(f"Augmenting class {class_id} from {original_count} to {target_count} (+{needed})")
        generated_count = 0
        attempts = 0
        max_attempts = needed * 2

        while generated_count < needed and attempts < max_attempts:
            sample = class_df.sample(1)
            prompt = (
                f"Generate clinical text about {sample['Gene'].values[0]} "
                f"{sample['Variation'].values[0]} for class {class_id}. "
                f"Context: {sample['Text'].values[0][:500]}"
            )

            generated_text = generate_text(prompt)

            if generated_text and generated_text != prompt:
                new_entry = sample.copy()
                new_entry['Text'] = generated_text
                new_entry['Generated_ID'] = current_generated_id
                current_generated_id += 1
                augmented_data.append(new_entry)
                generated_count += 1

            attempts += 1

        print(f"Added {generated_count} new samples for class {class_id} (Total: {original_count + generated_count})")

    final_df = pd.concat(augmented_data)
    print("\nFinal class distribution:")
    print(final_df['Class'].value_counts().sort_values(ascending=True))
    return final_df

In [10]:
# Augment the dataset while preserving gene-variation relationships
augmented_train = augment_dataset(train)



Original class distribution:
Class
8     19
9     37
3     89
5    242
6    275
2    452
1    568
4    686
7    953
Name: count, dtype: int64
Augmenting class 8 from 19 to 380 (+361)


model.safetensors:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

Added 361 new samples for class 8 (Total: 380)
Augmenting class 9 from 37 to 370 (+333)
Added 333 new samples for class 9 (Total: 370)
Augmenting class 3 from 89 to 445 (+356)
Added 356 new samples for class 3 (Total: 445)
Augmenting class 5 from 242 to 726 (+484)
Added 484 new samples for class 5 (Total: 726)
Augmenting class 6 from 275 to 550 (+275)
Added 275 new samples for class 6 (Total: 550)
Augmenting class 2 from 452 to 678 (+226)
Added 226 new samples for class 2 (Total: 678)
Augmenting class 1 from 568 to 681 (+113)
Added 113 new samples for class 1 (Total: 681)
Augmenting class 4 from 686 to 754 (+68)
Added 68 new samples for class 4 (Total: 754)
Class 7 already sufficient: 953 samples

Final class distribution:
Class
9    370
8    380
3    445
6    550
2    678
1    681
5    726
4    754
7    953
Name: count, dtype: int64


In [11]:

# Split the augmented data

# Split dataset into training and validation sets (80-20 split)
augment_texts, valaug_texts, augment_labels, valaug_labels = train_test_split(
    augmented_train['Text'],
    augmented_train['Class'],
    test_size=0.2,
    stratify=augmented_train['Class'],
    random_state=42
)


In [12]:
# Load BioBERT tokenizer and model
model_name = 'monologg/biobert_v1.1_pubmed'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Add missing terms to tokenizer
missing_terms = ["dna", "fig", "missense", "mutants", "supplementary"]
tokenizer.add_tokens(missing_terms)

pretrained_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=9).to(device)
pretrained_model.resize_token_embeddings(len(tokenizer))




tokenizer_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/biobert_v1.1_pubmed and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(29001, 768, padding_idx=0)

# gradual unfreezing

In [13]:

# Unfreeze last few layers for fine-tuning
def unfreeze_layers1(model, num_epochs, freeze_at_epoch=7):
    for name, param in model.named_parameters():
        # Initially freeze all layers
        param.requires_grad = False

    # Unfreeze layers gradually after certain epochs
    if num_epochs > freeze_at_epoch:
        for name, param in model.named_parameters():
            if 'encoder.layer.10' in name or 'encoder.layer.11' in name:  # Example for unfreezing last layers
                param.requires_grad = True

    return model


In [14]:
import torch.nn as nn # Import the nn module from PyTorch
import torch.nn.functional as F
# Weighted Focal Loss
class WeightedFocalLoss(nn.Module):
    def __init__(self, alpha=0.5, gamma=2, device='cuda'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.device = device

    def forward(self, inputs, targets):
        targets = targets.to(self.device)
        ce_loss = F.cross_entropy(inputs, targets.long(), reduction="none")  # Ensure targets are long
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        weighted_loss = focal_loss * class_weights[targets]
        return weighted_loss.mean()

# Define loss function
loss_fn = WeightedFocalLoss(alpha=0.5, gamma=3, device=device)

In [15]:
# Custom Trainer class
class CustomLossTrainer(Trainer):
    def __init__(self, *args, loss_fn=None, **kwargs):
        super().__init__(*args, **kwargs)
        if loss_fn is None:
            raise ValueError("A custom loss function must be provided!")
        self.loss_fn = loss_fn
        self.label_names = ["labels"]
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss = self.loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss


In [16]:
# Dataset class
class MutationDataset(Dataset):
    def __init__(self, texts, labels):
        # Convert texts to a list of strings if it's not already
        if not isinstance(texts, list):
            texts = texts.tolist()  # Convert Series or other objects to a list

        self.encodings = tokenizer(
            texts, truncation=True, padding="max_length", max_length=512, return_tensors='pt', return_token_type_ids=False,
        )
        # Convert 1-9 to 0-8 for each label in the list using list comprehension
        self.labels = [label - 1 for label in labels]


    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx].clone().detach().to(device),
            'attention_mask': self.encodings['attention_mask'][idx].clone().detach().to(device),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long).to(device)  # Ensure dtype is long
        }

    def __len__(self):
        return len(self.labels)

# Create train and validation datasets
train_dataset = MutationDataset(augment_texts, augment_labels)
val_dataset = MutationDataset(valaug_texts, valaug_labels)


In [17]:

# Compute class weights
labels_array = np.array(augment_labels) - 1  # Convert train_labels to a NumPy array before subtraction
classes = np.unique(labels_array)
cw = compute_class_weight('balanced', classes=classes, y=labels_array)
class_weights = torch.tensor(cw, dtype=torch.float).to(device)
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=11,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    fp16=True,
    logging_dir='./logs',
    report_to='none',
    dataloader_pin_memory=False
)

# Reload model for each fold and unfreeze layers gradually
pretrained_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=9).to(device)
pretrained_model.resize_token_embeddings(len(tokenizer))

#Unfreeze layers after certain epochs
pretrained_model = unfreeze_layers1(pretrained_model, num_epochs=30)


# Trainer
trainer = CustomLossTrainer(
    model=pretrained_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=lambda p: {
        'accuracy': accuracy_score(p.label_ids, p.predictions.argmax(-1)),
        'f1': f1_score(p.label_ids, p.predictions.argmax(-1), average='weighted'),
        'precision': precision_score(p.label_ids, p.predictions.argmax(-1), average='weighted'),
        'recall': recall_score(p.label_ids, p.predictions.argmax(-1), average='weighted')
    },
    loss_fn=loss_fn
)

# Train & Evaluate
trainer.train()
results = trainer.evaluate()
print("Validation Results:", results)


# Save the model weights and configuration
trainer.save_model('./results')

# In your training script after tokenizer modifications
tokenizer.save_pretrained('./results')  # Save modified tokenizer with custom tokens



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/biobert_v1.1_pubmed and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6872,0.377788,0.5713,0.566471,0.692259,0.5713
2,0.2961,0.201473,0.703971,0.709154,0.761104,0.703971
3,0.2083,0.181224,0.725632,0.729788,0.788221,0.725632
4,0.1708,0.162237,0.738267,0.738758,0.74761,0.738267
5,0.1661,0.153181,0.727437,0.730488,0.745248,0.727437
6,0.1404,0.149888,0.74639,0.750417,0.771452,0.74639
7,0.1375,0.149639,0.741877,0.742293,0.745005,0.741877
8,0.1226,0.143346,0.759025,0.760927,0.772775,0.759025
9,0.1182,0.141719,0.752708,0.753052,0.753827,0.752708
10,0.1019,0.143948,0.74278,0.742726,0.745778,0.74278


Validation Results: {'eval_loss': 0.14270511269569397, 'eval_accuracy': 0.7527075812274369, 'eval_f1': 0.7538164420705623, 'eval_precision': 0.7578652657678782, 'eval_recall': 0.7527075812274369, 'eval_runtime': 8.4098, 'eval_samples_per_second': 131.75, 'eval_steps_per_second': 16.528, 'epoch': 11.0}


('./results/tokenizer_config.json',
 './results/special_tokens_map.json',
 './results/vocab.txt',
 './results/added_tokens.json')

In [18]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
from tqdm import tqdm
import gc

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load FINE-TUNED model and tokenizer with clear naming
model_path = './results'
bert_tokenizer = BertTokenizer.from_pretrained(model_path)
bert_model = BertForSequenceClassification.from_pretrained(model_path).to(device)

def clear_memory():
    torch.cuda.empty_cache()
    gc.collect()

def get_embeddings_batched(texts, batch_size=16):
    """Get [CLS] embeddings using fine-tuned BERT"""
    bert_model.eval()  # Set to evaluation mode
    embeddings = []

    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size)):
            batch = texts[i:i+batch_size]

            # Tokenize with BERT tokenizer
            inputs = bert_tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt"
            ).to(device)

            # Get hidden states from BERT base
            outputs = bert_model.bert(**inputs)

            # Extract CLS embeddings
            cls_emb = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(cls_emb)

            # Clean memory
            del inputs, outputs
            clear_memory()

    return np.vstack(embeddings)

# Usage
clear_memory()

In [20]:
from sklearn.preprocessing import LabelEncoder
import joblib
# Label encode categorical features
le_gene = LabelEncoder()
le_variation = LabelEncoder()


augmented_train['Gene_encoded'] = le_gene.fit_transform(augmented_train['Gene'])
joblib.dump(le_gene, 'gene_label_encoder.pkl')  # Save for later

augmented_train['Variation_encoded'] = le_variation.fit_transform(augmented_train['Variation'])
joblib.dump(le_variation, 'variation_label_encoder.pkl')  # Save for later

['variation_label_encoder.pkl']

In [21]:
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss, f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from collections import Counter
import xgboost as xgb
from imblearn.over_sampling import SMOTE

# Ensure classes are 0-based
augmented_train['Class'] = augmented_train['Class'] - 1  # If classes start at 1

# Generate embeddings for the entire augmented dataset
all_embeddings = get_embeddings_batched(augmented_train['Text'].tolist(), batch_size=16)

# Now create X_all using the entire augmented_train DataFrame
X_all = np.hstack([all_embeddings, augmented_train[['Gene_encoded', 'Variation_encoded']].values])
y_all = augmented_train['Class'].values # Use values to get a NumPy array




100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 347/347 [13:52<00:00,  2.40s/it]


In [22]:
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss, accuracy_score, f1_score

# Assuming X_all and y_all are your features and labels
X = X_all  # Shape (5000, n_features)
y = y_all  # Shape (5000,)

# ========== CRUCIAL FIX ==========
# Validate and remap labels to 0-based indexing
unique_classes = np.unique(y)
print(f"Original classes: {unique_classes}")

# Create label mapping (e.g., [1,2,3] -> [0,1,2])
label_mapping = {orig: new for new, orig in enumerate(unique_classes)}
y_remapped = np.vectorize(label_mapping.get)(y)

# Verify new labels
print(f"Remapped classes: {np.unique(y_remapped)}")
num_classes = len(unique_classes)
# =================================

# Initialize components
scaler = StandardScaler()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\nFold {fold + 1}")

    # Split data
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y_remapped[train_idx], y_remapped[val_idx]  # Use remapped labels

    # Scale features
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Create DMatrix objects
    dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
    dval = xgb.DMatrix(X_val_scaled, label=y_val)

    # XGBoost parameters (MUST match actual class count)
    params = {
        'objective': 'multi:softprob',
        'num_class': num_classes,  # Use actual count from data
        'tree_method': 'hist',
        'max_depth': 4,
        'learning_rate': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': 42
    }

    # Train model
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=200,
        evals=[(dtrain, 'train'), (dval, 'val')],
        early_stopping_rounds=20,
        verbose_eval=50
    )

    # Predictions
    y_pred_proba = model.predict(dval)
    y_pred = np.argmax(y_pred_proba, axis=1)

    # Calculate metrics
    fold_metrics = {
        'log_loss': log_loss(y_val, y_pred_proba),
        'accuracy': accuracy_score(y_val, y_pred),
        'f1_macro': f1_score(y_val, y_pred, average='macro'),
        'f1_weighted': f1_score(y_val, y_pred, average='weighted')
    }

    print(f"\nFold {fold + 1} Evaluation:")
    for metric, value in fold_metrics.items():
        print(f"{metric}: {value:.4f}")

    fold_results.append(fold_metrics)

# Cross-validation summary
print("\n" + "=" * 40)
print("Final Cross-Validation Results:")
print("=" * 40)
for metric in ['log_loss', 'accuracy', 'f1_macro', 'f1_weighted']:
    scores = [res[metric] for res in fold_results]
    print(f"{metric:15}: {np.mean(scores):.4f} ¬± {np.std(scores):.4f}")

Original classes: [0 1 2 3 4 5 6 7 8]
Remapped classes: [0 1 2 3 4 5 6 7 8]

Fold 1
[0]	train-mlogloss:1.89245	val-mlogloss:1.89912
[50]	train-mlogloss:0.30775	val-mlogloss:0.50783
[75]	train-mlogloss:0.25517	val-mlogloss:0.51194

Fold 1 Evaluation:
log_loss: 0.5119
accuracy: 0.8204
f1_macro: 0.8389
f1_weighted: 0.8212

Fold 2
[0]	train-mlogloss:1.89079	val-mlogloss:1.90653
[50]	train-mlogloss:0.29753	val-mlogloss:0.57069
[71]	train-mlogloss:0.25155	val-mlogloss:0.57788

Fold 2 Evaluation:
log_loss: 0.5786
accuracy: 0.7879
f1_macro: 0.8129
f1_weighted: 0.7886

Fold 3
[0]	train-mlogloss:1.88996	val-mlogloss:1.90108
[50]	train-mlogloss:0.30626	val-mlogloss:0.52410
[76]	train-mlogloss:0.25433	val-mlogloss:0.53109

Fold 3 Evaluation:
log_loss: 0.5311
accuracy: 0.8220
f1_macro: 0.8454
f1_weighted: 0.8239

Fold 4
[0]	train-mlogloss:1.88999	val-mlogloss:1.91007
[50]	train-mlogloss:0.30345	val-mlogloss:0.53461
[81]	train-mlogloss:0.24139	val-mlogloss:0.53361

Fold 4 Evaluation:
log_loss: 0.533

In [23]:
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
import pickle

# Assuming X_all and y_all are your features and labels
X = X_all  # Shape (5000, n_features)
y = y_all  # Shape (5000,)

# ========== Label Remapping ==========
unique_classes = np.unique(y)
print(f"Original classes: {unique_classes}")

# Create and apply label mapping
label_mapping = {orig: new for new, orig in enumerate(unique_classes)}
y_remapped = np.vectorize(label_mapping.get)(y)
print(f"Remapped classes: {np.unique(y_remapped)}")
num_classes = len(unique_classes)

# ========== Feature Scaling ==========
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ========== Full Dataset Training ==========
# Create DMatrix for full dataset
dtrain = xgb.DMatrix(X_scaled, label=y_remapped)

# Parameters (keep same as before)
params = {
    'objective': 'multi:softprob',
    'num_class': num_classes,
    'tree_method': 'hist',
    'max_depth': 4,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42
}

# Train final model on all data
final_model = xgb.train(
    params,
    dtrain,
    num_boost_round=200  # Use optimal rounds from CV if available
)

# ========== Save Components (JSON version) ==========
# Save the trained model in JSON format
final_model.save_model('xgb_model.json')  # This saves as JSON by default when using .json extension

# Save the scaler and mapping using joblib (better than pickle for sklearn objects)
from joblib import dump

dump(scaler, 'standard_scaler.joblib')
dump(label_mapping, 'label_mapping.joblib')

print("Training completed and components saved in JSON/Joblib format!")

Original classes: [0 1 2 3 4 5 6 7 8]
Remapped classes: [0 1 2 3 4 5 6 7 8]
Training completed and components saved in JSON/Joblib format!


In [26]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
import xgboost as xgb
import joblib
from transformers import BertTokenizer  # Added missing import


# === Main Prediction Pipeline ===
# Load data
test_variants = pd.read_csv("test_variants.csv")
test_text = pd.read_csv("test_text.csv", sep='\|\|', names=['ID', 'Text'], skiprows=1)
test = pd.merge(test_variants, test_text, on='ID')

# === FIX: Remove rows with invalid 'Text' values ===
# Keep only rows where 'Text' is a non-empty string
test = test[test['Text'].apply(lambda x: isinstance(x, str) and x.strip() != "")]

# Generate BERT embeddings
# Generate BERT embeddings
text_embeddings = get_embeddings_batched(test['Text'].tolist())



# Load encoders and encode features
le_gene = joblib.load('gene_label_encoder.pkl')
le_variation = joblib.load('variation_label_encoder.pkl')

test['Gene_encoded'] = test['Gene'].apply(lambda x: le_gene.transform([x])[0] if x in le_gene.classes_ else -1)
test['Variation_encoded'] = test['Variation'].apply(lambda x: le_variation.transform([x])[0] if x in le_variation.classes_ else -1)

# Combine features
X_test = np.hstack([
    text_embeddings,
    test[['Gene_encoded', 'Variation_encoded']].values
])

# Apply scaling
scaler = joblib.load('standard_scaler.joblib')
X_test_scaled = scaler.transform(X_test)

# Make predictions
xgb_model = xgb.Booster()
xgb_model.load_model('xgb_model.json')
dtest = xgb.DMatrix(X_test_scaled)
test_probs = xgb_model.predict(dtest)

# Re-map probabilities to original class order
label_mapping = joblib.load('label_mapping.joblib')
original_classes = sorted(label_mapping.keys())
submission_probs = test_probs[:, np.argsort(original_classes)]

# Create submission
submission = pd.DataFrame({
    'ID': test['ID'],
    **{f'Class{cls}': submission_probs[:, i]
       for i, cls in enumerate(sorted(original_classes))}
})
submission.to_csv('submission.csv', index=False)



  test_text = pd.read_csv("test_text.csv", sep='\|\|', names=['ID', 'Text'], skiprows=1)
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 355/355 [19:45<00:00,  3.34s/it]


In [1]:
!nvidia-smi

Sun Feb 23 21:19:02 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   55C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                