<a href="https://colab.research.google.com/github/sj-minRva/Cancer-Classification/blob/mybranch-Rhea/XGB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# 2B: No metadata available â€” infer sample type from TCGA barcodes and train XGBoost
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

expr_csv_path = r"/content/BRCA_assay_1.csv"   # genes x samples

expr = pd.read_csv(expr_csv_path, index_col=0)  # rows = genes, cols = samples (TCGA barcodes)
# if rows > cols, transpose
if expr.shape[0] > expr.shape[1]:
    expr = expr.T

# At this point expr: samples x features
expr.index = expr.index.astype(str).str.strip()

# function to parse TCGA barcode to get sample type code (first two chars of 4th group)
def get_sample_type_code(barcode):
    # safe parsing: split on '-' and attempt to take 4th group
    parts = barcode.split('-')
    if len(parts) >= 4 and len(parts[3]) >= 2:
        return parts[3][:2]
    # some barcodes may use '.' or be shorter, try other heuristics
    # fall back: search for a pattern like '01A' in barcode
    import re
    m = re.search(r'-(\d{2}[A-Z0-9])-', barcode)
    if m:
        return m.group(1)[:2]
    return None

# TCGA sample type code mapping (common codes)
sample_type_map = {
    '01': 'Primary_Tumor',
    '02': 'Recurrent_Tumor',
    '03': 'Primary_Blood_Derived_Tumor',
    '04': 'Metastatic',
    '05': 'Additional_Metastatic',
    '06': 'Metastatic_Recurrent',
    '07': 'Xenograft',
    '08': 'Cell_Line',
    '09': 'Primary_Microdissected',
    '10': 'Blood_Derived_Normal',
    '11': 'Solid_Tissue_Normal',
    '12': 'Buccal_Cell_Normal',
    # add more mappings if needed
}

# create metadata DataFrame
meta_inferred = pd.DataFrame(index=expr.index)
meta_inferred['sample_type_code'] = [get_sample_type_code(b) for b in expr.index]
meta_inferred['sample_type'] = meta_inferred['sample_type_code'].map(sample_type_map).fillna('Unknown')

print("Inferred sample types counts:\n", meta_inferred['sample_type'].value_counts())

# Option: collapse to binary Primary_Tumor vs Normal
meta_inferred['binary_label'] = meta_inferred['sample_type'].apply(lambda x: 'Primary_Tumor' if x == 'Primary_Tumor' else ('Normal' if x in ('Solid_Tissue_Normal','Blood_Derived_Normal') else 'Other'))

print("Binary counts:\n", meta_inferred['binary_label'].value_counts())

# Choose which label to use: 'sample_type' (multi) or 'binary_label' (recommended)
label_choice = 'binary_label'   # or 'sample_type'

y = meta_inferred[label_choice].copy()
# Filter to keep only Primary_Tumor vs Normal for cleaner classification if you want
keep = y.isin(['Primary_Tumor','Normal'])
expr = expr.loc[keep]
y = y.loc[keep]

print("Samples after filtering to Primary_Tumor/Normal:", expr.shape[0])

# encode labels
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Fill missing values
X = expr.fillna(expr.median(numeric_only=True))

# train/test split (stratify if possible)
strat = y_enc if len(np.unique(y_enc))>1 and min(np.bincount(y_enc))>=2 else None
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.2, random_state=42, stratify=strat)

# Train XGBoost
model = XGBClassifier(n_estimators=100, max_depth=6, use_label_encoder=False, eval_metric="logloss", random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=le.classes_))


Inferred sample types counts:
 sample_type
Primary_Tumor           1076
Solid_Tissue_Normal      113
Metastatic_Recurrent       7
Name: count, dtype: int64
Binary counts:
 binary_label
Primary_Tumor    1076
Normal            113
Other               7
Name: count, dtype: int64
Samples after filtering to Primary_Tumor/Normal: 1189


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.9831932773109243
               precision    recall  f1-score   support

       Normal       0.95      0.87      0.91        23
Primary_Tumor       0.99      1.00      0.99       215

     accuracy                           0.98       238
    macro avg       0.97      0.93      0.95       238
 weighted avg       0.98      0.98      0.98       238

