<a href="https://colab.research.google.com/github/sj-minRva/Cancer-Classification/blob/mybranch-Rhea/XGB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:

import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
import numpy as np

# Path to CSV created by R.
csv_path = r"/content/BRCA_assay_1.csv"

if not os.path.exists(csv_path):
    raise FileNotFoundError(f"CSV not found: {csv_path}")

df = pd.read_csv(csv_path, index_col=0)  # we wrote row.names = TRUE in R, so index_col=0 is often desired
print("Loaded CSV shape:", df.shape)
print("Columns:", df.columns.tolist()[:20])

# --- Infer target column ---
possible_targets = [c for c in df.columns if c.lower() in ("label", "target", "y", "class", "phenotype", "status")]
if len(possible_targets) >= 1:
    target_col = possible_targets[0]
    print(f"Auto-detected target column: {target_col}")
else:
    # fallback: use last column as target (common in some exports). Change manually if incorrect.
    target_col = df.columns[-1]
    print(f"No common target name detected. Using last column as target: {target_col} (change manually if wrong)")

# Optionally inspect unique values of target
print("Target value counts:\n", df[target_col].value_counts(dropna=False))

# --- Prepare X and y ---
# If your features are genes (rows=genes?), ensure df is samples x features. If not, transpose:
# If number of rows << number of columns and rownames are genes, you may need: df = df.T
# Quick heuristic: if index looks like gene names ("TP53", "BRCA1") then probably you need to transpose:
def looks_like_genes(index):
    # simple heuristic: if most index entries contain letters and digits and are short tokens
    return np.mean([any(c.isalpha() for c in str(x)) for x in index]) > 0.7 and len(index) > 50

if looks_like_genes(df.index):
    print("Heuristic suggests rows are features (genes). Transposing to make samples = rows.")
    df = df.T
    print("New shape after transpose:", df.shape)

# Re-evaluate target existence after transpose
if target_col not in df.columns:
    # try to find it among the new columns or index
    print("After possible transpose, target column not found. Searching again...")
    poss = [c for c in df.columns if c.lower() in ("label","target","y","class","phenotype","status")]
    if poss:
        target_col = poss[0]
        print("Found target:", target_col)
    else:
        # if target is stored in the original rownames, user intervention may be required
        print("Warning: target column still not found. You must set 'target_col' variable manually to the correct column name.")
        print("Available columns:", df.columns.tolist()[:50])
        raise SystemExit("Stop: target column not found automatically. Edit the script to set target_col correctly.")

# Drop rows with missing target
df = df.dropna(subset=[target_col])

X = df.drop(columns=[target_col])
y = df[target_col].copy()

# Convert non-numeric columns (features) with LabelEncoder if few unique values; otherwise attempt numeric conversion
for col in X.columns:
    if X[col].dtype == object or X[col].dtype.name == 'category':
        # if low cardinality treat as categorical label
        if X[col].nunique() <= 20:
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col].astype(str))
        else:
            # attempt to coerce to numeric
            X[col] = pd.to_numeric(X[col], errors='coerce')

# Convert y if categorical
if y.dtype == object or y.dtype.name == 'category':
    y = LabelEncoder().fit_transform(y.astype(str))
else:
    # if numeric but floats like 0.0/1.0 convert to ints
    if pd.api.types.is_float_dtype(y):
        if set(y.dropna().unique()).issubset({0.0,1.0}):
            y = y.astype(int)

# Fill remaining missing values in X (simple strategy)
if X.isnull().any().any():
    print("Filling missing values with column medians (simple strategy).")
    X = X.fillna(X.median(numeric_only=True))

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y if len(np.unique(y))>1 else None)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# Train a basic XGBoost classifier
model = XGBClassifier(n_estimators=100, max_depth=6, use_label_encoder=False, eval_metric="logloss", random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Test accuracy: {acc:.4f}")
print(classification_report(y_test, y_pred))

# Feature importance (top 20)
importances = model.feature_importances_
feat_imp = pd.Series(importances, index=X.columns).sort_values(ascending=False)
print("Top features:\n", feat_imp.head(20))


Loaded CSV shape: (2700, 1196)
Columns: ['TCGA-A8-A06Z-01A-11R-A00Z-07', 'TCGA-A8-A08F-01A-11R-A00Z-07', 'TCGA-BH-A0DZ-01A-11R-A00Z-07', 'TCGA-AN-A03Y-01A-21R-A00Z-07', 'TCGA-A8-A09A-01A-11R-A00Z-07', 'TCGA-AN-A0AK-01A-21R-A00Z-07', 'TCGA-A8-A06N-01A-11R-A00Z-07', 'TCGA-A8-A07P-01A-11R-A00Z-07', 'TCGA-A7-A0DB-01A-11R-A00Z-07', 'TCGA-A8-A06Y-01A-21R-A00Z-07', 'TCGA-A8-A07I-01A-11R-A00Z-07', 'TCGA-A8-A094-01A-11R-A00Z-07', 'TCGA-A7-A0CJ-01A-21R-A00Z-07', 'TCGA-BH-A0BV-01A-11R-A00Z-07', 'TCGA-A8-A0A7-01A-11R-A00Z-07', 'TCGA-A8-A08H-01A-21R-A00Z-07', 'TCGA-A8-A07L-01A-11R-A00Z-07', 'TCGA-A8-A08I-01A-11R-A00Z-07', 'TCGA-A8-A086-01A-11R-A00Z-07', 'TCGA-A8-A08J-01A-11R-A00Z-07']
No common target name detected. Using last column as target: TCGA-BH-A0H9-11A-22R-A466-07 (change manually if wrong)
Target value counts:
 TCGA-BH-A0H9-11A-22R-A466-07
0.0       161
1.0        33
2.0        24
6.0        23
3.0        23
         ... 
174.0       1
812.0       1
2416.0      1
4585.0      1
694.0      

AttributeError: 'float' object has no attribute 'lower'