In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from imblearn.over_sampling import SMOTE

In [2]:
df = pd.read_csv('../Dataframes/reduced_merge1_corr.csv')


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from imblearn.over_sampling import SMOTE

# =======================================================
# 1. Define features and new target (cancer_type)
# =======================================================
columns_to_drop = ['Unnamed: 0', 'cancer', 'cancer_type', 'mutational_subclass']

X = df.drop(columns=columns_to_drop)
y = df["mutational_subclass"]   # New target column

# Encode target (convert cancer_type → numeric labels)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
print("Classes:", label_encoder.classes_)

# =======================================================
# 2. Remove constant features
# =======================================================
var_filter = VarianceThreshold(threshold=0.0)
X = var_filter.fit_transform(X)
print(f"Removed {len(var_filter.get_support()) - sum(var_filter.get_support())} constant features.")

# Verify all features are numeric
non_numeric_cols = pd.DataFrame(X).select_dtypes(include=['object', 'category']).columns
if len(non_numeric_cols) > 0:
    raise ValueError(f"Non-numeric feature columns detected: {non_numeric_cols}. Please encode or drop them.")

# =======================================================
# 3. Train/Test split (stratified by cancer_type)
# =======================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# =======================================================
# 4. Handle imbalance with SMOTE
# =======================================================
smote = SMOTE(random_state=42, k_neighbors=1)
X_train, y_train = smote.fit_resample(X_train, y_train)
print(f"After SMOTE, training set size: {X_train.shape}")

# =======================================================
# 5. Scale features
# =======================================================
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# =======================================================
# 6. Feature selection
# =======================================================
k = min(200, X.shape[1])
selector = SelectKBest(f_classif, k=k)
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

# =======================================================
# 7. Define classifiers
# =======================================================
classifiers = {
    "Random Forest": RandomForestClassifier(
        n_estimators=200, random_state=42, class_weight='balanced'
    ),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Logistic Regression": LogisticRegression(
        multi_class='multinomial', max_iter=1000, random_state=42, class_weight='balanced'
    ),
    "SVM": SVC(kernel='linear', random_state=42, class_weight='balanced')
}

# =======================================================
# 8. Train & Evaluate
# =======================================================
f1_scores = {}
for name, clf in classifiers.items():
    print(f"\n### Evaluating {name} ###")

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print(f"Classification Report for {name}:\n",
          classification_report(y_test, y_pred, zero_division=0, target_names=label_encoder.classes_))
    print(f"Confusion Matrix for {name}:\n",
          confusion_matrix(y_test, y_pred))

    f1_scores[name] = f1_score(y_test, y_pred, average='weighted')

# =======================================================
# 9. Compare F1 Scores
# =======================================================
print("\nWeighted F1-Scores:")
for name, score in f1_scores.items():
    print(f"{name}: {score:.4f}")


Classes: [' EGFR' ' EGFR, MET' ' HER2+' ' HER2+, PIK3CA' ' KRAS' ' KRAS, MET'
 ' MET' ' PIK3CA' ' Triple Negative' ' wt']
Removed 0 constant features.


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.