In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
df = pd.read_csv("merge.csv")

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Columns: 57741 entries, Unnamed: 0 to cancer
dtypes: int64(57737), object(4)
memory usage: 125.6+ MB


In [9]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,...,ENSG00000273486,ENSG00000273487,ENSG00000273488,ENSG00000273489,ENSG00000273492,ENSG00000273493,patient_id,cancer_type,mutational_subclass,cancer
0,3-Breast-Her2-ampl,0,0,44,26,81,171,34,83,0,...,1,0,0,1,4,0,patient id: Breast-03,cancer type: Breast,mutational subclass: HER2+,1
1,8-Breast-WT,0,0,14,1,98,99,0,107,0,...,0,0,0,0,0,0,patient id: Breast-08,cancer type: Breast,mutational subclass: wt,1
2,10-Breast-Her2-ampl,0,0,16,14,18,25,0,62,4,...,0,0,0,0,0,0,patient id: Breast-10,cancer type: Breast,mutational subclass: HER2+,1
3,Breast-100,0,0,8,0,17,4,0,19,0,...,0,0,0,0,0,0,patient id: Breast-100,cancer type: Breast,mutational subclass: Triple Negative,1
4,15-Breast-Her2-ampl,17,0,9,4,0,49,0,40,0,...,0,0,0,0,0,0,patient id: Breast-15,cancer type: Breast,mutational subclass: HER2+,1


In [15]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Load dataset
df = pd.read_csv("merge.csv")

# Inspect dataset
print("Dataset Info:")
print(df.info())
print("\nFirst few rows:")
print(df.head())

# Check association between cancer_type and mutational_subclass
print("\nChecking association between cancer_type and mutational_subclass:")
contingency_table = pd.crosstab(df['cancer_type'], df['mutational_subclass'])
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f"Chi-squared test p-value: {p}")
if p < 0.05:
    print("Strong association detected (p < 0.05). Excluding mutational_subclass to avoid leakage.")
else:
    print("Weak association (p >= 0.05). Mutational_subclass may be a useful feature.")

# Define columns to drop (non-feature columns)
columns_to_drop = ['Unnamed: 0', 'patient_id', 'cancer', 'cancer_type', 'mutational_subclass']
# Note: 'mutational_subclass' is dropped by default to avoid leakage
# Uncomment the section below to include it as a feature

# Features = all columns except non-feature columns
X = df.drop(columns=columns_to_drop)
y = df["cancer_type"]

# Optionally include mutational_subclass as a feature (uncomment if desired)
"""
le_mut = LabelEncoder()
X['mutational_subclass'] = le_mut.fit_transform(df['mutational_subclass'])
columns_to_drop.remove('mutational_subclass')  # Keep it in X
X = df.drop(columns=columns_to_drop)  # Redefine X to include mutational_subclass
print("Included mutational_subclass as a feature (encoded).")
"""

# Verify all features are numeric
non_numeric_cols = X.select_dtypes(include=['object', 'category']).columns
if len(non_numeric_cols) > 0:
    raise ValueError(f"Non-numeric feature columns detected: {non_numeric_cols}. Please encode or drop them.")

# Encode target labels (cancer_type)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Stratified split (preserves class distribution)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Feature selection (pick top 500 features, or fewer if dataset has less)
k = min(500, X.shape[1])  # Ensure k doesn't exceed number of features
selector = SelectKBest(f_classif, k=k)
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

# Define classifiers
classifiers = {
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Logistic Regression": LogisticRegression(multi_class='multinomial', max_iter=1000, random_state=42),
    "SVM": SVC(kernel='linear', random_state=42)
}

# Train, evaluate, and collect F1-scores for comparison
f1_scores = {}
for name, clf in classifiers.items():
    print(f"\n### Evaluating {name} ###")
    
    # Train classifier
    clf.fit(X_train, y_train)
    
    # Predictions
    y_pred = clf.predict(X_test)
    
    # Evaluation
    print(f"Classification Report for {name}:\n", 
          classification_report(y_test, y_pred, target_names=le.classes_))
    print(f"Confusion Matrix for {name}:\n", 
          confusion_matrix(y_test, y_pred))
    
    # Store weighted F1-score
    f1_scores[name] = f1_score(y_test, y_pred, average='weighted')

# Print F1-scores for comparison
print("\nWeighted F1-Scores:")
for name, score in f1_scores.items():
    print(f"{name}: {score:.4f}")

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Columns: 57741 entries, Unnamed: 0 to cancer
dtypes: int64(57737), object(4)
memory usage: 125.6+ MB
None

First few rows:
            Unnamed: 0  ENSG00000000003  ENSG00000000005  ENSG00000000419  \
0   3-Breast-Her2-ampl                0                0               44   
1          8-Breast-WT                0                0               14   
2  10-Breast-Her2-ampl                0                0               16   
3           Breast-100                0                0                8   
4  15-Breast-Her2-ampl               17                0                9   

   ENSG00000000457  ENSG00000000460  ENSG00000000938  ENSG00000000971  \
0               26               81              171               34   
1                1               98               99                0   
2               14               18               25                0   
3                0               17 

  f = msb / msw



### Evaluating Random Forest ###


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report for Random Forest:
                             precision    recall  f1-score   support

       cancer type: Breast       0.67      0.25      0.36         8
          cancer type: CRC       0.21      0.38      0.27         8
          cancer type: GBM       0.29      0.25      0.27         8
           cancer type: HC       0.75      0.55      0.63        11
cancer type: Hepatobiliary       0.00      0.00      0.00         3
         cancer type: Lung       0.35      0.50      0.41        12
     cancer type: Pancreas       0.25      0.29      0.27         7

                  accuracy                           0.37        57
                 macro avg       0.36      0.32      0.32        57
              weighted avg       0.41      0.37      0.37        57

Confusion Matrix for Random Forest:
 [[2 3 0 0 0 1 2]
 [0 3 0 0 0 2 3]
 [0 3 2 1 0 2 0]
 [0 0 1 6 0 4 0]
 [0 0 1 1 0 1 0]
 [1 3 1 0 0 6 1]
 [0 2 2 0 0 1 2]]

### Evaluating KNN ###


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report for KNN:
                             precision    recall  f1-score   support

       cancer type: Breast       0.22      0.50      0.31         8
          cancer type: CRC       0.27      0.38      0.32         8
          cancer type: GBM       0.50      0.25      0.33         8
           cancer type: HC       0.80      0.73      0.76        11
cancer type: Hepatobiliary       0.00      0.00      0.00         3
         cancer type: Lung       0.33      0.17      0.22        12
     cancer type: Pancreas       0.25      0.29      0.27         7

                  accuracy                           0.37        57
                 macro avg       0.34      0.33      0.32        57
              weighted avg       0.39      0.37      0.36        57

Confusion Matrix for KNN:
 [[4 2 0 0 0 0 2]
 [3 3 0 0 0 1 1]
 [2 2 2 1 0 0 1]
 [1 0 0 8 0 1 1]
 [0 0 0 1 0 1 1]
 [6 2 2 0 0 2 0]
 [2 2 0 0 0 1 2]]

### Evaluating Logistic Regression ###
Classification Report for Logi

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from imblearn.over_sampling import SMOTE

# Load dataset
df = pd.read_csv("merge.csv")

# Inspect dataset
print("Dataset Info:")
print(df.info())
print("\nFirst few rows:")
print(df.head())

# Check association between cancer_type and mutational_subclass
print("\nChecking association between cancer_type and mutational_subclass:")
contingency_table = pd.crosstab(df['cancer_type'], df['mutational_subclass'])
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f"Chi-squared test p-value: {p}")
if p < 0.05:
    print("Strong association detected (p < 0.05). Excluding mutational_subclass to avoid leakage.")
else:
    print("Weak association (p >= 0.05). Mutational_subclass may be a useful feature.")

# Define columns to drop (non-feature columns)
columns_to_drop = ['Unnamed: 0', 'patient_id', 'cancer', 'cancer_type', 'mutational_subclass']

# Features = all columns except non-feature columns
X = df.drop(columns=columns_to_drop)
y = df["cancer_type"]

# Remove constant features
var_filter = VarianceThreshold(threshold=0.0)
X = var_filter.fit_transform(X)
print(f"Removed {len(var_filter.get_support()) - sum(var_filter.get_support())} constant features.")

# Verify all features are numeric
non_numeric_cols = pd.DataFrame(X).select_dtypes(include=['object', 'category']).columns
if len(non_numeric_cols) > 0:
    raise ValueError(f"Non-numeric feature columns detected: {non_numeric_cols}. Please encode or drop them.")

# Encode target labels (cancer_type)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Stratified split (preserves class distribution)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42, k_neighbors=1)  # Reduced k_neighbors for small classes
X_train, y_train = smote.fit_resample(X_train, y_train)
print(f"After SMOTE, training set size: {X_train.shape}")

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Feature selection (pick top 200 features to balance informativeness and noise)
k = min(200, X.shape[1])
selector = SelectKBest(f_classif, k=k)
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

# Define classifiers with class weights
classifiers = {
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced'),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Logistic Regression": LogisticRegression(multi_class='multinomial', max_iter=1000, random_state=42, class_weight='balanced'),
    "SVM": SVC(kernel='linear', random_state=42, class_weight='balanced')
}

# Train, evaluate, and collect F1-scores
f1_scores = {}
for name, clf in classifiers.items():
    print(f"\n### Evaluating {name} ###")
    
    # Train classifier
    clf.fit(X_train, y_train)
    
    # Predictions
    y_pred = clf.predict(X_test)
    
    # Evaluation
    print(f"Classification Report for {name}:\n", 
          classification_report(y_test, y_pred, target_names=le.classes_, zero_division=0))
    print(f"Confusion Matrix for {name}:\n", 
          confusion_matrix(y_test, y_pred))
    
    # Store weighted F1-score
    f1_scores[name] = f1_score(y_test, y_pred, average='weighted')

# Print F1-scores for comparison
print("\nWeighted F1-Scores:")
for name, score in f1_scores.items():
    print(f"{name}: {score:.4f}")

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Columns: 57741 entries, Unnamed: 0 to cancer
dtypes: int64(57737), object(4)
memory usage: 125.6+ MB
None

First few rows:
            Unnamed: 0  ENSG00000000003  ENSG00000000005  ENSG00000000419  \
0   3-Breast-Her2-ampl                0                0               44   
1          8-Breast-WT                0                0               14   
2  10-Breast-Her2-ampl                0                0               16   
3           Breast-100                0                0                8   
4  15-Breast-Her2-ampl               17                0                9   

   ENSG00000000457  ENSG00000000460  ENSG00000000938  ENSG00000000971  \
0               26               81              171               34   
1                1               98               99                0   
2               14               18               25                0   
3                0               17 

  1287  1344  1461  1504  1560  1805  2023  2253  2257  2518  2535  2740
  2982  3169  3194  3358  3416  3547  3591  3948  3991  4138  4289  4359
  4497  4581  4751  4920  5100  5107  5167  5168  5176  5194  5221  5239
  5302  5365  5369  5415  5542  5611  5624  5660  5714  5796  6081  6255
  6565  6853  7037  7088  7118  7143  7152  7262  7316  7426  7672  7680
  7689  7700  7758  7788  7887  8062  8197  8300  8395  8396  8533  8599
  8664  8665  8833  8884  8949  9019  9174  9279  9499  9525  9533  9569
  9788  9853 10265 10301 10333 10473 10514 10569 10794 10840 10994 11029
 11044 11058 11107 11122 11143 11215 11340 11498 11509 11528 11547 11556
 11608 11624 11704 11778 11810 12020 12120 12162 12177 12196 12244 12298
 12349 12379 12403 12423 12452 12525 12536 12584 12672 12729 12768 12795
 12815 12852 12889 12890 12952 13009 13131 13219 13283 13333 13353 13357
 13368 13392 13434 13488 13525 13533 13610 13614 13666 13739 13741 13771
 13820 13846 13867 13880 13957 14178 14231 14244 14


### Evaluating Random Forest ###
Classification Report for Random Forest:
                             precision    recall  f1-score   support

       cancer type: Breast       0.40      0.25      0.31         8
          cancer type: CRC       0.33      0.62      0.43         8
          cancer type: GBM       0.25      0.25      0.25         8
           cancer type: HC       0.75      0.55      0.63        11
cancer type: Hepatobiliary       0.00      0.00      0.00         3
         cancer type: Lung       0.23      0.25      0.24        12
     cancer type: Pancreas       0.29      0.29      0.29         7

                  accuracy                           0.35        57
                 macro avg       0.32      0.32      0.31        57
              weighted avg       0.37      0.35      0.35        57

Confusion Matrix for Random Forest:
 [[2 2 1 0 0 0 3]
 [0 5 0 0 0 2 1]
 [0 2 2 1 0 2 1]
 [0 0 1 6 0 4 0]
 [0 0 1 1 0 1 0]
 [3 4 2 0 0 3 0]
 [0 2 1 0 1 1 2]]

### Evaluating 



Classification Report for Logistic Regression:
                             precision    recall  f1-score   support

       cancer type: Breast       0.33      0.38      0.35         8
          cancer type: CRC       0.42      0.62      0.50         8
          cancer type: GBM       0.62      0.62      0.62         8
           cancer type: HC       0.67      0.55      0.60        11
cancer type: Hepatobiliary       0.00      0.00      0.00         3
         cancer type: Lung       0.33      0.33      0.33        12
     cancer type: Pancreas       0.57      0.57      0.57         7

                  accuracy                           0.47        57
                 macro avg       0.42      0.44      0.43        57
              weighted avg       0.46      0.47      0.46        57

Confusion Matrix for Logistic Regression:
 [[3 2 0 0 0 0 3]
 [1 5 0 0 0 2 0]
 [1 0 5 0 0 2 0]
 [0 1 2 6 0 2 0]
 [0 0 0 2 0 1 0]
 [4 2 1 1 0 4 0]
 [0 2 0 0 0 1 4]]

### Evaluating SVM ###
Classification