<a href="https://colab.research.google.com/github/sj-minRva/Cancer-Classification/blob/main/BRCA_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, cohen_kappa_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import VarianceThreshold
from sklearn.utils import shuffle

In [9]:
df = pd.read_csv("BRCA_gene_expression.csv", index_col="Unnamed: 0")
print(df.columns)

Index(['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'FIRRM', 'FGR', 'CFH', 'FUCA2',
       'GCLC', 'NFYA',
       ...
       'C8orf44-SGK3', 'SNORA74C-2', 'ELOA3BP', 'NPBWR1', 'ELOA3DP', 'LNCDAT',
       'LOC124902537', 'RNF228', 'PANO1', 'classes'],
      dtype='object', length=31575)


In [5]:
le = LabelEncoder()
df["classes"] = le.fit_transform(df["classes"])

In [6]:
X = df.drop(columns=["classes"])
y = df["classes"]

In [10]:
# Check the counts of each class in y
class_counts = y.value_counts()
print("Class counts before dropping:")
print(class_counts)

# Find the class with only one member
single_sample_class = class_counts[class_counts == 1].index[0]

# Remove the row corresponding to the single sample class
df_filtered = df[df['classes'] != single_sample_class]

# Separate features and target again from the filtered data
X = df_filtered.drop(columns=["classes"])
y = df_filtered["classes"]

print("\nClass counts after dropping:")
print(y.value_counts())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

Class counts before dropping:
classes
0    338
1     30
2      1
Name: count, dtype: int64

Class counts after dropping:
classes
Primary Tumor          1111
Solid Tissue Normal     113
Name: count, dtype: int64


In [11]:
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (856, 31574)
Testing set shape: (368, 31574)


In [12]:
df = df.iloc[:,1:]

df.dropna()


Unnamed: 0,TNMD,DPM1,SCYL3,FIRRM,FGR,CFH,FUCA2,GCLC,NFYA,STPG1,...,C8orf44-SGK3,SNORA74C-2,ELOA3BP,NPBWR1,ELOA3DP,LNCDAT,LOC124902537,RNF228,PANO1,classes
TCGA-A2-A25D-01A-12R-A16F-07,3.321928,10.802516,10.412570,8.761551,9.859535,11.256209,12.131857,11.154185,11.294621,9.082149,...,0.0,1.0,0.0,0.000000,0.0,0.000000,0.0,3.459432,4.459432,Primary Tumor
TCGA-BH-A201-01A-11R-A14M-07,4.954196,11.309476,10.865733,9.799282,9.475733,12.087794,11.087463,11.272047,11.890644,9.355351,...,0.0,1.0,0.0,1.584963,0.0,0.000000,0.0,3.169925,4.857981,Primary Tumor
TCGA-AC-A23C-01A-12R-A169-07,6.832890,12.538189,11.606868,10.055282,8.848623,12.170551,12.151017,11.145932,12.099677,10.318543,...,0.0,0.0,0.0,1.000000,0.0,0.000000,0.0,7.066089,5.000000,Primary Tumor
TCGA-AR-A5QP-01A-11R-A28M-07,5.672425,11.074141,10.405141,8.891784,7.894818,11.253257,11.149112,11.365229,10.903129,8.842350,...,1.0,0.0,0.0,1.584963,0.0,0.000000,0.0,2.584963,5.000000,Primary Tumor
TCGA-C8-A12P-01A-11R-A115-07,0.000000,11.626622,10.463524,9.317413,8.873444,12.127027,11.338179,10.645658,10.631177,8.980140,...,0.0,0.0,0.0,3.807355,0.0,3.169925,0.0,1.000000,3.807355,Primary Tumor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-LL-A5YP-01A-21R-A28M-07,3.000000,11.302068,9.493855,9.306062,8.942515,11.781770,10.662668,9.592457,11.096715,9.292322,...,0.0,0.0,0.0,1.000000,0.0,1.000000,0.0,5.643856,3.807355,Primary Tumor
TCGA-AO-A03L-01A-41R-A056-07,2.000000,11.658211,10.784635,9.813781,8.344296,11.393927,10.882643,10.271463,11.979782,8.622052,...,0.0,0.0,0.0,1.000000,0.0,0.000000,0.0,0.000000,5.781360,Primary Tumor
TCGA-BH-A42T-01A-11R-A24H-07,5.357552,11.174926,10.303781,9.821774,9.541097,10.075479,11.700873,10.409391,11.465056,9.002815,...,1.0,0.0,0.0,3.584963,0.0,0.000000,0.0,0.000000,4.754888,Primary Tumor
TCGA-A2-A04W-01A-31R-A115-07,1.000000,10.903129,9.405141,8.189825,7.813781,10.041659,11.118941,9.459432,10.312883,9.385862,...,0.0,0.0,0.0,0.000000,0.0,2.584963,0.0,3.321928,5.129283,Primary Tumor


In [13]:
df.head()

Unnamed: 0,TNMD,DPM1,SCYL3,FIRRM,FGR,CFH,FUCA2,GCLC,NFYA,STPG1,...,C8orf44-SGK3,SNORA74C-2,ELOA3BP,NPBWR1,ELOA3DP,LNCDAT,LOC124902537,RNF228,PANO1,classes
TCGA-A2-A25D-01A-12R-A16F-07,3.321928,10.802516,10.41257,8.761551,9.859535,11.256209,12.131857,11.154185,11.294621,9.082149,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.459432,4.459432,Primary Tumor
TCGA-BH-A201-01A-11R-A14M-07,4.954196,11.309476,10.865733,9.799282,9.475733,12.087794,11.087463,11.272047,11.890644,9.355351,...,0.0,1.0,0.0,1.584963,0.0,0.0,0.0,3.169925,4.857981,Primary Tumor
TCGA-AC-A23C-01A-12R-A169-07,6.83289,12.538189,11.606868,10.055282,8.848623,12.170551,12.151017,11.145932,12.099677,10.318543,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,7.066089,5.0,Primary Tumor
TCGA-AR-A5QP-01A-11R-A28M-07,5.672425,11.074141,10.405141,8.891784,7.894818,11.253257,11.149112,11.365229,10.903129,8.84235,...,1.0,0.0,0.0,1.584963,0.0,0.0,0.0,2.584963,5.0,Primary Tumor
TCGA-C8-A12P-01A-11R-A115-07,0.0,11.626622,10.463524,9.317413,8.873444,12.127027,11.338179,10.645658,10.631177,8.98014,...,0.0,0.0,0.0,3.807355,0.0,3.169925,0.0,1.0,3.807355,Primary Tumor


In [16]:
# Encode the target variable y_train and y_test
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train_encoded)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [20]:
feature_importances = model.feature_importances_
feature_names = X.columns

In [22]:
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})

In [24]:
top_genes_df = importance_df.sort_values(by='Importance', ascending=False).head(1000)
top_genes_df.to_csv('top_1000_genes.csv', index=False)
