# Import Libraries

In [None]:
#data handling
import pandas as pd
import numpy as np

#data visualization
import matplotlib.pyplot as plt
import seaborn as sns

#preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import MinMaxScaler

#feature selection
from sklearn.feature_selection import mutual_info_classif

In [None]:
#classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# performance metrics
from sklearn.metrics import balanced_accuracy_score,f1_score,precision_score, recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import roc_auc_score

In [None]:
# Load both expression tables straight from the CSV files hosted on GitHub.

# Source locations (one CSV per dataset).
all_url = 'https://media.githubusercontent.com/media/pritambera2000/BU_Bioinformatics/main/Data/Cancer/normalized_expr_ALL.csv'
cll_url = 'https://media.githubusercontent.com/media/pritambera2000/BU_Bioinformatics/main/Data/Cancer/normalized_expr_CLL.csv'

# Parse each CSV into its own DataFrame.
df_all = pd.read_csv(filepath_or_buffer=all_url)
df_cll = pd.read_csv(filepath_or_buffer=cll_url)

In [None]:
# Stack the ALL and CLL tables on top of each other (row-wise) and renumber
# the index from 0 so the combined frame has no duplicate row labels.
cancer_df = pd.concat(objs=[df_all, df_cll], axis=0, ignore_index=True)

In [None]:
# Preview the combined DataFrame (a bare expression at the end of a notebook
# cell is rendered as the cell output).
cancer_df

# Data Exploration & Cleaning

In [None]:
# Peek at the first three column labels of the combined table.
print(cancer_df.columns[:3])

In [None]:
# Print how many rows fall under each distinct value of the 'cancer_type' column.
print(cancer_df['cancer_type'].value_counts())

In [None]:
# Plot a bar chart of the class distribution (one bar per 'cancer_type' value).

cancer_df['cancer_type'].value_counts().plot(kind='bar')

# Data Processing

In [None]:
# Separate the feature values from the class column: scikit-learn estimators
# take the features and the class labels as two separate arguments.

X = cancer_df.iloc[:, :-1]  # every column except the last one
y = cancer_df.iloc[:, -1]   # the last column, used as the class

In [None]:
# Display the feature table (all columns of cancer_df except the last).
X

In [None]:
# Display the class column (last column of cancer_df).
y

# Encode Labels

In [None]:
# Encode the target labels (y) as integers between 0 and n_classes-1
# using scikit-learn's LabelEncoder.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Keep both views of the classes: the original names and their integer ids.
labels = label_encoder.classes_
classes = np.unique(y_encoded)

In [None]:
# Display the class names learned by the LabelEncoder (label_encoder.classes_).
labels


In [None]:
# Display the unique encoded class ids found in y_encoded.
classes

In [None]:
# Display the integer-encoded target vector.
y_encoded

In [None]:
# Tally how often each encoded label occurs in the full target vector.
unique_elements, counts = np.unique(y_encoded, return_counts=True)

# Pair every label id with its frequency for easier reading.
counts_dict = {
    label_id: frequency for label_id, frequency in zip(unique_elements, counts)
}

# Report one line per label.
for element, count in counts_dict.items():
    print(f"Element {element}: Count {count}")

# Data Splitting

In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Display the training portion of the feature table.
X_train

In [None]:
# Display the held-out (test) portion of the feature table.
X_test

In [None]:
# Display the encoded labels of the training samples.
y_train

In [None]:
# Tally how often each encoded label occurs in the training labels.
unique_elements, counts = np.unique(y_train, return_counts=True)

# Pair every label id with its frequency for easier reading.
counts_dict = {
    label_id: frequency for label_id, frequency in zip(unique_elements, counts)
}

# Report one line per label.
for element, count in counts_dict.items():
    print(f"Element {element}: Count {count}")

In [None]:
# Display the encoded labels of the test samples.
y_test

In [None]:
# Tally how often each encoded label occurs in the test labels.
unique_elements, counts = np.unique(y_test, return_counts=True)

# Pair every label id with its frequency for easier reading.
counts_dict = {
    label_id: frequency for label_id, frequency in zip(unique_elements, counts)
}

# Report one line per label.
for element, count in counts_dict.items():
    print(f"Element {element}: Count {count}")

# Data Normalization

In [None]:
# Scale every feature to the [0, 1] range using the TRAINING data's min/max.
#
# The scaler is fitted on the training set only and then re-used, unchanged,
# on the test set. Re-fitting on the test set would scale train and test with
# different parameters and leak test-set statistics into preprocessing.
# Consequently, test values may fall slightly outside [0, 1] when a test
# sample is below/above the range seen during training; that is expected.

min_max_scaler = MinMaxScaler()
X_train_norm = min_max_scaler.fit_transform(X_train)
X_test_norm = min_max_scaler.transform(X_test)

In [None]:
# Sanity check: scaling must not change the number of rows or columns.

# Training split, before and after scaling.
print("Shape of X_train:", np.shape(X_train))
print("Shape of X_train_norm:", np.shape(X_train_norm))

# Test split, before and after scaling.
print("Shape of X_test:", np.shape(X_test))
print("Shape of X_test_norm:", np.shape(X_test_norm))

In [None]:
# Gene names are the column labels of the original table, minus the trailing
# target-class column.
gene_names = cancer_df.columns[:-1]

# Re-attach those names to the scaled training matrix so that each column can
# be identified by gene.
X_train_norm_df = pd.DataFrame(data=X_train_norm, columns=gene_names)

# Show the labelled, scaled training data.
print(X_train_norm_df)

# Feature Selection
## Mutual Information Algorithm is used to compute the relevance of each feature.

In [None]:
# Estimate the mutual information between every feature and the class label,
# using the training split only (the test split must not influence feature
# selection).
# The estimator adds a small amount of random noise to continuous features,
# so a fixed seed is required for the scores -- and therefore the selected
# features -- to be reproducible; 42 matches the seed used for the split.
MI = mutual_info_classif(X_train, y_train, random_state=42)

In [None]:
# Number of top-scoring features to keep.
n_features = 50

# argsort orders ascending, so reverse it to rank features from the highest
# to the lowest MI score, then keep the first n_features positions.
selected_scores_indices = np.flip(np.argsort(MI))[:n_features]

In [None]:
# Display the column positions of the top n_features features, ordered from
# the highest to the lowest MI score.
selected_scores_indices

In [None]:
# Keep only the selected feature columns, applying the same column subset to
# the scaled training matrix and the scaled test matrix.
X_train_selected, X_test_selected = (
    scaled_split[:, selected_scores_indices]
    for scaled_split in (X_train_norm, X_test_norm)
)

In [None]:
# Build a labelled table of the selected genes for the training samples.

# Scaled expression values of the selected genes (columns picked by position).
selected_genes_expr_train = X_train_norm[:, selected_scores_indices]

# Matching gene names, looked up at the same positions in the original columns.
selected_genes_names_train = X_train.columns[selected_scores_indices]

# Combine values and names into one DataFrame.
selected_genes_df_train = pd.DataFrame(
    data=selected_genes_expr_train,
    columns=selected_genes_names_train,
)

# Show a header followed by the first few rows.
print("Selected Genes and Their Expression Values (Training):")
print(selected_genes_df_train.head())

In [None]:
# Select the column at zero-based index 29905 (i.e. the 29906th column) of the
# normalized training data.
# NOTE(review): 29905 is a hard-coded position; presumably it is one of the
# indices reported in selected_scores_indices -- confirm, and prefer deriving
# it from that array so it stays valid if the selection changes.
column_29905 = X_train_norm[:, 29905]

# Print the scaled values of that column for every training sample.
print(column_29905)

# Model Training

# 1. Random Forest

In [None]:
# Train a one-vs-rest Random Forest on the selected features.
# - max_features=0.2: each split considers 20% of the (selected) features.
# - random_state=42: the forest is built with random bootstrap samples and
#   random feature subsets, so without a seed the predictions and metrics
#   change on every run; 42 matches the seed used for the split and the SVC.
RF = OneVsRestClassifier(RandomForestClassifier(max_features=0.2, random_state=42))
RF.fit(X_train_selected, y_train)

# Hard class predictions and per-class probability estimates for the test set.
y_pred = RF.predict(X_test_selected)
pred_prob = RF.predict_proba(X_test_selected)

In [None]:
# Score the current predictions (y_pred) against the true test labels.

# accuracy
accuracy = accuracy_score(y_test, y_pred)
print("accuracy:", accuracy)

# balanced accuracy
balanced_accuracy = np.round(balanced_accuracy_score(y_test, y_pred), 4)
print('balanced accuracy:%0.4f' % balanced_accuracy)

# precision (per-class scores averaged, weighted by class support)
precision = np.round(precision_score(y_test, y_pred, average='weighted'), 4)
print('precision:%0.4f' % precision)

# recall (weighted average)
recall = np.round(recall_score(y_test, y_pred, average='weighted'), 4)
print('recall:%0.4f' % recall)

# f1 score (weighted average)
f1score = np.round(f1_score(y_test, y_pred, average='weighted'), 4)
print('f1score:%0.4f' % f1score)


# Per-class report. The encoded class ids are passed explicitly so that the
# rows always line up with the names in `labels`; without `labels=classes`
# the call raises when a class is missing from the test split, because the
# number of observed classes no longer matches len(target_names).
report = classification_report(y_test, y_pred, labels=classes, target_names=labels)
print('\n')
print('classification report\n\n')
print(report)

In [None]:
# Generate the confusion matrix for the current predictions.
# `labels=classes` fixes the matrix to one row/column per known class, in
# encoded order, so its shape always matches `labels` below -- even when a
# class happens to be absent from both y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=classes)

# Label both axes with the original class names for readability.
cm_df = pd.DataFrame(cm, index=labels, columns=labels)

In [None]:
# Display the confusion matrix, labelled with the class names on both axes.
cm_df

# 2. Support Vector Machine

In [None]:
# Linear-kernel support vector classifier with regularization strength C=1.0
# and a fixed seed, trained on the selected training features.
svm = SVC(C=1.0, kernel='linear', random_state=42)
svm.fit(X=X_train_selected, y=y_train)

In [None]:
# Predict the class of every test sample with the trained SVM.
# Note: this rebinds y_pred, replacing any earlier model's predictions.
y_pred = svm.predict(X_test_selected)

# Put true and predicted labels side by side for a row-by-row comparison.
results_df = pd.DataFrame(data={'Real': y_test, 'Predicted': y_pred})
print(results_df)