# 🧬 Ovarian Cancer Classification - Upgraded ML Pipeline

This notebook classifies ovarian cancer samples from real gene expression data with three ML models (Random Forest, Logistic Regression, and a linear SVM) and evaluates them with accuracy, classification reports, a confusion matrix, and ROC-AUC.

In [None]:
# 📦 Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

%matplotlib inline


In [None]:
# 📂 Upload Dataset
# Colab-only cell: files.upload() opens a browser file picker and returns a
# dict mapping each uploaded file name to its raw bytes.
import io

from google.colab import files

uploaded = files.upload()

# Prefer the expected file name, but do not hard-fail on a different key:
# Colab may store a re-uploaded file under a renamed key such as
# "name (1).csv", which would make a fixed-key lookup raise KeyError.
expected_name = 'realistic_ovarian_gene_expression.csv'
if expected_name in uploaded:
    dataset_name = expected_name
elif len(uploaded) == 1:
    # Exactly one file was uploaded, so it is unambiguous which one to read.
    dataset_name = next(iter(uploaded))
else:
    raise KeyError(
        f"Expected an upload named {expected_name!r}, got: {sorted(uploaded)}"
    )

df = pd.read_csv(io.BytesIO(uploaded[dataset_name]))
df.head()


In [None]:
# 📊 Class Distribution
# Bar chart counting the samples for each value of the 'Cancer' label column.
fig, ax = plt.subplots()
sns.countplot(x='Cancer', data=df, ax=ax)
ax.set_title('Class Distribution (0=Normal, 1=Cancer)')
plt.show()


In [None]:
# 🧪 Train/Test Split
# Features are every column except the 'Cancer' label; the label is the target.
X = df.drop(columns='Cancer')
y = df['Cancer']

# Hold out 20% of the samples for testing. stratify=y keeps the class ratio
# the same in the train and test partitions, so a skewed label distribution
# cannot leave the test set short of one class; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# 🌲 Random Forest
# Train a seeded 100-tree forest on the training split (fit() returns the
# fitted estimator, so construction and training are chained).
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Hard class labels for the accuracy/report below, plus the second
# predict_proba column as a continuous score.
rf_pred = rf_model.predict(X_test)
rf_proba = rf_model.predict_proba(X_test)[:, 1]

rf_accuracy = accuracy_score(y_test, rf_pred)
print("Random Forest Accuracy:", rf_accuracy)
print(classification_report(y_test, rf_pred))


In [None]:
# 🔁 Logistic Regression
# Fit on the training split; max_iter is raised to 1000 to give the solver
# more iterations to converge.
logreg_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Class labels for the report and the second predict_proba column as a score.
logreg_pred = logreg_model.predict(X_test)
logreg_proba = logreg_model.predict_proba(X_test)[:, 1]

logreg_accuracy = accuracy_score(y_test, logreg_pred)
print("Logistic Regression Accuracy:", logreg_accuracy)
print(classification_report(y_test, logreg_pred))


In [None]:
# 💠 Support Vector Machine
# Linear-kernel SVM. probability=True enables predict_proba, which scikit-learn
# implements with an internal cross-validated calibration that shuffles the
# data; random_state pins that shuffle so svm_proba (and the ROC/AUC derived
# from it) is reproducible, matching the seeded split and Random Forest.
svm_model = SVC(kernel='linear', probability=True, random_state=42)
svm_model.fit(X_train, y_train)

# Class labels for the report and the second predict_proba column as a score.
svm_pred = svm_model.predict(X_test)
svm_proba = svm_model.predict_proba(X_test)[:, 1]

print("SVM Accuracy:", accuracy_score(y_test, svm_pred))
print(classification_report(y_test, svm_pred))


In [None]:
# 📉 Confusion Matrix for Random Forest
# Count matrix of true labels versus Random Forest predictions on the test split.
cm = confusion_matrix(y_test, rf_pred)

# Draw the integer counts as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots()
sns.heatmap(cm, cmap='Blues', annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Random Forest - Confusion Matrix')
plt.show()


In [None]:
# 📈 ROC Curves
# False/true positive rates for each model's scores (thresholds are discarded).
fpr_rf, tpr_rf, _ = roc_curve(y_test, rf_proba)
fpr_lr, tpr_lr, _ = roc_curve(y_test, logreg_proba)
fpr_svm, tpr_svm, _ = roc_curve(y_test, svm_proba)

# One (display name, fpr, tpr) entry per model, in legend order.
roc_series = [
    ('Random Forest', fpr_rf, tpr_rf),
    ('Logistic Regression', fpr_lr, tpr_lr),
    ('SVM', fpr_svm, tpr_svm),
]

fig, ax = plt.subplots(figsize=(8, 6))
for model_name, fpr, tpr in roc_series:
    # The area under each curve is reported in the legend entry.
    ax.plot(fpr, tpr, label=f'{model_name} (AUC = {auc(fpr, tpr):.2f})')

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves')
ax.legend()
plt.show()


In [None]:
# 🌟 Feature Importance (Random Forest)
# Per-feature importances from the fitted forest, aligned with X's columns.
importances = rf_model.feature_importances_
features = X.columns

# Top 10 genes, most important first: argsort is ascending, so the last 10
# positions hold the largest importances; reversing them puts the largest
# first so the bars are listed in descending order.
indices = np.argsort(importances)[-10:][::-1]
plt.figure(figsize=(8, 6))
sns.barplot(x=importances[indices], y=features[indices])
plt.title('Top 10 Important Genes (Random Forest)')
plt.xlabel('Importance')
plt.ylabel('Gene')
plt.show()
