# Cancer Diagnosis using Machine Learning (scikit-learn)

In [None]:
# Imports
import pandas as pd
import numpy as np

### Getting the data ready

Import the cancer data.

In [None]:
# Load the raw dataset from the CSV file into a DataFrame
cancer_data = pd.read_csv(filepath_or_buffer="../data/cancer_data.csv")

# Preview the first five rows
cancer_data.head(n=5)

Check if there are any missing values.

In [None]:
# Count the missing (NaN) entries in each column
cancer_data.isnull().sum(axis=0)

Create the feature columns and the target column.

In [None]:
# y: the "diagnosis" column, used as the target
y = cancer_data["diagnosis"]

# X: every remaining column, used as the features
X = cancer_data.drop(columns=["diagnosis"])

In [None]:
# Preview the first five rows of the features DataFrame
X.head(n=5)

In [None]:
# Show the first five labels together with how often each label value occurs
(y.head(n=5), y.value_counts())

Split the data into training and test sets.

In [None]:
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG; kept for any later code that draws from it
np.random.seed(42)

# Hold out 25% of the rows for testing. Passing random_state explicitly makes the
# split reproducible on its own, without relying on NumPy's global RNG state.
# NOTE(review): consider stratify=y if the diagnosis classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Sanity-check the shapes of the four resulting pieces
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## Algorithm #1: Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create model instance.
# random_state is pinned so the fitted forest is reproducible. Without it the
# estimator draws from NumPy's global RNG, so the trained model (and every metric
# below) changes whenever the fit cell is re-run or cells run in a different order.
model = RandomForestClassifier(random_state=42)

### Fitting the model to the data and using it to make a prediction

In [None]:
# Train the classifier on the training split
model.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the test rows; reused later for the classification report
y_preds = model.predict(X_test)

### Evaluating the model

Evaluate the model on the training set.

In [None]:
# Score the model on the same data it was fitted on
train_acc = model.score(X_train, y_train)
# Two decimal places, matching the test-set printout, instead of the raw float repr
print(f"The model's accuracy on the training dataset is: {train_acc*100:.2f}%")

Evaluate the model on the test set.

In [None]:
# Score the model on the held-out test split
test_acc = model.score(X=X_test, y=y_test)
print(f"The model's accuracy on the testing dataset is: {test_acc*100:.2f}%")

Create a classification report.

In [None]:
from sklearn.metrics import classification_report

# Print the classification report for the test-set predictions
print(classification_report(y_true=y_test, y_pred=y_preds))

Plot the Receiver Operating Characteristic (ROC) curve and report the area under it (AUC).

In [None]:
from sklearn.metrics import RocCurveDisplay
# Build the ROC curve display from the fitted model and the test split
roc_curve_display = RocCurveDisplay.from_estimator(model, X_test, y_test)