<a href="https://colab.research.google.com/github/slibolt/ADS500B/blob/main/ADS502_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Import and Setup

In [None]:
#library imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw dataset and preview its first rows.
# NOTE(review): an earlier comment described this as a raw-GitHub URL, but the
# path below is a relative local file name - confirm where the CSV should come from.
data_path = "breast_cancer.csv"
df_original = pd.read_csv(data_path)
df_original.head()


In [None]:
# Column subset. Per the study: "best predictive accuracy obtained using one
# separating plane in the 3-D space of Worst Area, Worst Smoothness and Mean Texture."
# Two further columns (symmetry_se, fractal_dimension_mean) are kept alongside
# those three and the diagnosis label.
selected_columns = [
    'diagnosis',
    'area_worst',
    'smoothness_worst',
    'texture_mean',
    'symmetry_se',
    'fractal_dimension_mean',
]
df_trim = df_original[selected_columns]
df_trim.head()


In [None]:
# One indicator column per diagnosis label
df_dummy = pd.get_dummies(df_trim['diagnosis'])

# Concatenate the indicators in front of the trimmed columns, then drop the
# original text column and the redundant 'B' indicator so that only 'M'
# remains as the binary target, stored as integers
df = (
    pd.concat((df_dummy, df_trim), axis=1)
    .drop(columns=['diagnosis', 'B'])
    .astype({'M': int})
)

df.head()

In [None]:
# Show the dtype of the encoded target column M
df['M'].dtype

# **Basic Data Information**

In [None]:
# Shape of the working frame as (rows, columns)
df.shape

In [None]:
# Are there duplicates? Counts rows that repeat an earlier row in every column
df.duplicated().sum()

In [None]:
# Bar chart of how many rows fall in each target class (M = 1 marks rows whose diagnosis was 'M')
sns.countplot(x='M', data=df)

In [None]:
# Share of each target class, expressed as a percentage of all rows
class_share = df['M'].value_counts().div(len(df)).mul(100)
print("percentage of each class", class_share)

# **Data Quality Report**

## **Continuous Features**

In [None]:
# Identify the numeric feature columns.
# Selecting on the generic 'number' kind instead of the exact 'float64'/'int64'
# names keeps the integer target M in the list even on platforms where
# astype(int) produces a 32-bit integer column; otherwise the data quality
# report below would get a feature list shorter than its statistics.
conf = df.select_dtypes(include='number').columns.tolist()
conf

In [None]:
# Summary statistics for the numeric columns of df
df.describe()

In [None]:
# Data quality report for the features listed in `conf`.
# Every statistic is computed on df[conf] so each row of the report is
# guaranteed to line up with its feature name. Computing them on the whole
# frame only works while every column of df happens to be in `conf`, and
# raises a length-mismatch error otherwise.
conf_values = df[conf]
data_quality_conf = pd.DataFrame({
    'Feature': conf,
    'Count': conf_values.count().values,
    'Missing Values': conf_values.isnull().sum().values,
    'Cardinality': conf_values.nunique().values,
    'Min': conf_values.min().values,
    '1st Quartile': conf_values.quantile(0.25).values,
    'Mean': conf_values.mean().values,
    'Median': conf_values.median().values,
    '3rd Quartile': conf_values.quantile(0.75).values,
    'Max': conf_values.max().values,
    'Standard Deviation': conf_values.std().values,
})
data_quality_conf

# **Univariate Analysis**

In [None]:
# One histogram with a KDE overlay per numeric column
plt.style.use('ggplot')
for column in conf:
    hist_fig = plt.figure(figsize=(20, 4))
    # Left cell of a 1x2 grid; the right cell is left empty
    hist_ax = hist_fig.add_subplot(1, 2, 1)
    sns.histplot(df[column], kde=True, ax=hist_ax)
    hist_ax.set_title(f'Distribution of {column}')
    plt.show()

In [None]:
# One boxplot per continuous feature; the binary target M is skipped
plt.style.use('ggplot')
for column in conf:
    if column == 'M':
        continue
    box_fig = plt.figure(figsize=(20, 4))
    # Left cell of a 1x2 grid; the right cell is left empty
    box_ax = box_fig.add_subplot(1, 2, 1)
    sns.boxplot(x=df[column], ax=box_ax)
    box_ax.set_title(f'Boxplot of {column}')
    plt.show()

# **Multivariate Analysis**

In [None]:
# Pairwise correlations between all columns of df, target M included
# (DataFrame.corr with its default method)
corr_matrix = df.corr()
corr_matrix

In [None]:
# Annotated heatmap of the correlation matrix
plt.figure(figsize=(16, 12))
heatmap_options = dict(
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    annot_kws={"size": 8},
)
heatmap = sns.heatmap(corr_matrix, **heatmap_options)

# Slant the x labels and keep the y labels horizontal so both stay readable
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)

# Render the figure
plt.show()

In [None]:
# Pairwise plots of the columns of df, colored by the target class M
sns.pairplot(df, hue ="M", height=3)

# **Feature Scaling**


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Model inputs: the three predictor columns (X) and the binary target (y)
X = df[['area_worst', 'smoothness_worst', 'texture_mean']]
y = df['M']

# Standardize the predictors.
# NOTE(review): the scaler is fit on every row before the cross-validation
# split, so each fold's test rows contribute to the scaling statistics -
# confirm whether per-fold scaling is wanted.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# **Stratified K-Fold Partitioning**

In [None]:
# @title
# Initialize StratifiedKFold with 10 folds
# shuffle=True together with a fixed random_state gives shuffled folds that
# are the same on every run; this splitter is shared by all cells below
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# get list of partitions
def get_partitions(X, y, cv=None):
    """Return the train/test partitions of (X, y) for every cross-validation fold.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix to split. It is converted with ``np.asarray`` so that
        NumPy arrays and DataFrames are both indexed by row position.
    y : pandas.Series
        Target values; rows are selected by position with ``.iloc``.
    cv : object with a ``split(X, y)`` method, optional
        Splitter yielding ``(train_index, test_index)`` pairs. Defaults to the
        notebook-level ``skf``.

    Returns
    -------
    list of tuple
        One ``(X_train, X_test, y_train, y_test)`` tuple per fold, in the
        order the splitter yields them.
    """
    if cv is None:
        cv = skf

    # Slice the X that was passed in. The function used to index the global
    # X_scaled here, silently ignoring its own argument.
    features = np.asarray(X)

    partitions = []
    # Performing stratified k-fold cross-validation
    for train_index, test_index in cv.split(features, y):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        partitions.append((X_train, X_test, y_train, y_test))
    return partitions

## **Visualizations**

In [None]:
# Define color maps for visualization
cmap_cv = plt.get_cmap('coolwarm')
cmap_data = plt.get_cmap('tab10')

# Define visualization function for cross-validation indices
def plot_cv_indices(cv, X, y, ax, n_splits, lw=10):
    """Draw the train/test membership of every sample for each CV fold.

    One horizontal row is drawn per fold (test samples coded 1, training
    samples coded 0, colored with ``cmap_cv``), followed by one extra row
    showing each sample's class (colored with ``cmap_data``).

    Parameters
    ----------
    cv : splitter exposing ``split(X=..., y=...)``
    X : sized collection of samples; only ``len(X)`` is used here besides
        being passed to ``cv.split``
    y : class label per sample, used to color the class row
    ax : matplotlib Axes to draw on
    n_splits : int
        Number of fold rows; used for the tick labels, the y-limits and the
        position of the class row.
    lw : line width of the markers

    Returns
    -------
    The Axes that was passed in.
    """
    n_samples = len(X)
    sample_positions = range(n_samples)

    # Create a plot row for the indices of each cross-validation split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y)):
        indices = np.full(n_samples, np.nan)
        indices[tt] = 1  # Testing set
        indices[tr] = 0  # Training set
        ax.scatter(sample_positions, [ii + 0.5] * n_samples, c=indices, marker="_", lw=lw, cmap=cmap_cv, vmin=-0.2, vmax=1.2)

    # Class row directly below the last fold row. It is positioned from
    # n_splits (matching the tick labels) instead of the loop variable left
    # over from the loop above, which is undefined when cv yields no splits.
    ax.scatter(sample_positions, [n_splits + 0.5] * n_samples, c=y, marker="_", lw=lw, cmap=cmap_data)
    yticklabels = list(range(n_splits)) + ["class"]
    ax.set(yticks=np.arange(n_splits + 1) + 0.5, yticklabels=yticklabels, xlabel="Sample index", ylabel="CV iteration", ylim=[n_splits + 1.2, -0.2], xlim=[0, n_samples])
    ax.set_title("Cross-Validation Splits", fontsize=15)
    return ax

# Creating a plot; the row count is read from the splitter so the two stay in sync
fig, ax = plt.subplots(figsize=(12, 6))
plot_cv_indices(skf, X_scaled, y, ax, n_splits=skf.get_n_splits())
plt.show()

# Plotting fold distribution
def plot_fold_distribution(cv, X, y, ax):
    """Bar chart of how many class-0 and class-1 samples land in each test fold.

    Parameters
    ----------
    cv : splitter exposing ``split(X, y)``
    X : feature matrix handed to ``cv.split``
    y : pandas.Series of 0/1 labels, indexed by position with ``.iloc``
    ax : matplotlib Axes to draw on
    """
    class_0 = []
    class_1 = []
    # One pass over the folds. The splitter used to be run three times, once
    # of which only produced a list whose length was read.
    for _, tt in cv.split(X, y):
        fold_labels = y.iloc[tt]
        class_0.append(np.sum(fold_labels == 0))
        class_1.append(np.sum(fold_labels == 1))

    df_fold = pd.DataFrame({'Fold': list(range(len(class_1))), 'Class 0': class_0, 'Class 1': class_1})
    df_fold.set_index('Fold').plot(kind='bar', ax=ax)
    ax.set_xlabel('Fold')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of Classes Across Folds')

fig, ax = plt.subplots(figsize=(10, 6))
plot_fold_distribution(skf, X_scaled, y, ax)
plt.show()

# Plotting class distribution heatmap
def plot_class_distribution_heatmap(cv, X, y, ax):
    """Heatmap of per-fold test-set class counts (rows are folds, columns are classes).

    cv is a splitter exposing ``split(X, y)``, y is a pandas Series of
    non-negative integer labels indexed by position, and ax is the
    matplotlib Axes to draw on.
    """
    # bincount with minlength=2 yields at least [count of 0, count of 1] per fold
    per_fold_counts = [
        np.bincount(y.iloc[test_idx], minlength=2)
        for _, test_idx in cv.split(X, y)
    ]

    df_class_dist = pd.DataFrame(per_fold_counts, columns=['Class 0', 'Class 1'])
    sns.heatmap(df_class_dist, annot=True, cmap='Blues', fmt='d', ax=ax)
    ax.set(xlabel='Class', ylabel='Fold', title='Class Distribution Across Folds')

fig, ax = plt.subplots(figsize=(10, 6))
plot_class_distribution_heatmap(skf, X_scaled, y, ax)
plt.show()


# Models

In [None]:
# Set up the model evaluation table: one row per evaluation measure and one
# column per model, filled with 0 placeholders until each model section
# writes its real scores
evaluation_measures = ['Accuracy', 'Error Rate', 'Recall', 'Precision', 'F1 Score', 'ROC AUC']
compared_models = ['Logistic Regression (Baseline)', 'Neural Network', 'Random Forest', 'Naive Bayes']

model_evaluation_table = {'Evaluation Measure': evaluation_measures}
for model_name in compared_models:
    model_evaluation_table[model_name] = [0] * 6

model_evaluation_df = pd.DataFrame(model_evaluation_table)
model_evaluation_df

In [None]:
# format metric
def format_metric(metric):
    """Return `metric` as a string in fixed-point notation with four decimal places."""
    return format(metric, '.4f')


## Baseline Model (Logistic Regression)

In [None]:
# Empty table with the per-fold metric columns for the logistic regression run.
# Placeholder only: metrics_logreg is reassigned from the collected fold
# results at the end of the cross-validation cell that follows.
metrics_logreg = pd.DataFrame(columns =['Fold','Accuracy', 'Error Rate', 'Recall', 'Precision', 'F1 Score', 'ROC AUC'])

In [None]:
# Containers for the per-fold results: one dict per fold for the summary
# table, plus a flat list for each individual score
metrics_list = []
accuracy_scores, precisions, recalls, f1_scores, roc_aucs = [], [], [], [], []

# Folds are numbered starting at 1
fold_counter = 1

for X_train, X_test, y_train, y_test in get_partitions(X_scaled, y):
    # Fit a fresh logistic regression on this fold's training split
    lg_model = LogisticRegression(max_iter=10000)
    lg_model.fit(X_train, y_train)

    # Class predictions and the predicted probability of class 1 on the test split
    y_pred = lg_model.predict(X_test)
    y_prob = lg_model.predict_proba(X_test)[:, 1]

    # Score this fold
    accuracy = accuracy_score(y_test, y_pred)
    error_rate = 1 - accuracy
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob)

    # Keep the individual scores
    accuracy_scores.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    roc_aucs.append(roc_auc)

    # Row for the per-fold table
    fold_row = {
        'Fold': fold_counter,
        'Accuracy': accuracy,
        'Error Rate': error_rate,
        'Recall': recall,
        'Precision': precision,
        'F1 Score': f1,
        'ROC AUC': roc_auc,
    }
    metrics_list.append(fold_row)

    # Move on to the next fold number
    fold_counter += 1

metrics_logreg = pd.DataFrame(metrics_list)
metrics_logreg


In [None]:
# Average each metric over the folds; the Fold index column is left out
mean_metrics_logreg = metrics_logreg.drop(columns='Fold').mean()
mean_metrics_logreg

In [None]:
# Write the fold-averaged baseline scores into the comparison table, in the
# same order as its 'Evaluation Measure' rows
metric_names = ['Accuracy', 'Error Rate', 'Recall', 'Precision', 'F1 Score', 'ROC AUC']
model_evaluation_df['Logistic Regression (Baseline)'] = [
    format_metric(mean_metrics_logreg[metric_name]) for metric_name in metric_names
]
model_evaluation_df

## Neural Network

In [None]:
# Empty table with the per-fold metric columns for the neural network run.
# Placeholder only: metrics_nn is reassigned from the collected fold results
# after the cross-validation loop further down.
metrics_nn = pd.DataFrame(columns =['Fold','Accuracy', 'Error Rate', 'Recall', 'Precision', 'F1 Score', 'ROC AUC'])

In [None]:
from sklearn.neural_network import MLPClassifier

# Fresh containers for the neural-network run: per-fold rows plus one list per score
metrics_list_nn = []
accuracy_scores, precisions, recalls, f1_scores, roc_aucs = ([] for _ in range(5))

# Folds are numbered starting at 1
fold_counter = 1

In [None]:
# Cross-validate the neural network: fit and score one MLP per fold
for part in get_partitions(X_scaled, y):
    # Unpack partition into constituent variables
    (X_train, X_test, y_train, y_test) = part

    # Small MLP with two hidden layers (5 and 2 units); fixed random_state for reproducibility
    clf = MLPClassifier(solver='lbfgs', max_iter=800, alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=62)

    # train the model
    clf.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = clf.predict(X_test)

    # Calculate metrics for this fold
    accuracy = accuracy_score(y_test, y_pred)
    # error_rate must be recomputed for every fold. It was missing here, so
    # each row recorded the value left over from the last logistic regression
    # fold (or raised NameError on a fresh kernel if that cell had not run).
    error_rate = 1 - accuracy
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    y_prob = clf.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_prob)

    # Append metrics to lists
    accuracy_scores.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    roc_aucs.append(roc_auc)

    # Collect metrics for this fold
    metrics_list_nn.append({
        'Fold': fold_counter,
        'Accuracy': accuracy,
        'Error Rate': error_rate,
        'Recall': recall,
        'Precision': precision,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    })

    # Increment fold counter
    fold_counter += 1

metrics_nn = pd.DataFrame(metrics_list_nn)
metrics_nn

In [None]:
# Average each metric over the folds, leaving out the Fold index column.
# The column mask is built from metrics_nn itself; it used to be taken from
# metrics_logreg, which only worked because both tables share a column layout.
mean_metrics_nn = metrics_nn.loc[:, metrics_nn.columns != 'Fold'].mean()

In [None]:
# Write the fold-averaged neural network scores into the comparison table, in
# the same order as its 'Evaluation Measure' rows
metric_names = ['Accuracy', 'Error Rate', 'Recall', 'Precision', 'F1 Score', 'ROC AUC']
model_evaluation_df['Neural Network'] = [
    format_metric(mean_metrics_nn[metric_name]) for metric_name in metric_names
]
model_evaluation_df

## **Random Forest**

In [None]:
# Empty table with the per-fold metric columns for the random forest run.
# Placeholder only: metrics_rf is reassigned from the collected fold results
# after the cross-validation loop further down.
metrics_rf = pd.DataFrame(columns =['Fold','Accuracy', 'Error Rate', 'Recall', 'Precision', 'F1 Score', 'ROC AUC'])

In [None]:
#import libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Fresh containers for the random-forest run: per-fold rows plus one list per score
metrics_list_rf = []
accuracy_scores, precisions, recalls, f1_scores, roc_aucs = ([] for _ in range(5))

# Folds are numbered starting at 1
fold_counter = 1



In [None]:
# Cross-validate the random forest: fit and score one forest per fold
for X_train, X_test, y_train, y_test in get_partitions(X_scaled, y):
    # 100 trees, fixed random_state for reproducibility
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    # Class predictions and the predicted probability of class 1 on the test split
    y_pred = rf_model.predict(X_test)
    y_prob = rf_model.predict_proba(X_test)[:, 1]

    # Score this fold
    accuracy = accuracy_score(y_test, y_pred)
    error_rate = 1 - accuracy
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob)

    # Keep the individual scores
    accuracy_scores.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    roc_aucs.append(roc_auc)

    # Row for the per-fold table
    fold_row = {
        'Fold': fold_counter,
        'Accuracy': accuracy,
        'Error Rate': error_rate,
        'Recall': recall,
        'Precision': precision,
        'F1 Score': f1,
        'ROC AUC': roc_auc,
    }
    metrics_list_rf.append(fold_row)

    # Move on to the next fold number
    fold_counter += 1

metrics_rf = pd.DataFrame(metrics_list_rf)
metrics_rf


In [None]:
# Average each metric across folds. The Fold index column is excluded so it
# is not averaged as if it were a metric, matching how the logistic
# regression means are computed.
mean_metrics_rf = metrics_rf.loc[:, metrics_rf.columns != 'Fold'].mean()

In [None]:
# Write the fold-averaged random forest scores into the comparison table, in
# the same order as its 'Evaluation Measure' rows
metric_names = ['Accuracy', 'Error Rate', 'Recall', 'Precision', 'F1 Score', 'ROC AUC']
model_evaluation_df['Random Forest'] = [
    format_metric(mean_metrics_rf[metric_name]) for metric_name in metric_names
]
model_evaluation_df

## **Naive Bayes**

In [None]:
# Empty table with the per-fold metric columns for the naive Bayes run.
# Placeholder only: metrics_nb is reassigned from the collected fold results
# after the cross-validation loop further down.
metrics_nb = pd.DataFrame(columns =['Fold','Accuracy', 'Error Rate', 'Recall', 'Precision', 'F1 Score', 'ROC AUC'])

In [None]:
# Fresh containers for the naive-Bayes run: per-fold rows plus one list per score
metrics_list_nb = []
accuracy_scores, precisions, recalls, f1_scores, roc_aucs = ([] for _ in range(5))

# Folds are numbered starting at 1
fold_counter = 1

In [None]:
# Cross-validate Gaussian naive Bayes: fit and score one model per fold
for X_train, X_test, y_train, y_test in get_partitions(X_scaled, y):
    # Fit a fresh model on this fold's training split
    nb_model = GaussianNB()
    nb_model.fit(X_train, y_train)

    # Class predictions and the predicted probability of class 1 on the test split
    y_pred = nb_model.predict(X_test)
    y_prob = nb_model.predict_proba(X_test)[:, 1]

    # Score this fold; zero_division=0 makes the score 0 when its denominator is empty
    accuracy = accuracy_score(y_test, y_pred)
    error_rate = 1 - accuracy
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_prob)

    # Keep the individual scores
    accuracy_scores.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    roc_aucs.append(roc_auc)

    # Row for the per-fold table
    fold_row = {
        'Fold': fold_counter,
        'Accuracy': accuracy,
        'Error Rate': error_rate,
        'Recall': recall,
        'Precision': precision,
        'F1 Score': f1,
        'ROC AUC': roc_auc,
    }
    metrics_list_nb.append(fold_row)

    # Move on to the next fold number
    fold_counter += 1

metrics_nb = pd.DataFrame(metrics_list_nb)
metrics_nb

In [None]:
# Average each metric across folds. The Fold index column is excluded so it
# is not averaged as if it were a metric, matching how the logistic
# regression means are computed.
mean_metrics_nb = metrics_nb.loc[:, metrics_nb.columns != 'Fold'].mean()

In [None]:
# Write the fold-averaged naive Bayes scores into the comparison table, in
# the same order as its 'Evaluation Measure' rows
metric_names = ['Accuracy', 'Error Rate', 'Recall', 'Precision', 'F1 Score', 'ROC AUC']
model_evaluation_df['Naive Bayes'] = [
    format_metric(mean_metrics_nb[metric_name]) for metric_name in metric_names
]
model_evaluation_df

# Model Comparison

In [None]:
# Plotting the mean cross-validated ROC curve for all models.
# Each fold's ROC curve is interpolated onto a common false-positive-rate
# grid so the curves can be averaged point by point.
mean_fpr = np.linspace(0, 1, 100)
plt.figure(figsize=(10, 6))

for model, name in zip([lg_model, clf, rf_model, nb_model], ['Logistic Regression','Neural Network', 'Random Forest', 'Naive Bayes']):
    mean_tpr = np.zeros_like(mean_fpr)
    for train_index, test_index in skf.split(X_scaled, y):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Refits the estimator objects left over from the sections above, so
        # after this cell each one holds the fit from the last fold here
        model.fit(X_train, y_train)
        y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else model.predict(X_test)
        fpr, tpr, _ = roc_curve(y_test, y_prob)

        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        # roc_curve usually returns several points at fpr == 0, so the
        # interpolated value there is not 0; pin it so every fold's curve
        # starts at the origin
        interp_tpr[0] = 0.0
        mean_tpr += interp_tpr

    mean_tpr /= skf.get_n_splits()
    # Pin the end of the averaged curve at (1, 1)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, lw=2, label=f'{name} (area = {mean_auc:.2f})')

# Diagonal reference line
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()