# Workshop 4

Starter code for workshop 4. You should have seen most of it before, but make sure you understand what it is doing!

In [1]:
# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Label font sizes: 14 for axis labels, 12 for the x and y tick labels
for rc_group, label_size in (('axes', 14), ('xtick', 12), ('ytick', 12)):
    mpl.rc(rc_group, labelsize=label_size)

# To plot even prettier figures
import seaborn as sn

# General data handling (pure numerics are better in numpy)
import pandas as pd

In [2]:
# Load the breast cancer dataset that ships with scikit-learn

from sklearn.datasets import load_breast_cancer
# `data` is the loaded dataset object; later cells read its
# .data, .target, .feature_names and .target_names attributes
data = load_breast_cancer()

In [3]:
# Numerical data: feature matrix and target vector, pulled off the dataset object together
xarray, yarray = data.data, data.target

In [4]:
# The feature and label names also live on the dataset object;
# a single print call emits both lines, one per argument
print(
    f'Features names are: {data.feature_names}',
    f'Label names are: {data.target_names}',
    sep='\n',
)

Features names are: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
Label names are: ['malignant' 'benign']


In [5]:
# We recommend inverting the labels so that malignant (the worse disease) = 1 (i.e. positive)
# Derive the inverted labels from data.target rather than from yarray itself, so that
# re-running this cell cannot silently flip the labels back to the original encoding
yarray = 1 - data.target
# Switch the label names too, so that index 0 -> 'benign' and index 1 -> 'malignant'
# matches the inverted encoding; use label_names (not data.target_names) from here on
label_names = data.target_names[::-1]

In [6]:
# Collect features and target in one pandas dataframe (useful for some investigations):
# the target vector becomes a single extra column appended after the feature columns
target_column = yarray[:, np.newaxis]
fullarray = np.concatenate((xarray, target_column), axis=1)
df = pd.DataFrame(fullarray, columns=[*data.feature_names, 'target'])

# Splitting into separate datasets

In [7]:
from sklearn.model_selection import train_test_split


def _features_and_target(frame):
    """Return (every column except the last, the last column) of a dataframe."""
    return frame.iloc[:, :-1], frame.iloc[:, -1]


# Hold out 20% of the rows for testing, then 20% of what remains for validation
bigtrain_set, test_set = train_test_split(df, test_size=0.2, random_state=20)
train_set, val_set = train_test_split(bigtrain_set, test_size=0.2, random_state=20)

X_train, y_train = _features_and_target(train_set)
X_test, y_test = _features_and_target(test_set)
X_val, y_val = _features_and_target(val_set)

# Sanity check: shapes of every feature/target piece
print([part.shape for part in (X_train, y_train, X_test, y_test, X_val, y_val)])

[(364, 30), (364,), (114, 30), (114,), (91, 30), (91,)]


# Pipeline

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

# Preprocessing is two steps:
#   1. 'imputer' - fill any missing values with the column median
#   2. 'scaler'  - standardize the features, which sit on very different scales
#                  (compare mean radius, mean texture and mean area)
median_imputer = SimpleImputer(strategy='median')
standard_scaler = StandardScaler()
preprocessing_pipeline = Pipeline(steps=[
    ('imputer', median_imputer),
    ('scaler', standard_scaler),
])

# Fit the preprocessing on the training split only, then reuse that fitted pipeline
# to transform validation and test (so held-out data never influences the fit)
X_train_processed = preprocessing_pipeline.fit_transform(X_train)
X_val_processed, X_test_processed = (
    preprocessing_pipeline.transform(held_out) for held_out in (X_val, X_test)
)

# SGD Classifier

In [9]:
# Import necessary libraries
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_curve, auc, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# SGD classifier trained with log loss; the seed keeps the fit repeatable
sgd_classifier = SGDClassifier(loss='log_loss', random_state=42)

# Chain the preprocessing steps and the classifier into a single estimator
sgd_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessing_pipeline),
    ('classifier', sgd_classifier),
])

# Train on the training split
sgd_pipeline.fit(X_train, y_train)

# Hard class predictions (0 or 1) for the validation split
y_val_pred = sgd_pipeline.predict(X_val)

# Probability predictions: column index 1 holds the probability of the positive class (1)
validation_probabilities = sgd_pipeline.predict_proba(X_val)
y_val_prob = validation_probabilities[:, 1]

print("First 10 binary predictions:", y_val_pred[:10])
print("First 10 probability predictions:", y_val_prob[:10])

# Create a plot to visualize predictions vs actual values
plt.figure(figsize=(10, 6))

# Sort by actual values and then by predicted probabilities for better visualization
# (np.lexsort uses the LAST key, y_val, as the primary sort key)
sort_idx = np.lexsort((y_val_prob, y_val))
sorted_y_val = y_val.values[sort_idx]
sorted_y_val_pred = y_val_pred[sort_idx]
sorted_y_val_prob = y_val_prob[sort_idx]

# Plot actual values
plt.scatter(range(len(sorted_y_val)), sorted_y_val, c='blue', label='Actual', alpha=0.5, s=100)

# Plot predicted values
plt.scatter(range(len(sorted_y_val_pred)), sorted_y_val_pred, c='red', marker='x', label='Predicted', alpha=0.5, s=100)

# Highlight incorrect predictions with hollow green rings.
# The ring colour must be given through edgecolors: if `c` is passed together with
# facecolors='none', `c` takes precedence and the marker is drawn as a filled disc
# that hides the actual/predicted markers underneath it.
incorrect = sorted_y_val != sorted_y_val_pred
plt.scatter(np.where(incorrect)[0], sorted_y_val_pred[incorrect], facecolors='none', edgecolors='green', marker='o', s=200, label='Incorrect')

plt.yticks([0, 1], ['Benign', 'Malignant'])
plt.xlabel('Sample Index')
plt.ylabel('Class')
plt.title('Actual vs Predicted Classes')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Second figure: the sorted predicted probabilities against the 0.5 decision threshold
fig, ax = plt.subplots(figsize=(10, 6))
sample_positions = range(len(sorted_y_val_prob))
ax.plot(sample_positions, sorted_y_val_prob, 'o-', alpha=0.6)
ax.axhline(y=0.5, color='r', linestyle='--', label='Decision Threshold (0.5)')

# Mark the misclassified samples on top of the probability curve
misclassified_positions = np.where(incorrect)[0]
ax.scatter(misclassified_positions, sorted_y_val_prob[incorrect], c='green', s=100, label='Incorrect Predictions')

ax.set_xlabel('Sample Index')
ax.set_ylabel('Predicted Probability of Malignant')
ax.set_title('Predicted Probabilities (sorted)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

def _show_confusion_heatmap(matrix, number_format, heading):
    """Draw one annotated heatmap of a 2x2 confusion matrix with Benign/Malignant ticks."""
    class_ticks = [0.5, 1.5]
    class_names = ['Benign (0)', 'Malignant (1)']
    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt=number_format, cmap='Blues', cbar=False)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title(heading)
    plt.xticks(class_ticks, class_names)
    plt.yticks(class_ticks, class_names)
    plt.show()


# Raw counts for the validation predictions
cm = confusion_matrix(y_val, y_val_pred)
_show_confusion_heatmap(cm, 'd', 'Confusion Matrix')

# Row-normalized version: every entry is divided by the total of its row
cm_normalized = cm.astype('float') / cm.sum(axis=1, keepdims=True)
_show_confusion_heatmap(cm_normalized, '.2f', 'Normalized Confusion Matrix (rows sum to 1.0)')

# Calculate accuracy
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Accuracy: {accuracy:.4f}")

# Calculate precision and recall
precision = precision_score(y_val, y_val_pred)
recall = recall_score(y_val, y_val_pred)
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

# F1 is the harmonic mean of precision and recall. Report 0 when both are 0
# rather than dividing by zero, matching the guard the per-threshold F1 uses.
f1_score_value = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
print(f"F1 Score: {f1_score_value:.4f}")

# ROC curve built from the hard 0/1 predictions, plus its area under the curve;
# the prints below report how many points and which thresholds this produces
fpr, tpr, thresholds = roc_curve(y_val, y_val_pred)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Gray diagonal from (0, 0) to (1, 1) for reference
ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve - Binary Predictions')
ax.legend(loc="lower right")
print(f"Number of distinct points in ROC curve using binary predictions: {len(fpr)}")
print(f"Thresholds: {thresholds}")
plt.show()

# ROC curve built from the predicted probabilities, plus its area under the curve
fpr_prob, tpr_prob, thresholds_prob = roc_curve(y_val, y_val_prob)
roc_auc_prob = auc(fpr_prob, tpr_prob)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_prob, tpr_prob, color='blue', lw=2, label=f'ROC curve (area = {roc_auc_prob:.2f})')
# Gray diagonal from (0, 0) to (1, 1) for reference
ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve - Probability Predictions')
ax.legend(loc="lower right")
print(f"Number of distinct points in ROC curve using probability predictions: {len(fpr_prob)}")
plt.show()

# Positions (within the validation split) where the prediction disagrees with the label
is_wrong = np.asarray(y_val != y_val_pred)
wrong_idx = is_wrong.nonzero()[0]

# Probability, true label and predicted label for each of those positions
wrong_probs, wrong_true, wrong_pred = (
    values[wrong_idx] for values in (y_val_prob, y_val.values, y_val_pred)
)

print("\nWrong predictions:")
for i, idx in enumerate(wrong_idx):
    print(f"Sample {idx}: True={wrong_true[i]}, Pred={wrong_pred[i]}, Prob={wrong_probs[i]:.4f}")

# For every wrong prediction, report the ROC threshold nearest to its probability
for wrong_prob in wrong_probs:
    distance_to_thresholds = np.abs(thresholds_prob - wrong_prob)
    closest_idx = distance_to_thresholds.argmin()
    print(f"For wrong prediction with prob {wrong_prob:.4f}, closest threshold: {thresholds_prob[closest_idx]:.4f}")

# Precision and recall across the probability thresholds
precision_curve, recall_curve, thresholds_pr = precision_recall_curve(y_val, y_val_prob)

# Baseline level: the fraction of validation labels equal to 1
positive_fraction = sum(y_val)/len(y_val)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall_curve, precision_curve, color='blue', lw=2)
ax.axhline(y=positive_fraction, color='red', linestyle='--', label=f'Baseline (class distribution: {sum(y_val)/len(y_val):.2f})')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.legend(loc="lower left")
ax.grid(True, alpha=0.3)
plt.show()

# Compare accuracy, precision, recall and F1 at a handful of decision thresholds
threshold_examples = [0.3, 0.5, 0.7, 0.9]
metric_names = ('Threshold', 'Accuracy', 'Precision', 'Recall', 'F1 Score')
results = []

for threshold in threshold_examples:
    # A sample is predicted positive when its probability reaches the threshold
    y_pred_custom = (y_val_prob >= threshold).astype(int)

    acc = accuracy_score(y_val, y_pred_custom)
    prec = precision_score(y_val, y_pred_custom)
    rec = recall_score(y_val, y_pred_custom)

    # Harmonic mean of precision and recall, falling back to 0 unless their sum is positive
    if (prec + rec) > 0:
        f1 = 2 * (prec * rec) / (prec + rec)
    else:
        f1 = 0

    results.append(dict(zip(metric_names, (threshold, acc, prec, rec, f1))))

# One row per threshold, displayed as a dataframe
import pandas as pd
results_df = pd.DataFrame(results)
print(results_df)

InvalidParameterError: The 'loss' parameter of SGDClassifier must be a str among {'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive', 'log_loss', 'squared_error', 'squared_hinge', 'perceptron', 'modified_huber', 'hinge'}. Got 'log' instead.

# Decision Tree

In [None]:
# Your code here

# Model selection

In [None]:
# Your code here