In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file found under /kaggle/input, then print one per line.
input_file_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/breast-cancer-dataset/breast-cancer.csv


In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pickle
import os
import warnings


# Suppress Python warnings, and set TensorFlow's logging / oneDNN switches in the
# environment here, ahead of the `import tensorflow` that follows
# (presumably because TensorFlow reads them at import time — confirm).
warnings.filterwarnings('ignore')
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '3',
    'TF_ENABLE_ONEDNN_OPTS': '0',
})


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import SGD


# Seed NumPy and TensorFlow with the same fixed value.
SEED = 42
for seed_fn in (np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

# Make no GPU visible to TensorFlow, so the run stays on the CPU.
tf.config.set_visible_devices([], device_type='GPU')

print(f"TensorFlow Version: {tf.__version__}")
print(f"Keras Version: {keras.__version__}")
print("Using CPU for training", end="\n\n")


# Load the breast-cancer CSV from the Kaggle input directory.
print("Loading dataset...")
df = pd.read_csv('/kaggle/input/breast-cancer-dataset/breast-cancer.csv')


# Quick look at the raw table: size, first rows, column dtypes and null counts.
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
print("\nMissing values:")
print(df.isnull().sum())


print("\n" + "="*50)
print("PREPROCESSING")
print("="*50)


# Drop the identifier column and the 'Unnamed: 32' column when they are present.
for column in ('id', 'Unnamed: 32'):
    if column in df.columns:
        df = df.drop(column, axis=1)
        print(f"Dropped '{column}' column")

# Encode the diagnosis column (M = 1, B = 0)
label_codes = {'M': 1, 'B': 0}
encoded_diagnosis = df['diagnosis'].map(label_codes)
# Series.map() turns every value that is missing from the mapping into NaN.
# Fail loudly here rather than let NaN labels flow into the split and training
# steps (this also catches re-running the encoding on already-encoded data).
unexpected_labels = df.loc[encoded_diagnosis.isna(), 'diagnosis'].unique().tolist()
if unexpected_labels:
    raise ValueError(
        f"Unexpected diagnosis labels (expected 'M' or 'B'): {unexpected_labels}"
    )
df['diagnosis'] = encoded_diagnosis.astype(int)
print("Encoded diagnosis column (M=1, B=0)")

# Separate features and target
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']

print("\nFeatures shape:", X.shape)
print("Target distribution:")
# Count once and reuse; .get() with a default keeps the summary line working
# even if one of the two classes is absent from the data.
class_counts = y.value_counts()
print(class_counts)
print(f"Class balance: {class_counts.get(0, 0)} Benign, {class_counts.get(1, 0)} Malignant")

# Hold out 20% of the rows as a test set, stratified on the label.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# Standardize the features: the scaler is fitted on the training rows only and
# that same fit is then applied to both the training and the test rows.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

print("Features standardized")

# Persist the fitted scaler for later use
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(sc, scaler_file)
print("Scaler saved as 'scaler.pkl'")

# Build the ANN Model
print("\n" + "="*50)
print("BUILDING MODEL")
print("="*50)

# Fully connected network: two ReLU hidden layers (16 and 8 units) feeding a
# single sigmoid output unit. The input width follows the scaled feature matrix.
classifier = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(units=16, activation='relu', name='hidden_layer_1'),
    Dense(units=8, activation='relu', name='hidden_layer_2'),
    Dense(units=1, activation='sigmoid', name='output_layer')
], name='Breast_Cancer_ANN')

# Compile the model with specified hyperparameters
# NOTE(review): binary cross-entropy is the conventional loss for a sigmoid
# output; MSE is left as-is because the hyperparameters appear to be
# prescribed — confirm before changing.
sgd_optimizer = SGD(learning_rate=0.01, momentum=0.0, nesterov=False)

classifier.compile(
    optimizer=sgd_optimizer,
    loss='mean_squared_error',
    metrics=['accuracy']
)

# Display model architecture
print("\nModel Architecture:")
# summary() prints the table itself and returns None; wrapping it in print()
# emitted a stray "None" line after the table.
classifier.summary()

# Keep these in sync with the SGD/compile arguments above and with the
# batch size and epoch count passed to fit().
print("\nHyperparameters:")
print("- Learning Rate: 0.01")
print("- Batch Size: 16")
print("- Epochs: 50")
print("- Loss Function: Mean Squared Error")
print("- Optimizer: Stochastic Gradient Descent")

# Train the model
training_rule = "=" * 50
print(f"\n{training_rule}")
print("TRAINING MODEL")
print(training_rule)

# validation_split=0.2 asks Keras to hold back a fifth of the training rows
# for the per-epoch validation metrics.
fit_options = dict(
    batch_size=16,
    epochs=50,
    validation_split=0.2,
    verbose=1,
    shuffle=True,
)
history = classifier.fit(X_train, y_train, **fit_options)

# Save the trained model
model_path = 'breast_cancer_ann_model.h5'
classifier.save(model_path)
print(f"\n✓ Model saved as '{model_path}'")

# Make predictions
print("\n" + "=" * 50, "MAKING PREDICTIONS", "=" * 50, sep="\n")

# Model outputs strictly above 0.5 become class 1, everything else class 0;
# the result is a flat integer array with one entry per test row.
y_pred_prob = classifier.predict(X_test, verbose=0)
y_pred = (y_pred_prob.reshape(-1) > 0.5).astype(int)

print(f"Predictions completed for {y_pred.shape[0]} samples")

# Evaluate the model
evaluation_rule = "=" * 50
print("\n" + evaluation_rule)
print("MODEL EVALUATION")
print(evaluation_rule)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)
print("\nFormat: [[TN, FP],")
print("         [FN, TP]]")

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("\n" + evaluation_rule)
print(f"Overall Accuracy: {accuracy * 100:.2f}%")
print(evaluation_rule)

# Classification Report
class_names = ['Benign', 'Malignant']
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Flatten the matrix in the order printed above: TN, FP, then FN, TP.
tn, fp, fn, tp = cm.ravel()

# Each rate falls back to 0 when its denominator is not positive.
sensitivity = _safe_ratio(tp, tp + fn)
specificity = _safe_ratio(tn, tn + fp)
precision = _safe_ratio(tp, tp + fp)
f1_score = _safe_ratio(2 * (precision * sensitivity), precision + sensitivity)

print("\nDetailed Metrics:")
count_rows = (
    ("├─ True Negatives (TN):  ", tn),
    ("├─ False Positives (FP): ", fp),
    ("├─ False Negatives (FN): ", fn),
    ("└─ True Positives (TP):  ", tp),
)
for prefix, count in count_rows:
    print(f"{prefix}{count}")

rate_rows = (
    ("\n├─ Sensitivity (Recall/TPR): ", sensitivity),
    ("├─ Specificity (TNR):        ", specificity),
    ("├─ Precision (PPV):          ", precision),
    ("└─ F1-Score:                 ", f1_score),
)
for prefix, rate in rate_rows:
    print(f"{prefix}{rate * 100:.2f}%")

# Plot training history
print("\n" + "="*50)
print("GENERATING VISUALIZATIONS")
print("="*50)

plt.style.use('default')
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# One panel per metric: (axes, history key, series name, title, y-label, legend position).
panels = (
    (ax1, 'accuracy', 'Accuracy', 'Model Accuracy Over Epochs', 'Accuracy', 'lower right'),
    (ax2, 'loss', 'Loss', 'Model Loss Over Epochs', 'Loss (MSE)', 'upper right'),
)
for ax, metric, series_name, title, y_label, legend_loc in panels:
    # Training curve first, then the validation curve stored under the "val_" key.
    ax.plot(history.history[metric], label=f'Training {series_name}',
            linewidth=2, marker='o', markersize=4)
    ax.plot(history.history[f'val_{metric}'], label=f'Validation {series_name}',
            linewidth=2, marker='s', markersize=4)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Epoch', fontsize=12)
    ax.set_ylabel(y_label, fontsize=12)
    ax.legend(loc=legend_loc, fontsize=10)
    ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('training_history.png', dpi=150, bbox_inches='tight')
print("✓ Training history plot saved as 'training_history.png'")
# The figure is only written to disk; close it without displaying it.
plt.close(fig)

# Plot confusion matrix
fig, cm_ax = plt.subplots(figsize=(10, 8))
heatmap = cm_ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
cm_ax.set_title('Confusion Matrix\nBreast Cancer Classification',
                fontsize=16, fontweight='bold', pad=20)
fig.colorbar(heatmap, ax=cm_ax)

classes = ['Benign', 'Malignant']
tick_marks = np.arange(2)
cm_ax.set_xticks(tick_marks)
cm_ax.set_xticklabels(classes, fontsize=12)
cm_ax.set_yticks(tick_marks)
cm_ax.set_yticklabels(classes, fontsize=12)

# Add text annotations: counts above half of the largest cell are drawn in
# white, the rest in black.
thresh = cm.max() / 2
for row_idx, row_counts in enumerate(cm):
    for col_idx, count in enumerate(row_counts):
        text_color = "white" if count > thresh else "black"
        cm_ax.text(col_idx, row_idx, f"{count:d}",
                   ha="center", va="center",
                   color=text_color,
                   fontsize=20, fontweight='bold')

cm_ax.set_ylabel('True Label', fontsize=14, fontweight='bold')
cm_ax.set_xlabel('Predicted Label', fontsize=14, fontweight='bold')
fig.tight_layout()
fig.savefig('confusion_matrix.png', dpi=150, bbox_inches='tight')
print("✓ Confusion matrix plot saved as 'confusion_matrix.png'")
plt.close(fig)

# Additional visualization: Prediction distribution
fig = plt.figure(figsize=(10, 6))
# Reindex the counts onto both class labels so that a class the model never
# predicted still gets an explicit zero. Without this, value_counts() returns a
# single entry in that case and it no longer lines up with the two fixed bar
# labels below.
prediction_counts = (
    pd.Series(y_pred)
    .value_counts()
    .reindex([0, 1], fill_value=0)
)
colors = ['#4CAF50', '#F44336']
bars = plt.bar(['Benign (0)', 'Malignant (1)'], prediction_counts.values, color=colors, alpha=0.7, edgecolor='black')
plt.title('Distribution of Predictions on Test Set', fontsize=14, fontweight='bold')
plt.ylabel('Number of Samples', fontsize=12)
plt.xlabel('Prediction Class', fontsize=12)

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    # Centre the count horizontally on the bar and sit it just above the top edge.
    plt.text(bar.get_x() + bar.get_width()/2., height,
            f'{int(height)}',
            ha='center', va='bottom', fontsize=12, fontweight='bold')

plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.savefig('prediction_distribution.png', dpi=150, bbox_inches='tight')
print("✓ Prediction distribution plot saved as 'prediction_distribution.png'")
plt.close()

# Summary Report
# Every report line is collected first and written with a single print();
# joining on "\n" yields the same stdout as one print() call per line.
summary_rule = "=" * 50
n_features = X_train.shape[1]

summary_lines = [
    "\n" + summary_rule,
    "TRAINING SUMMARY",
    summary_rule,
    # Dataset sizes
    "\n✓ Dataset: Breast Cancer Wisconsin",
    f"✓ Total Samples: {len(df)}",
    f"✓ Training Samples: {len(X_train)}",
    f"✓ Test Samples: {len(X_test)}",
    f"✓ Features: {n_features}",
    # Network layout
    "\n✓ Model Architecture: 3 Layers",
    f"  ├─ Input → Hidden 1: {n_features} → 16 (ReLU)",
    "  ├─ Hidden 1 → Hidden 2: 16 → 8 (ReLU)",
    "  └─ Hidden 2 → Output: 8 → 1 (Sigmoid)",
    # Training configuration
    "\n✓ Hyperparameters:",
    "  ├─ Learning Rate: 0.01",
    "  ├─ Batch Size: 16",
    "  ├─ Epochs: 50",
    "  ├─ Loss Function: MSE",
    "  └─ Optimizer: SGD",
    # Headline test-set results
    f"\n✓ Final Test Accuracy: {accuracy * 100:.2f}%",
    f"✓ F1-Score: {f1_score * 100:.2f}%",
    "\n" + summary_rule,
    "✓ TRAINING COMPLETED SUCCESSFULLY!",
    summary_rule,
    # Artifacts written by the earlier steps
    "\nGenerated Files:",
    "├─ breast_cancer_ann_model.h5",
    "├─ scaler.pkl",
    "├─ training_history.png",
    "├─ confusion_matrix.png",
    "└─ prediction_distribution.png",
    "\nNext Step: Run 'streamlit run app.py' to launch the web interface!",
    summary_rule,
]
print("\n".join(summary_lines))

TensorFlow Version: 2.19.0
Keras Version: 3.10.0
Using CPU for training

Loading dataset...
Dataset shape: (569, 32)

First few rows:
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4     

None

Hyperparameters:
- Learning Rate: 0.01
- Batch Size: 16
- Epochs: 50
- Loss Function: Mean Squared Error
- Optimizer: Stochastic Gradient Descent

TRAINING MODEL
Epoch 1/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.5240 - loss: 0.2408 - val_accuracy: 0.7033 - val_loss: 0.1950
Epoch 2/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6039 - loss: 0.2185 - val_accuracy: 0.7692 - val_loss: 0.1746
Epoch 3/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6687 - loss: 0.1994 - val_accuracy: 0.8132 - val_loss: 0.1571
Epoch 4/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7028 - loss: 0.1818 - val_accuracy: 0.8462 - val_loss: 0.1422
Epoch 5/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7523 - loss: 0.1654 - val_accuracy: 0.8571 - val_loss: 0.1297
Epoch 6/50
[1m23/23[0m [32m━




✓ Model saved as 'breast_cancer_ann_model.h5'

MAKING PREDICTIONS
Predictions completed for 114 samples

MODEL EVALUATION

Confusion Matrix:
[[70  2]
 [ 4 38]]

Format: [[TN, FP],
         [FN, TP]]

Overall Accuracy: 94.74%

Classification Report:
              precision    recall  f1-score   support

      Benign       0.95      0.97      0.96        72
   Malignant       0.95      0.90      0.93        42

    accuracy                           0.95       114
   macro avg       0.95      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114


Detailed Metrics:
├─ True Negatives (TN):  70
├─ False Positives (FP): 2
├─ False Negatives (FN): 4
└─ True Positives (TP):  38

├─ Sensitivity (Recall/TPR): 90.48%
├─ Specificity (TNR):        97.22%
├─ Precision (PPV):          95.00%
└─ F1-Score:                 92.68%

GENERATING VISUALIZATIONS
✓ Training history plot saved as 'training_history.png'
✓ Confusion matrix plot saved as 'confusion_matrix.png'
✓ Predicti