In [58]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st
import pickle
from sklearn.datasets import load_breast_cancer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score


Step 2: Dataset Acquisition and Preparation

In [59]:
# Load the dataset
data = load_breast_cancer()
print(data)  # Displays the dataset description and content


print(data.data)          # The feature data
print(data.target)        # The labels (target values)
print(data.DESCR)         # The full description of the dataset

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]]), 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
 

Step 3: Feature Selection

In [60]:
# Access the feature data
features = data.data

# Display the shape of the data (number of samples, number of features)
print("Shape of feature data:", features.shape)

# Display the first few rows of the data to check the contents
print("First 5 rows of feature data:\n", features[:5])


Shape of feature data: (569, 30)
First 5 rows of feature data:
 [[1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
  1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
  6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
  1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
  4.601e-01 1.189e-01]
 [2.057e+01 1.777e+01 1.329e+02 1.326e+03 8.474e-02 7.864e-02 8.690e-02
  7.017e-02 1.812e-01 5.667e-02 5.435e-01 7.339e-01 3.398e+00 7.408e+01
  5.225e-03 1.308e-02 1.860e-02 1.340e-02 1.389e-02 3.532e-03 2.499e+01
  2.341e+01 1.588e+02 1.956e+03 1.238e-01 1.866e-01 2.416e-01 1.860e-01
  2.750e-01 8.902e-02]
 [1.969e+01 2.125e+01 1.300e+02 1.203e+03 1.096e-01 1.599e-01 1.974e-01
  1.279e-01 2.069e-01 5.999e-02 7.456e-01 7.869e-01 4.585e+00 9.403e+01
  6.150e-03 4.006e-02 3.832e-02 2.058e-02 2.250e-02 4.571e-03 2.357e+01
  2.553e+01 1.525e+02 1.709e+03 1.444e-01 4.245e-01 4.504e-01 2.430e-01
  3.613e-01 8.758e-02]
 [1

In [61]:
# Access the feature names
feature_names = data.feature_names

# Display the feature names
print("Feature names:\n", feature_names)


Feature names:
 ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [62]:
# Convert the feature data into a Pandas DataFrame and set column names
data_df = pd.DataFrame(data.data, columns=data.feature_names)

# Display the first few rows of the DataFrame
data_df.head(10)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
5,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
6,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,0.05742,...,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
7,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,...,17.06,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
8,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,...,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072
9,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243,...,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075


In [63]:
# Add the target column to the DataFrame
data_df["target"] = data.target

# Display the first few rows of the updated DataFrame
data_df.head(10)


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0
5,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,...,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244,0
6,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,0.05742,...,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368,0
7,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,...,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151,0
8,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,...,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072,0
9,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243,...,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075,0


In [64]:
# Check the distribution of target values
# 1 - Malignant
# 0 - Benign
data_df["target"].value_counts()

target
1    357
0    212
Name: count, dtype: int64

In [65]:
import plotly.graph_objects as go

In [68]:
# Compute the percentage distribution of target values
target_perc = data_df["target"].value_counts(normalize=True) * 100# Calculate the percentage of benign and malignant cases# Create a doughnut chart using Plotly
fig = go.Figure(data=[go.Pie(
    labels=target_perc.index,  # Set the labels (e.g., 0 for benign, 1 for malignant)
    values=target_perc,  # The percentages for each label
    hole=0.3,  # Creates a doughnut-shaped pie chart by adding a hole in the center
    textinfo='percent+label',  # Display both the percentage and the label on each slice
    marker=dict(colors=['#66b3ff', '#ff9999'])  # Custom colors for the slices
)])

# Update the layout to add a title and adjust the size
fig.update_layout(
    title_text='Distribution of Target Values',  # Title of the chart
    width=600,  # Width of the chart
    height=600# Height of the chart
)

# Show the plot
fig.show()  # Display the doughnut chart

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
import plotly.graph_objects as go  # Import Plotly Graph Objects for more control over the plot

# Separate the data based on the target variable
benign_data = data_df[data_df['target'] == 0]  # Data for benign cases
malignant_data = data_df[data_df['target'] == 1]  # Data for malignant cases

# Choose a feature to plot, e.g., 'mean radius'
feature = 'mean radius'

# Create a stacked histogram
fig = go.Figure()

# Add histogram for benign cases
fig.add_trace(go.Histogram(
    x=benign_data[feature],
    name='Benign',
    opacity=0.75,  # Adjust the opacity for better visibility
    marker_color='blue'
))

# Add histogram for malignant cases
fig.add_trace(go.Histogram(
    x=malignant_data[feature],
    name='Malignant',
    opacity=0.75,  # Adjust the opacity for better visibility
    marker_color='red'
))

# Update layout to stack the histograms
fig.update_layout(
    title_text=f'Stacked Histogram of {feature}',
    barmode='overlay',  # Overlay the histograms on top of each other
    xaxis_title_text=feature,  # X-axis label
    yaxis_title_text='Count',  # Y-axis label
    bargap=0.2,  # Gap between bars
    legend=dict(x=0.9, y=0.9)  # Position the legend
)

# Display the plot
fig.show()

In [None]:
# Check for missing values
print(data_df.isnull().sum())

# Normalize the features using MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data_df[data_df.columns[:-1]] = scaler.fit_transform(data_df[data_df.columns[:-1]])

# Display the first few rows to verify
print(data_df.head())


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif

In [None]:
# Prepare features and target
X = data_df.drop("target", axis=1)
y = data_df["target"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Select the top 10 features using ANOVA F-test (f_classif)
skb = SelectKBest(score_func=f_classif, k=10)
X_train_skb = skb.fit_transform(X_train_scaled, y_train)
X_test_skb = skb.transform(X_test_scaled)

# Train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train_skb, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_skb)

# Evaluate model accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


In [None]:

# Retrieve and print selected features
selected_features = X.columns[skb.get_support()]
print("Selected features:", selected_features)


In [None]:
X_train_skb

In [None]:
X_train_scaled

Step 4: Grid Search CV for Model Tuning

In [None]:
# Define the parameter grid for MLPClassifier
param_grid = {
    "hidden_layer_sizes": [(10,), (50,), (100,)],  # Number of neurons in each hidden layer. Single hidden layer with 10, 50, or 100 neurons.
    "activation": ["tanh", "relu"],  # Activation functions to use in the hidden layers. Options: 'tanh' or 'relu'.
    "solver": ["sgd", "adam"],  # Optimization algorithm. 'sgd' is Stochastic Gradient Descent, 'adam' is an advanced optimizer.
    "alpha": [0.0001, 0.001, 0.01, 0.05],  # L2 penalty (regularization term) to prevent overfitting. Lower values mean less regularization.
    "learning_rate": ["constant", "adaptive"]  # How the learning rate adapts during training. 'constant' or 'adaptive'.
}


Step 5: Implementing an Artificial Neural Network (ANN) Model

In [None]:
# Initialize the MLPClassifier with a maximum of 1000 iterations
mlp = MLPClassifier(max_iter=1000)

# Configure GridSearchCV with the MLPClassifier
grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid, n_jobs=-1, cv=5, verbose=2)

# Fit the GridSearchCV to the training data
grid_search.fit(X_train_scaled, y_train)

# Output the best parameters
print("Best parameters: ", grid_search.best_params_)

# Evaluate the best model on the test data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)


In [None]:

print("Accuracy: {:.2f}%".format(accuracy * 100))

# Extract and display the parameters and metrics from the best model
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"Best parameters: {best_params}")
print(f"Best cross-validation score: {best_score}")


In [None]:

# Print the classification report
print(classification_report(y_test, y_pred))


In [None]:

# Display the training and test accuracy of the best model
train_accuracy = best_model.score(X_train_scaled, y_train)
test_accuracy = best_model.score(X_test_scaled, y_test)
print(f"Training accuracy: {train_accuracy:.2f}")
print(f"Testing accuracy: {test_accuracy:.2f}")

In [None]:
X_test_scaled

In [None]:
# Save the best model and the scaler to files
with open("ann_best_model.pkl", 'wb') as model_file:
    pickle.dump(best_model, model_file)

with open("ann.pkl", 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

print("pickle files created")
