In [1]:
# Importing necessary libraries
import pandas as pd  # For data manipulation and analysis
from sklearn.datasets import load_breast_cancer  # To load the built-in Breast Cancer dataset
from sklearn.feature_selection import SelectKBest, f_classif  # For feature selection
from sklearn.model_selection import GridSearchCV, train_test_split  # For model training and hyperparameter tuning
from sklearn.neural_network import MLPClassifier  # Multi-Layer Perceptron Classifier for ANN
from sklearn.metrics import classification_report  # To evaluate the model
import pickle  # For saving and loading intermediate data and models

# Dataset Preparation

In [2]:
# Load the breast cancer dataset from Scikit-learn
data = load_breast_cancer()
# Convert the dataset into a Pandas DataFrame for easy manipulation
df = pd.DataFrame(data.data, columns=data.feature_names)
# Add the target column (cancer type: malignant/benign) to the DataFrame
df['target'] = data.target

# Handle missing values, if any, by removing rows with null values
df = df.dropna()

# Save the prepared dataset as a CSV file for inspection and reuse
df.to_csv('breast_cancer_data.csv', index=False)
# Save the dataset as a Pickle file to ensure fast reloading in future steps
with open('breast_cancer_data.pkl', 'wb') as f:
    pickle.dump(df, f)

print("Dataset saved as 'breast_cancer_data.csv' and 'breast_cancer_data.pkl'.")

Dataset saved as 'breast_cancer_data.csv' and 'breast_cancer_data.pkl'.


# Step 2: Feature Selection

In [3]:
# Load the dataset from the Pickle file
with open('breast_cancer_data.pkl', 'rb') as f:
    df = pickle.load(f)

# Separate the features (X) and the target variable (y)
X = df.drop(columns=['target'])
y = df['target']

# Use SelectKBest to select the top 10 features based on ANOVA F-statistic
selector = SelectKBest(score_func=f_classif, k=10)
X_new = selector.fit_transform(X, y)

# Save the selected feature names and the selector object
selected_features = X.columns[selector.get_support()]
with open('selected_features.pkl', 'wb') as f:
    pickle.dump(selected_features, f)
with open('feature_selector.pkl', 'wb') as f:
    pickle.dump(selector, f)

print('Selected Features:', selected_features)

Selected Features: Index(['mean radius', 'mean perimeter', 'mean area', 'mean concavity',
       'mean concave points', 'worst radius', 'worst perimeter', 'worst area',
       'worst concavity', 'worst concave points'],
      dtype='object')


# Step 3: Model Training with Grid Search

In [4]:
# Split the dataset into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the ANN model (MLPClassifier) with a maximum of 1000 iterations
model = MLPClassifier(max_iter=1000)

# Define a parameter grid for hyperparameter tuning
param_grid = {
    'hidden_layer_sizes': [(10,), (50,), (100,)],  # Number of neurons in the hidden layer
    'activation': ['relu', 'tanh'],  # Activation functions to test
    'solver': ['adam', 'sgd'],  # Optimization solvers to test
    'alpha': [0.0001, 0.001, 0.01]  # Regularization strengths
}

# Use GridSearchCV to find the best combination of hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Save the Grid Search object for reuse
with open('grid_search.pkl', 'wb') as f:
    pickle.dump(grid_search, f)

# Print the best parameters found by Grid Search
print('Best Parameters:', grid_search.best_params_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters: {'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (50,), 'solver': 'adam'}


# Step 4: Model Evaluation

In [5]:
# Load the best model from the Grid Search results
y_pred = grid_search.best_estimator_.predict(X_test)

# Evaluate the model using a classification report
print("Classification Report (Grid Search Best Model):")
print(classification_report(y_test, y_pred))

Classification Report (Grid Search Best Model):
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



# Step 5: Train and Save Standalone ANN Model

In [6]:
# Train a standalone ANN model with optimal parameters
standalone_model = MLPClassifier(hidden_layer_sizes=(50,), activation='relu', solver='adam', max_iter=1000)
standalone_model.fit(X_train, y_train)

# Save the standalone model
with open('standalone_ann_model.pkl', 'wb') as f:
    pickle.dump(standalone_model, f)

# Evaluate the standalone model
y_pred_standalone = standalone_model.predict(X_test)
print("Classification Report (Standalone ANN Model):")
print(classification_report(y_test, y_pred_standalone))

Classification Report (Standalone ANN Model):
              precision    recall  f1-score   support

           0       0.91      0.95      0.93        43
           1       0.97      0.94      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.95      0.94       114
weighted avg       0.95      0.95      0.95       114

