In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from warnings import filterwarnings
filterwarnings("ignore")

In [2]:
df = pd.read_csv("datasets/breast_cancer_dataset.csv")
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [3]:
## Independent and dependent features
X = df.drop(labels=['target'],axis=1)
y = df[['target']]

In [4]:
y

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0
...,...
564,0
565,0
566,0
567,0


In [5]:
# Define which columns should be should be scaled
# Here all are numerical columns 
numerical_cols = X.select_dtypes(exclude='object').columns
categorical_cols = X.select_dtypes(include='object').columns

In [6]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [7]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('scaler',StandardScaler())
    ]

)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])


In [8]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25)

In [9]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [10]:
X_train.head()

Unnamed: 0,num_pipeline__mean radius,num_pipeline__mean texture,num_pipeline__mean perimeter,num_pipeline__mean area,num_pipeline__mean smoothness,num_pipeline__mean compactness,num_pipeline__mean concavity,num_pipeline__mean concave points,num_pipeline__mean symmetry,num_pipeline__mean fractal dimension,...,num_pipeline__worst radius,num_pipeline__worst texture,num_pipeline__worst perimeter,num_pipeline__worst area,num_pipeline__worst smoothness,num_pipeline__worst compactness,num_pipeline__worst concavity,num_pipeline__worst concave points,num_pipeline__worst symmetry,num_pipeline__worst fractal dimension
0,0.25933,2.439012,0.172382,0.156133,-0.909507,-1.090974,-0.506455,-0.541377,0.128087,-1.37035,...,0.180986,1.837433,0.061595,0.065562,-0.743905,-0.990479,-0.547035,-0.733436,0.491065,-1.251415
1,1.764239,0.363084,1.824839,1.772202,0.587236,1.301367,1.478879,2.09737,1.119787,-0.022417,...,1.609,-0.032638,1.521502,1.560719,-0.207337,0.382917,0.84135,1.540941,0.246979,-0.441615
2,0.859073,0.657657,0.883795,0.783064,-0.434606,0.429383,0.109018,0.659037,1.051269,-1.130905,...,0.815884,0.255317,0.871683,0.685023,-0.806505,0.211594,-0.185747,0.594873,0.231333,-0.490641
3,-0.904058,-0.163436,-0.886091,-0.803622,0.293482,-0.556744,-0.474767,-0.492787,-1.191775,0.490276,...,-0.804831,-0.016277,-0.735089,-0.716432,0.217445,-0.570226,-0.61198,-0.533236,-1.04542,-0.3476
4,-0.190476,-0.093851,-0.173874,-0.287084,0.678159,0.193611,-0.028177,0.123785,-0.037797,-0.264679,...,-0.318008,0.248773,-0.310521,-0.370923,0.508086,0.021733,0.362679,0.480429,-0.396091,-0.374132


In [11]:
## Model Training

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [12]:
regression = LogisticRegression()
regression.fit(X_train,y_train)

In [13]:
def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    confusionmat = confusion_matrix(true, predicted)
    report = classification_report(true, predicted)
    return accuracy,confusionmat,report

In [14]:
y_pred = regression.predict(X_test)
accuracyy,confusionmatt,reportt = evaluate_model(y_test,y_pred)

In [15]:
print('Model Training Performance')
print("\nConfusion Matrix:\n",confusionmatt)
print("\nClassification Report: \n\n",reportt)
print("\nAccuracy:",accuracyy)

Model Training Performance

Confusion Matrix:
 [[52  1]
 [ 0 90]]

Classification Report: 

               precision    recall  f1-score   support

           0       1.00      0.98      0.99        53
           1       0.99      1.00      0.99        90

    accuracy                           0.99       143
   macro avg       0.99      0.99      0.99       143
weighted avg       0.99      0.99      0.99       143


Accuracy: 0.993006993006993


In [16]:
models={
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC(kernel='linear'),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Neural Network (MLP)": MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
}
trained_model_list=[]
model_list=[]
accuracy_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    accuracy,confusionmat,report =evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    #print('Model Training Performance')
    print("\nConfusion Matrix:\n",confusionmat)
    print("\nClassification Report: \n\n",report)
    print("\nAccuracy:",accuracy*100)

    accuracy_list.append(accuracy)
    
    print('='*35)


Logistic Regression

Confusion Matrix:
 [[52  1]
 [ 0 90]]

Classification Report: 

               precision    recall  f1-score   support

           0       1.00      0.98      0.99        53
           1       0.99      1.00      0.99        90

    accuracy                           0.99       143
   macro avg       0.99      0.99      0.99       143
weighted avg       0.99      0.99      0.99       143


Accuracy: 99.3006993006993
Support Vector Machine

Confusion Matrix:
 [[52  1]
 [ 2 88]]

Classification Report: 

               precision    recall  f1-score   support

           0       0.96      0.98      0.97        53
           1       0.99      0.98      0.98        90

    accuracy                           0.98       143
   macro avg       0.98      0.98      0.98       143
weighted avg       0.98      0.98      0.98       143


Accuracy: 97.9020979020979
Naive Bayes

Confusion Matrix:
 [[47  6]
 [ 5 85]]

Classification Report: 

               precision    recall  f1

In [17]:
my_dict = dict(zip(model_list, accuracy_list))
my_dict

{'Logistic Regression': 0.993006993006993,
 'Support Vector Machine': 0.9790209790209791,
 'Naive Bayes': 0.9230769230769231,
 'Random Forest': 0.9440559440559441,
 'Gradient Boosting': 0.9370629370629371,
 'Decision Tree': 0.8951048951048951,
 'Neural Network (MLP)': 0.965034965034965}

In [18]:
'''
import pickle

# Find the index of the model with the highest R-squared score
best_model_index = accuracy_list.index(max(accuracy_list))

# Get the best model from the list of trained models
best_model = list(models.values())[best_model_index]

# Save the best model to a file using pickle
with open('bestt_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

# Loading the best model
with open('bestt_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file) 

'''
print()




In [19]:
# Create a dictionary of classifiers and their respective hyperparameter grids for tuning
classifiers = {
    "Logistic Regression": {
        "model": LogisticRegression(),
        "params": {
            "penalty": ["l1", "l2"],
            "C": [0.01, 0.1, 1, 10]
        }
    },
    "Support Vector Machine": {
        "model": SVC(),
        "params": {
            "C": [0.1, 1, 10],
            "kernel": ["linear", "rbf"]
        }
    },
    "Naive Bayes": {
        "model": GaussianNB(),
        "params": {}
    },
    "Random Forest": {
        "model": RandomForestClassifier(),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4]
        }
    },
    "Gradient Boosting": {
        "model": GradientBoostingClassifier(),
        "params": {
            "n_estimators": [50, 100, 200],
            "learning_rate": [0.01, 0.1, 0.2],
            "max_depth": [3, 4, 5]
        }
    },
    "Decision Tree": {
        "model": DecisionTreeClassifier(),
        "params": {
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4]
        }
    },
    "Neural Network (MLP)": {
        "model": MLPClassifier(),
        "params": {
            "hidden_layer_sizes": [(50, 50), (100, 50), (100, 100)],
            "alpha": [0.0001, 0.001, 0.01],
            "max_iter": [1000]
        }
    }
}

In [20]:
from sklearn.model_selection import GridSearchCV
best_params_list = []
# Perform hyperparameter tuning and evaluation for each classifier
for name, clf_info in classifiers.items():
    model = clf_info["model"]
    params = clf_info["params"]

    grid_search = GridSearchCV(model, params, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Make predictions with the best model
    y_pred = best_model.predict(X_test)

    # Evaluate the model and store results
    accuracy, confusionmat, report = evaluate_model(y_test, y_pred)
    
    # Append best params and accuracy to lists
    best_params_list.append((name, best_params))
    accuracy_list.append((name, accuracy))

    print(f"Classifier: {name}")
    print(f"Best Parameters: {best_params}")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)
    print("=" * 50)

Classifier: Logistic Regression
Best Parameters: {'C': 1, 'penalty': 'l2'}
Accuracy: 0.993006993006993
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        53
           1       0.99      1.00      0.99        90

    accuracy                           0.99       143
   macro avg       0.99      0.99      0.99       143
weighted avg       0.99      0.99      0.99       143

Classifier: Support Vector Machine
Best Parameters: {'C': 0.1, 'kernel': 'linear'}
Accuracy: 0.9790209790209791
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.94      0.97        53
           1       0.97      1.00      0.98        90

    accuracy                           0.98       143
   macro avg       0.98      0.97      0.98       143
weighted avg       0.98      0.98      0.98       143

Classifier: Naive Bayes
Best Parameters: {}
Accuracy: 0.9230769230769231
Classificati

In [21]:
# Print the best parameters and accuracy for each classifier
print("Best Parameters for Each Classifier:")
for name, best_params in best_params_list:
    print(f"Classifier: {name}")
    print(f"Best Parameters: {best_params}")
    print("=" * 50)


Best Parameters for Each Classifier:
Classifier: Logistic Regression
Best Parameters: {'C': 1, 'penalty': 'l2'}
Classifier: Support Vector Machine
Best Parameters: {'C': 0.1, 'kernel': 'linear'}
Classifier: Naive Bayes
Best Parameters: {}
Classifier: Random Forest
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Classifier: Gradient Boosting
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Classifier: Decision Tree
Best Parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 5}
Classifier: Neural Network (MLP)
Best Parameters: {'alpha': 0.0001, 'hidden_layer_sizes': (100, 50), 'max_iter': 1000}
