In [10]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm

In [2]:
# Load the raw mammographic-masses file. Column names are supplied explicitly
# and every '?' entry is parsed as a missing value (NaN).
cancer_dataset = pd.read_csv(
    'mammographic_masses.data',
    names=['BI-RADS', 'age', 'shape', 'margin', 'density', 'severity'],
    na_values=['?'],
)
cancer_dataset.head(10)

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1
5,4.0,65.0,1.0,,3.0,0
6,4.0,70.0,,,3.0,0
7,5.0,42.0,1.0,,3.0,0
8,5.0,57.0,1.0,5.0,3.0,1
9,5.0,60.0,,5.0,1.0,1


In [3]:
# Summary statistics of the raw frame; the per-column counts differ because
# some entries are missing.
cancer_dataset.describe()

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
count,959.0,956.0,930.0,913.0,885.0,961.0
mean,4.348279,55.487448,2.721505,2.796276,2.910734,0.463059
std,1.783031,14.480131,1.242792,1.566546,0.380444,0.498893
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,45.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [4]:
# Keep only complete records: any row containing a missing value is discarded.
# The name is rebound to the filtered frame rather than mutating it in place.
cancer_dataset = cancer_dataset.dropna()
cancer_dataset.describe()

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
count,830.0,830.0,830.0,830.0,830.0,830.0
mean,4.393976,55.781928,2.781928,2.813253,2.915663,0.485542
std,1.888371,14.671782,1.242361,1.567175,0.350936,0.500092
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,46.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [5]:
# Target: the 'severity' column. Features: every other column except
# 'BI-RADS', which is left out of the model inputs.
Y = cancer_dataset['severity']
X = cancer_dataset.drop(columns=['BI-RADS', 'severity'])

In [6]:
# Scaler used to standardize the feature columns (fitted in the next cell).
scaler = StandardScaler()

In [7]:
# Standardize the features; X is rebound from a DataFrame to a NumPy array here.
# NOTE(review): the scaler is fitted on every row before any train/test split,
# so statistics of the future test rows influence the scaling (data leakage).
# Consider fitting on the training split only, e.g. with a sklearn Pipeline.
X = scaler.fit_transform(X)
X

array([[ 0.7650629 ,  0.17563638,  1.39618483,  0.24046607],
       [ 0.15127063,  0.98104077,  1.39618483,  0.24046607],
       [-1.89470363, -1.43517241, -1.157718  ,  0.24046607],
       ...,
       [ 0.56046548,  0.98104077,  1.39618483,  0.24046607],
       [ 0.69686376,  0.98104077,  1.39618483,  0.24046607],
       [ 0.42406719,  0.17563638,  0.11923341,  0.24046607]])

In [8]:
# Stratified 80/20 hold-out split with a fixed seed.
# NOTE(review): evaluate_models() performs this same split internally from X
# and Y, so these module-level variables are not read by any cell visible here.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [11]:
def train_and_evaluate_model(classifier, X_train, Y_train, X_test, Y_test):
    """Fit ``classifier`` on the training split and score it on both splits.

    Args:
        classifier: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        Y_train: Training labels.
        X_test: Held-out features.
        Y_test: Held-out labels.

    Returns:
        Tuple ``(training_accuracy, testing_accuracy, classifier)``. Both
        accuracies are computed with ``accuracy_score``; ``classifier`` is the
        same object that was passed in, after fitting.
    """
    classifier.fit(X_train, Y_train)

    # Score the training split first, then the held-out split.
    train_score, test_score = (
        accuracy_score(labels, classifier.predict(features))
        for features, labels in ((X_train, Y_train), (X_test, Y_test))
    )

    return train_score, test_score, classifier


In [None]:
def random_forest_classifier(X_train, Y_train, X_test, Y_test):
    """Tune a random forest with a 5-fold grid search, then evaluate the winner.

    The search covers 3 * 4 * 3 * 3 = 108 hyper-parameter combinations and
    ranks them by cross-validated accuracy on the training split.

    Args:
        X_train: Training features.
        Y_train: Training labels.
        X_test: Held-out features.
        Y_test: Held-out labels.

    Returns:
        Whatever ``train_and_evaluate_model`` returns for the best estimator
        found by the search.
    """
    search_space = dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 10, 20, 30],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    )

    tuner = GridSearchCV(
        estimator=RandomForestClassifier(random_state=2),
        param_grid=search_space,
        scoring="accuracy",
        cv=5,
        n_jobs=-1,
        verbose=2,
    )
    tuner.fit(X_train, Y_train)

    # Hand the best configuration to the shared fit-and-score helper.
    return train_and_evaluate_model(
        tuner.best_estimator_, X_train, Y_train, X_test, Y_test
    )


In [19]:
def svm_classifier(X_train, Y_train, X_test, Y_test):
    """Tune an SVM with a 5-fold grid search, then evaluate the winner.

    The search space is split per kernel so that only hyper-parameters the
    kernel actually uses are varied: ``degree`` applies to the ``poly`` kernel
    only, and ``gamma`` has no effect on the ``linear`` kernel. A single
    Cartesian grid over all four keys would fit 216 candidates, many of them
    duplicates of one another; the per-kernel grids cover the same distinct
    models with 4 + 24 + 72 = 100 candidates.

    Args:
        X_train: Training features.
        Y_train: Training labels.
        X_test: Held-out features.
        Y_test: Held-out labels.

    Returns:
        Whatever ``train_and_evaluate_model`` returns for the best estimator
        found by the search.
    """
    c_values = [0.1, 1, 10, 100]
    gamma_values = ['scale', 'auto', 0.001, 0.01, 0.1, 1]

    param_grid = [
        {'kernel': ['linear'], 'C': c_values},
        {'kernel': ['rbf'], 'C': c_values, 'gamma': gamma_values},
        {
            'kernel': ['poly'],
            'C': c_values,
            'gamma': gamma_values,
            'degree': [2, 3, 4],
        },
    ]

    classifier = svm.SVC()

    # n_jobs=-1 runs the fits in parallel, matching the random-forest search.
    grid_search = GridSearchCV(
        classifier,
        param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        verbose=2,
    )
    grid_search.fit(X_train, Y_train)

    best_classifier = grid_search.best_estimator_

    return train_and_evaluate_model(best_classifier, X_train, Y_train, X_test, Y_test)


In [None]:
def gradient_boosting_classifier(X_train, Y_train, X_test, Y_test):
    """Fit and score a gradient-boosting classifier without any tuning.

    The estimator is created with ``random_state=2`` and otherwise default
    settings, then passed to ``train_and_evaluate_model``.

    Args:
        X_train: Training features.
        Y_train: Training labels.
        X_test: Held-out features.
        Y_test: Held-out labels.

    Returns:
        Whatever ``train_and_evaluate_model`` returns for this estimator.
    """
    return train_and_evaluate_model(
        GradientBoostingClassifier(random_state=2),
        X_train,
        Y_train,
        X_test,
        Y_test,
    )

In [15]:
def evaluate_models(models, X, Y):
    """Train every model on one shared split and report the best by test accuracy.

    Args:
        models: Mapping of display name to a callable
            ``f(X_train, Y_train, X_test, Y_test)`` that returns
            ``(training_accuracy, testing_accuracy, trained_model)``.
        X: Feature matrix for all samples.
        Y: Labels for all samples; also used to stratify the split.

    Returns:
        Tuple ``(results, best_model_name, best_model)``. ``results`` maps each
        name to a dict with keys ``"Training Accuracy"`` and
        ``"Testing Accuracy"``. The best model is the first one (in iteration
        order) that reaches the highest testing accuracy; both
        ``best_model_name`` and ``best_model`` are ``None`` when ``models`` is
        empty.

    Note:
        The winner is chosen on the same test split whose accuracy is
        reported, so that reported accuracy is an optimistic estimate.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, stratify=Y, random_state=2
    )

    results = {}
    best_model_name = None
    best_model = None
    # Start below any attainable accuracy so that a model scoring exactly 0.0
    # is still selected instead of leaving the winner as None.
    best_testing_accuracy = float("-inf")

    for model_name, model_function in models.items():
        training_accuracy, testing_accuracy, trained_model = model_function(
            X_train, Y_train, X_test, Y_test
        )
        results[model_name] = {
            "Training Accuracy": training_accuracy,
            "Testing Accuracy": testing_accuracy,
        }

        # Strict comparison: on a tie the earlier model is kept.
        if testing_accuracy > best_testing_accuracy:
            best_testing_accuracy = testing_accuracy
            best_model_name = model_name
            best_model = trained_model

    return results, best_model_name, best_model

In [16]:
models = {
    "Random Forest": random_forest_classifier,
    "SVM": svm_classifier,
    "Gradient Boosting": gradient_boosting_classifier,
}


In [20]:
results, best_model_name, best_model = evaluate_models(models, X, Y)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.2s
[CV] END max_de

In [22]:
for model_name, accuracies in results.items():
    print(
        f"{model_name}: Training Accuracy = {accuracies['Training Accuracy']:.4f}, Testing Accuracy = {accuracies['Testing Accuracy']:.4f}"
    )

print(f"\nBest model based on testing accuracy: {best_model_name}")


Random Forest: Training Accuracy = 0.8253, Testing Accuracy = 0.8133
SVM: Training Accuracy = 0.7952, Testing Accuracy = 0.8193
Gradient Boosting: Training Accuracy = 0.8479, Testing Accuracy = 0.8133

Best model based on testing accuracy: SVM
