# Imports 

In [1]:
import pandas as pd
import numpy as np
import joblib
import sys
import os
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import r_regression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, StratifiedKFold
from sklearn.pipeline import Pipeline 
from sklearn.pipeline import make_pipeline
from sklearn.metrics import matthews_corrcoef, roc_auc_score, balanced_accuracy_score, f1_score, recall_score, precision_score, precision_recall_curve, auc, confusion_matrix
from sklearn.base import clone
from sklearn.svm import SVC

In [2]:
# Resolve the parent of the current working directory and register it on the
# import path exactly once, so re-running this cell does not add duplicates.
PROJECT_ROOT = os.path.abspath(os.pardir)
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.append(PROJECT_ROOT)

In [3]:
# Project-local helpers. This import is kept below the sys.path setup because
# `src` is presumably resolved through the PROJECT_ROOT entry added there.
from src.functions import Classifier, NestedCrossVal

# One instance of each helper is created here and reused by the cells below.
classifier = Classifier()
ncv = NestedCrossVal()

# Load the Data 

In [4]:
# Build the dataset location from the project root instead of a user-specific
# absolute path, so the notebook runs from any checkout of the repository.
# NOTE(review): assumes the CSV lives at <PROJECT_ROOT>/data/breast_cancer.csv,
# mirroring the previously hard-coded location — confirm the repository layout.
path_to_data = os.path.join(PROJECT_ROOT, "data", "breast_cancer.csv")

# Fail early with a clear message when the file is missing.
if not os.path.isfile(path_to_data):
    raise FileNotFoundError(f"Dataset not found at: {path_to_data}")

data_df = classifier.load_data(path_to_data)

# Optional sanity check (original author's note: the frame should be 512x32):
# print(data_df.head())

# Preprocessing

In [5]:
# Run the project's preprocessing step on the raw frame. No columns are
# explicitly dropped at this stage (columns_to_drop=None).
data_new_df = classifier.preprocess_data(
    data_df,
    columns_to_drop=None,
)

# Feature Selection

In [6]:
# Split the preprocessed frame into features (X) and the 'diagnosis' target (y).
# No additional columns are removed here (columns_to_remove=None).
X, y = classifier.separate_features_target(
    data_new_df,
    target='diagnosis',
    columns_to_remove=None,
)
# Uncomment to inspect the split:
# print(X)
# print(y)

In [7]:
# Ask the project helper for the feature subset at threshold 0.5. It also
# returns `correlations` (presumably per-feature scores against the target —
# confirm in src.functions).
selected_features, correlations = classifier.select_features(X, y, threshold=0.5)
print(selected_features)

# Feature matrix restricted to the selected columns.
X_selected = X[selected_features]
# print(X_selected)

# Rebuild a frame holding the selected features plus the target. The target is
# appended after the features, so 'diagnosis' ends up as the last column.
target = 'diagnosis'
selected_feature_names = list(X_selected.columns)
data_selected_df = data_new_df[[*selected_feature_names, target]]
# print(data_selected_df)

The selected features of 31 were: 15
['radius_mean', 'perimeter_mean', 'area_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'radius_se', 'perimeter_se', 'area_se', 'radius_worst', 'perimeter_worst', 'area_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst']


### Model initialization 

In [7]:
# Base SVC estimator with a fixed seed.
model = SVC(random_state=0)

# Search space as two sub-grids, so `gamma` is only listed for the RBF kernel:
# a linear sub-grid with three C values and an RBF sub-grid with three C values
# and four gamma settings.
svc_param_grid = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10],
    },
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto', 0.01, 0.1],
    },
]

In [8]:
# Expand the grid into explicit parameter combinations via the project helper,
# then tabulate them (one row per dictionary key) and print the table.
svc_param_combinations = ncv.generate_param_combinations(param_grid=svc_param_grid)
svc_combo_df_summary = pd.DataFrame.from_dict(
    svc_param_combinations,
    orient='index',
)
print(svc_combo_df_summary)

                                        0                             1   \
__winner__  {'kernel': 'linear', 'C': 0.1}  {'kernel': 'linear', 'C': 1}   

                                       2   \
__winner__  {'kernel': 'linear', 'C': 10}   

                                                       3   \
__winner__  {'kernel': 'rbf', 'C': 0.1, 'gamma': 'scale'}   

                                                      4   \
__winner__  {'kernel': 'rbf', 'C': 0.1, 'gamma': 'auto'}   

                                                    5   \
__winner__  {'kernel': 'rbf', 'C': 0.1, 'gamma': 0.01}   

                                                   6   \
__winner__  {'kernel': 'rbf', 'C': 0.1, 'gamma': 0.1}   

                                                     7   \
__winner__  {'kernel': 'rbf', 'C': 1, 'gamma': 'scale'}   

                                                    8   \
__winner__  {'kernel': 'rbf', 'C': 1, 'gamma': 'auto'}   

                                            

# Hyperparameter Tuning using 5-fold Cross Validation

In [12]:
# Tune the SVC over `svc_param_grid` with cv=5. The estimator class itself is
# passed, not an instance.
# NOTE(review): tuning runs on the full feature matrix X, not on X_selected
# built in the feature-selection cell — confirm this is intended.
svc_results = classifier.model_tuning(
    SVC,
    X,
    y,
    param_grid=svc_param_grid,
    cv=5,
)

[SVC] Tested params: {'kernel': 'linear', 'C': 0.1}
[SVC] AUC: 0.9884
[SVC] New best AUC: 0.9884
[SVC] Best params so far: {'kernel': 'linear', 'C': 0.1}
[SVC] Tested params: {'kernel': 'linear', 'C': 1}
[SVC] AUC: 0.9873
[SVC] Tested params: {'kernel': 'linear', 'C': 10}
[SVC] AUC: 0.9873
[SVC] Tested params: {'kernel': 'rbf', 'C': 0.1, 'gamma': 'scale'}
[SVC] AUC: 0.9606
[SVC] Tested params: {'kernel': 'rbf', 'C': 0.1, 'gamma': 'auto'}
[SVC] AUC: 0.5296
[SVC] Tested params: {'kernel': 'rbf', 'C': 0.1, 'gamma': 0.01}
[SVC] AUC: 0.7013
[SVC] Tested params: {'kernel': 'rbf', 'C': 0.1, 'gamma': 0.1}
[SVC] AUC: 0.5000
[SVC] Tested params: {'kernel': 'rbf', 'C': 1, 'gamma': 'scale'}
[SVC] AUC: 0.9669
[SVC] Tested params: {'kernel': 'rbf', 'C': 1, 'gamma': 'auto'}
[SVC] AUC: 0.5358
[SVC] Tested params: {'kernel': 'rbf', 'C': 1, 'gamma': 0.01}
[SVC] AUC: 0.7303
[SVC] Tested params: {'kernel': 'rbf', 'C': 1, 'gamma': 0.1}
[SVC] AUC: 0.5000
[SVC] Tested params: {'kernel': 'rbf', 'C': 10, 'gamm

In [13]:
# Unpack the winning estimator, its hyperparameters and its score from the
# tuning results dictionary.
best_model, best_params, best_auc = (
    svc_results[key] for key in ('Best Model', 'Best Params', 'Best AUC')
)

# Report the outcome of the search.
for line in (
    f"Best AUC:   {best_auc:.4f}",
    f"Best Model: {best_model}",
    f"Best Params: {best_params}",
):
    print(line)

Best AUC:   0.9884
Best Model: SVC(C=0.1, kernel='linear')
Best Params: {'kernel': 'linear', 'C': 0.1}


# Training 

In [10]:
# Fit the winning configuration and persist it to `save_path`.
# The estimator is built from `best_params` (the tuning result) rather than a
# hand-copied SVC(kernel='linear', C=0.1), so this cell cannot drift out of sync
# with the search when the data or the grid changes.
classifier.train_final_model(
    SVC(**best_params),
    X,
    y,
    save_path="./winner_model.pkl",
    scale=True,
)

Final model trained on all data and saved to ./winner_model.pkl


In [16]:
# Score the tuned configuration with the project's helper.
# The estimator is built from `best_params` (the tuning result) rather than a
# hand-copied SVC(kernel='linear', C=0.1), so the reported AUC always belongs to
# the configuration the search actually selected.
svc_auc = classifier.train_tuned_model(
    SVC(**best_params),
    X,
    y,
    scale=True,
)

Model: SVC AUC: 0.9861
