# Imports 

In [1]:
import pandas as pd
import numpy as np
import joblib
import sys
import os
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import r_regression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, StratifiedKFold
from sklearn.pipeline import Pipeline 
from sklearn.pipeline import make_pipeline
from sklearn.metrics import matthews_corrcoef, roc_auc_score, balanced_accuracy_score, f1_score, recall_score, precision_score, precision_recall_curve, auc, confusion_matrix
from sklearn.base import clone
from sklearn.svm import SVC

In [2]:
# Absolute path of the parent of the current working directory; it is put on
# sys.path (once) so that the project's ``src`` package can be imported.
PROJECT_ROOT = os.path.abspath("..")
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.append(PROJECT_ROOT)

In [3]:
# NestedCrossVal is a project-local class imported from the src.functions module
# (presumably resolvable because PROJECT_ROOT was appended to sys.path earlier).
from src.functions import NestedCrossVal
# Single shared instance; its generate_param_combinations() method is called
# later in this notebook to expand the SVC parameter grid.
ncv=NestedCrossVal()

# Load the Data 

In [4]:
def load_data(path):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file, as returned by ``pd.read_csv``.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not point to an existing regular file.
    """
    # Check up front so the caller gets a clear message that names the path.
    if os.path.isfile(path):
        return pd.read_csv(path)
    raise FileNotFoundError(f"The file at {path} was not found.")

# Build the CSV location from PROJECT_ROOT instead of a machine-specific
# absolute path, so the notebook runs on any checkout that keeps the
# <project root>/data/breast_cancer.csv layout.
path_to_data = os.path.join(PROJECT_ROOT, "data", "breast_cancer.csv")
data_df = load_data(path_to_data)

# Quick check: data_df.head() previews the first rows; the full frame is
# expected to have 512 rows and 32 columns.

# Preprocessing

In [5]:
def preprocess_data(df, columns_to_drop=None):
    """Drop unwanted columns, label-encode categoricals and mean-impute numerics.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is left untouched: ``DataFrame.drop`` returns a new
        frame and every later assignment is made on that new frame.
    columns_to_drop : iterable of column labels or str, optional
        Columns to remove before any other processing. Labels that are not
        present in ``df`` are ignored. A single string is treated as one
        column label. ``None`` (the default) drops nothing.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every non-numeric column is replaced by the
        integer codes of a ``LabelEncoder`` fitted on that column, and every
        numeric column has its NaN entries replaced by that column's mean.
    """
    # ``None`` sentinel rather than a mutable ``[]`` default, in line with
    # ``separate_features_target``.
    if columns_to_drop is None:
        columns_to_drop = []
    elif isinstance(columns_to_drop, str):
        # A bare string would otherwise be iterated character by character
        # and silently drop nothing.
        columns_to_drop = [columns_to_drop]

    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

    # Both lists are computed before any encoding, so label-encoded columns
    # are not passed through the imputer afterwards.
    num_list = df.select_dtypes(include=[np.number]).columns.tolist()
    cat_list = df.select_dtypes(exclude=[np.number]).columns.tolist()

    for col in cat_list:
        df[col] = LabelEncoder().fit_transform(df[col])

    for col in num_list:
        # The imputer expects 2-D input, hence df[[col]]; ravel() flattens the
        # single-column result back to 1-D for assignment.
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        df[col] = imputer.fit_transform(df[[col]]).ravel()

    return df

# Preprocess the loaded frame without dropping any columns (empty
# columns_to_drop list); the result is stored under a new name.
data_new_df=preprocess_data(data_df, columns_to_drop=[])

# Feature Selection

In [6]:
def separate_features_target(df, target, columns_to_remove=None):
    """Split a frame into a feature matrix and a target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both the features and the target column.
    target : column label
        Name of the target column; it is always excluded from the features.
    columns_to_remove : iterable of column labels or str, optional
        Additional columns to exclude from the features. Any iterable (list,
        tuple, set, ...) is accepted, and a single string is treated as one
        column label. Labels not present in ``df`` are ignored.

    Returns
    -------
    X : pandas.DataFrame
        ``df`` without the target and the removed columns, in original order.
    y : pandas.Series
        The target column.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    if columns_to_remove is None:
        excluded = set()
    elif isinstance(columns_to_remove, str):
        # Avoid splitting a single label into its characters.
        excluded = {columns_to_remove}
    else:
        # set() accepts any iterable, unlike the former ``list + [target]``.
        excluded = set(columns_to_remove)
    excluded.add(target)

    y = df[target]
    X = df.drop(columns=[col for col in df.columns if col in excluded])
    return X, y

# Split the preprocessed frame into the feature matrix X and the target y
# (the 'diagnosis' column); no additional columns are excluded.
X, y=separate_features_target(data_new_df, target='diagnosis', columns_to_remove=None)
# Uncomment to inspect the split:
#print(X)
#print(y)

In [7]:
def select_features(X, y, threshold=0.1):
    """Pick the features whose correlation with the target is strong enough.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column labels are used to name the features.
    y : array-like
        Target values passed to ``r_regression`` together with ``X``.
    threshold : float, default 0.1
        Minimum absolute coefficient a feature needs in order to be kept.

    Returns
    -------
    selected_features : list
        Labels of the columns whose absolute coefficient is >= ``threshold``.
    correlations : pandas.Series
        Coefficient returned by ``r_regression`` for every column of ``X``,
        indexed by column label.
    """
    coefficients = r_regression(X, y)
    correlations = pd.Series(coefficients, index=X.columns)

    # Compare magnitudes so strongly negative correlations are kept as well.
    strong_enough = correlations.abs() >= threshold
    selected_features = correlations.index[strong_enough.to_numpy()].tolist()

    total_features = X.shape[1]
    print(f"The selected features of {total_features} were: {len(selected_features)}")
    return selected_features, correlations

# Keep the features whose absolute correlation with the target is at least 0.5.
# NOTE(review): the selection is computed on all rows of X and y; if the
# cross-validation in later sections runs on data_selected_df, the selection
# has already seen the validation folds. Confirm whether this is intended.
selected_features, correlations = select_features(X, y, threshold=0.5)
print(selected_features)

# Feature matrix restricted to the selected columns.
X_selected = X[selected_features]

# Rebuild one frame holding the selected features followed by the target,
# so the diagnosis column ends up last.
target = 'diagnosis'
selected_feature_names = list(X_selected.columns)
data_selected_df = data_new_df[[*selected_feature_names, target]]

The selected features of 31 were: 15
['radius_mean', 'perimeter_mean', 'area_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'radius_se', 'perimeter_se', 'area_se', 'radius_worst', 'perimeter_worst', 'area_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst']


### Model initialization 

In [8]:
# Base estimator with a fixed random_state; the search below varies its
# kernel, C and gamma settings.
model = SVC(random_state=0)

# Two sub-grids, so gamma is only varied for the RBF kernel:
# 3 linear settings + 3 * 4 RBF settings = 15 combinations in total.
svc_param_grid = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10],
    },
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto', 0.01, 0.1],
    },
]

In [9]:
# Expand the parameter grid into explicit combinations using the project helper.
svc_param_combinations=ncv.generate_param_combinations(param_grid=svc_param_grid)
# Tabulate the result with one row per key of the returned mapping; the printed
# output shows a single '__winner__' row whose columns are the combinations.
svc_combo_df_summary = pd.DataFrame.from_dict(svc_param_combinations, orient='index')
print(svc_combo_df_summary)

                                        0                             1   \
__winner__  {'kernel': 'linear', 'C': 0.1}  {'kernel': 'linear', 'C': 1}   

                                       2   \
__winner__  {'kernel': 'linear', 'C': 10}   

                                                       3   \
__winner__  {'kernel': 'rbf', 'C': 0.1, 'gamma': 'scale'}   

                                                      4   \
__winner__  {'kernel': 'rbf', 'C': 0.1, 'gamma': 'auto'}   

                                                    5   \
__winner__  {'kernel': 'rbf', 'C': 0.1, 'gamma': 0.01}   

                                                   6   \
__winner__  {'kernel': 'rbf', 'C': 0.1, 'gamma': 0.1}   

                                                     7   \
__winner__  {'kernel': 'rbf', 'C': 1, 'gamma': 'scale'}   

                                                    8   \
__winner__  {'kernel': 'rbf', 'C': 1, 'gamma': 'auto'}   

                                            

# Hyperparameter Tuning using 5-fold Cross Validation