# Imports 

In [None]:
import pandas as pd
import numpy as np
import joblib
import sys
import os
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import r_regression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, StratifiedKFold
from sklearn.pipeline import Pipeline 
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
from sklearn.base import clone
from typing import Any, Callable, Dict, List, Tuple, Union
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

## Load the data

In [None]:
def load_data(path):
    """Read the CSV file at ``path`` into a pandas DataFrame.

    Args:
        path: Location of the CSV file on disk.

    Returns:
        The DataFrame produced by ``pd.read_csv(path)``.

    Raises:
        FileNotFoundError: If ``path`` is not an existing regular file.
    """
    if os.path.isfile(path):
        return pd.read_csv(path)
    raise FileNotFoundError(f"The file at {path} was not found.")

# Absolute path of the input CSV; load_data raises FileNotFoundError if no file exists there.
path_to_data="/home/user_stel/Assignment-2/data/breast_cancer.csv"
# Raw, unprocessed frame exactly as read by pandas.
data_df=load_data(path_to_data)

#print(data_df.head()) #it should display a 512x32 dataframe

## Preprocessing

In [None]:
def preprocess_data(df, columns_to_drop=None):
    """Return an encoded, mean-imputed copy of ``df``.

    Steps, in order:
      1. Drop every column listed in ``columns_to_drop`` that exists in
         ``df`` (unknown names are ignored).
      2. Replace each non-numeric column with integer codes from a fresh
         ``LabelEncoder``.
      3. Replace missing values in each numeric column with that column's
         mean, using ``SimpleImputer``.

    Args:
        df: Input DataFrame. It is not modified.
        columns_to_drop: Iterable of column names to remove, or ``None``
            (the default) to drop nothing.

    Returns:
        A new DataFrame with the transformations above applied.
    """
    # ``None`` instead of a mutable ``[]`` default: no list is shared between
    # calls, and an explicit ``None`` no longer raises ``TypeError``.
    requested = [] if columns_to_drop is None else list(columns_to_drop)
    present = [col for col in requested if col in df.columns]

    # ``drop`` returns a new frame, so the assignments below never touch the
    # caller's DataFrame.
    df = df.drop(columns=present)

    # Classify the columns up front, before label encoding turns the
    # categorical ones into numbers.
    num_list = df.select_dtypes(include=[np.number]).columns.tolist()
    cat_list = df.select_dtypes(exclude=[np.number]).columns.tolist()

    for col in cat_list:
        df[col] = LabelEncoder().fit_transform(df[col])

    for col in num_list:
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        # SimpleImputer works on 2-D input; flatten back to one column.
        df[col] = imputer.fit_transform(df[[col]]).ravel()

    return df

# Label-encode the non-numeric columns and mean-impute the numeric ones; no columns are dropped here.
data_new_df=preprocess_data(data_df, columns_to_drop=[])

In [None]:
def separate_features_target(df, target, columns_to_remove=None):
    """Split ``df`` into a feature frame and the ``target`` column.

    Args:
        df: Source DataFrame; it is left unchanged.
        target: Name of the label column. It is always excluded from the
            returned features.
        columns_to_remove: Optional list of additional column names to
            exclude from the features. Names absent from ``df`` are ignored.

    Returns:
        ``(X, y)`` where ``X`` is ``df`` without the excluded columns and
        ``y`` is ``df[target]``.
    """
    extra = [] if columns_to_remove is None else columns_to_remove
    excluded = set(extra + [target])
    doomed = [name for name in df.columns if name in excluded]
    features = df.drop(columns=doomed)
    return features, df[target]

# Split the preprocessed frame into the feature matrix X and the 'diagnosis' target y.
X, y=separate_features_target(data_new_df, target='diagnosis', columns_to_remove=None)
#print(X)
#print(y)

In [None]:
def select_features(X, y, threshold=0.1):
    """Pick the features whose correlation with ``y`` is strong enough.

    The correlation of every column of ``X`` with ``y`` is computed with
    scikit-learn's ``r_regression``; a column is kept when the absolute
    value of its coefficient is at least ``threshold``.

    Args:
        X: Feature DataFrame.
        y: Target values aligned with the rows of ``X``.
        threshold: Minimum absolute correlation required to keep a feature.

    Returns:
        ``(selected_features, correlations)``: the list of kept column
        names and a Series of all coefficients indexed by column name.
    """
    coefficients = r_regression(X, y)
    correlations = pd.Series(coefficients, index=X.columns)

    strong_enough = (correlations.abs() >= threshold).to_numpy()
    selected_features = correlations.index[strong_enough].tolist()

    print(f"The selected features of {X.shape[1]} were: {len(selected_features)}")
    return selected_features, correlations

# Keep only the features whose absolute correlation with the target is at least 0.5.
selected_features, correlations = select_features(X, y, threshold=0.5)
print(selected_features)

# Feature matrix restricted to the selected columns.
X_selected = X[selected_features]
selected_feature_names = X_selected.columns.tolist()

# Reduced frame: the selected features first, then the target, so the
# 'diagnosis' column ends up last.
target = 'diagnosis'
data_selected_df = data_new_df[[*selected_feature_names, target]]

In [None]:
# Candidate classifiers, keyed by the name that also indexes ``param_grid``.
models = {
    'LogisticRegression-elasticnet': LogisticRegression(
        penalty='elasticnet', solver='saga', random_state=0, max_iter=10000
    ),
    'GaussianNB': GaussianNB(),
    'LDA': LinearDiscriminantAnalysis(),
    'SVC': SVC(random_state=0),
    'RandomForest': RandomForestClassifier(random_state=0),
    # The class is imported directly (``from lightgbm import LGBMClassifier``);
    # there is no ``lgb`` module alias in this file, so ``lgb.LGBMClassifier``
    # would raise NameError.
    'LightGBM': LGBMClassifier(random_state=0),
}

# Hyperparameter search space per model. A value is either one grid (dict)
# or a list of grids when some parameters only apply to certain settings.
param_grid = {
    'LogisticRegression-elasticnet': {
        'C': [0.01, 0.1, 1, 10],
        'l1_ratio': [0.0, 0.25, 0.5, 0.75, 1.0],
    },
    'GaussianNB': {
        'var_smoothing': np.logspace(-9, -1, 9),
    },
    # 'shrinkage' is searched only together with the 'lsqr' and 'eigen' solvers.
    'LDA': [
        {'solver': ['svd']},
        {'solver': ['lsqr', 'eigen'], 'shrinkage': [None, 'auto']},
    ],
    # 'gamma' is searched only together with the 'rbf' kernel.
    'SVC': [
        {'kernel': ['linear'], 'C': [0.1, 1, 10]},
        {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto', 0.01, 0.1]},
    ],
    'RandomForest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
    },
    'LightGBM': {
        'n_estimators': [100, 200],
        'num_leaves': [31, 50, 100],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [-1, 10, 20],
    },
}

In [None]:
# Slightly modified version of the generate_param_combinations function from the previous assignment:
# in this project param_grid contains a list of dictionaries (instead of a single dictionary) for two of the models.
def generate_param_combintions(param_grid):
    """Expand every model's grid into an explicit list of parameter dicts.

    Args:
        param_grid: Mapping of model name to either one grid (a dict of
            ``parameter -> list of candidate values``) or a list of such
            grids.

    Returns:
        A dict mapping each model name to the list of all parameter
        combinations. For a list of grids, the combinations of each grid
        are concatenated in the order the grids appear.
    """
    def expand(grid):
        # Cartesian product of the candidate values, re-keyed by parameter name.
        names = list(grid.keys())
        return [dict(zip(names, combo)) for combo in itertools.product(*grid.values())]

    expanded = {}
    for model_name, spec in param_grid.items():
        # Treat a single grid as a one-element list so both shapes share a path.
        sub_grids = spec if isinstance(spec, list) else [spec]
        expanded[model_name] = [
            combination for grid in sub_grids for combination in expand(grid)
        ]
    return expanded

# Expand every model's grid into an explicit list of parameter dicts.
model_combinations=generate_param_combintions(param_grid)
# Tabular overview with one row per model (orient='index').
model_combo_df_summary = pd.DataFrame.from_dict(model_combinations, orient='index')
#print(model_combo_df_summary)

In [None]:
# Show the reduced frame (selected features plus the target column).
print(data_selected_df)

In [None]:
# Rebuild X and y from the reduced frame, then hold out 20% of the rows
# as a test set using a fixed seed.
X, y = separate_features_target(
    data_selected_df, target='diagnosis', columns_to_remove=None
)
X_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(X_train)

In [None]:
def _prefix_param_grid(grid, prefix):
    """Return ``grid`` with every parameter name prefixed by ``prefix``.

    ``grid`` is either one dict or a list of dicts; the same shape is returned.
    """
    if isinstance(grid, dict):
        return {f"{prefix}{name}": values for name, values in grid.items()}
    return [_prefix_param_grid(sub_grid, prefix) for sub_grid in grid]


def ncv_model(df, models, param_grid, n_rounds, outer_cv, inner_cv, columns_to_remove=None):
    """Evaluate every model with repeated nested cross-validation.

    For each of ``n_rounds`` rounds the data is split into outer folds. On
    every outer training fold, each model is tuned with ``GridSearchCV``
    (inner folds given by ``inner_cv``, selection metric accuracy) and the
    refitted best estimator is scored with macro F1 on the held-out outer
    fold.

    Scaling is a step of the tuned pipeline, so the ``StandardScaler`` is
    fitted only on the training portion of each inner and outer split and
    no validation or test rows influence it.

    Args:
        df: DataFrame containing the features and a ``'diagnosis'`` column,
            which is used as the target.
        models: Mapping of model name to an unfitted estimator.
        param_grid: Mapping of model name to a grid (dict) or a list of
            grids, in the format accepted by ``GridSearchCV``.
        n_rounds: Number of repetitions of the outer cross-validation.
        outer_cv: Number of outer folds, or a splitter object exposing
            ``split(X, y)``. With an integer, a shuffled ``StratifiedKFold``
            seeded with the round index is used, so rounds differ from each
            other while the whole run stays reproducible.
        inner_cv: ``cv`` argument forwarded to ``GridSearchCV``.
        columns_to_remove: Optional list of extra columns to exclude from
            the features.

    Returns:
        A flat list of macro-F1 scores, ordered by round, then outer fold,
        then model (in the iteration order of ``models``).
    """
    X, y = separate_features_target(df, target='diagnosis', columns_to_remove=columns_to_remove)

    # Normalise the column names once, before any split, so every train and
    # test frame derived from X carries the same names.
    X.columns = X.columns.str.replace(' ', '_')

    all_f1_scores = []

    for round_idx in range(n_rounds):
        if hasattr(outer_cv, 'split'):
            outer_splitter = outer_cv
        else:
            # A different seed per round gives a different partition each time.
            outer_splitter = StratifiedKFold(
                n_splits=outer_cv, shuffle=True, random_state=round_idx
            )

        for train_idx, test_idx in outer_splitter.split(X, y):
            X_train, x_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            for model_name, model in models.items():
                # Scaler inside the pipeline: GridSearchCV refits it on each
                # inner training fold instead of seeing the validation rows.
                pipeline = Pipeline([
                    ('scaler', StandardScaler()),
                    ('model', clone(model)),
                ])
                gs = GridSearchCV(
                    estimator=pipeline,
                    param_grid=_prefix_param_grid(param_grid[model_name], 'model__'),
                    cv=inner_cv, scoring='accuracy', n_jobs=1, refit=True,
                )
                gs.fit(X_train, y_train)

                # refit=True: best_estimator_ is the best pipeline refitted on
                # the whole outer training fold (scaler included).
                y_pred = gs.best_estimator_.predict(x_test)

                # Calculate the F1 score on the outer test fold
                f1 = f1_score(y_test, y_pred, average='macro')
                all_f1_scores.append(f1)

                print(f"Model: {model_name}")
                print(f"Outer Test Set F1 Score: {f1:.3f}")
                print('-' * 50)

    return all_f1_scores
    
# Evaluate every entry of `models` with its grid from `param_grid`; `results` receives the list of F1 scores returned by ncv_model.
results=ncv_model(data_selected_df, models=models, param_grid=param_grid, n_rounds=10, outer_cv=5, inner_cv=3, columns_to_remove=None)