In [28]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from B_0_1_Transformer_ColumnSelector import ColumnSelector
from B_0_3_ThresholdClassifier import ThresholdedClassifier
import pickle
import numpy as np
import pandas as pd
from datetime import datetime
import warnings

In [29]:
# Load the stored data bundle from disk.
# NOTE: pickle.load can execute arbitrary code contained in the file, so
# 'strokes_split_data.pkl' must come from a trusted source.
with open('strokes_split_data.pkl', 'rb') as f:
    df = pickle.load(f)

# Training features and training labels as stored under their keys in the bundle.
X_train, Y_train = df["X_train"], df["Y_train"]

In [30]:
' ----------------------Steuerung: verwendete Spalten für ColumnSelector --------------------'
# Control: feature columns that are handed to the ColumnSelector.
# Candidates that are currently switched off:
#   'gender', 'hypertension', 'heart_disease', 'Residence_type',
#   'avg_glucose_level', 'bmi', 'smoking_status', 'age_above_60',
#   'high_glucose', 'did_smoke', 'heart_risk',
#   'at_least_one_risk_and_high_age', 'all_risks', 'risk_sum'
run_model_on_these_column = [
    'age',
    'at_least_one_risk',
]

In [31]:
' ----------------------Steuerung: Soll thresholded Classifier verwendet werden? --------------------'
# Control: should the thresholded classifier be used?
#   thresholded_classifier == 1 -> pipeline built around the ThresholdedClassifier
#   anything else               -> pipeline with the plain SVC
# `threshold` is the initial cut-off passed to the ThresholdedClassifier.
thresholded_classifier, threshold = 1, 0.18

In [32]:
' ################# Hier Parameter initiale Modellsteuerung setzen (für Gridsearch irrelevant) ####################'
"----------------------- Modellparameter ----------------------"
# Hyper-parameters of the initial model; they only steer the first stand-alone
# fit and are irrelevant for the grid search, which sets its own values.
C = 0.005
classweight_y = 10
gamma = 0.1
' ############# Modellinstanz #########'
# Model instance: every SVC argument is spelled out explicitly.
svc = SVC(
    C=C,  # increasing C makes the decision boundaries sharper
    kernel='rbf',  # rbf is used throughout
    degree=3,  # not relevant for rbf
    gamma=gamma,  # relevant: large values create small regions and vice versa
    coef0=0.0,  # not relevant for rbf
    shrinking=True,  # affects runtime depending on the iteration count; qualitatively irrelevant here
    probability=True,  # controls whether probabilities are produced
    tol=0.001,  # tolerance of the stopping criterion
    cache_size=200,  # kernel cache size in MB (resource setting only)
    class_weight={0: 1, 1: classweight_y},  # IMPORTANT: weight of the positive class
    verbose=False,  # printing parameter
    max_iter=-1,  # iterations, -1 = unlimited
    decision_function_shape='ovo',  # ovr = one vs. rest, ovo = one vs. one
    break_ties=False,  # matters for ovr
    random_state=42,
)

In [33]:
' ###################### Instanz Spaltentransformator ##############'
# Column transformer instance, restricted to the configured feature columns.
column_selector = ColumnSelector(columns=run_model_on_these_column)
' ############### Instanz Scaler #############'
# Scaler instance.
standard_scaler = StandardScaler()
' ###################### Instanz Thresholded Classifier ############'
# Thresholded classifier instance wrapping the SVC with the initial threshold.
thresholded_svc = ThresholdedClassifier(threshold=threshold, classifier=svc)

In [34]:
' ################ Pipeline-Selektion aufgrund von Steuerung ganz oben ####'
# Pipeline selection driven by the `thresholded_classifier` switch.
# Both variants begin with the same two preprocessing steps.
shared_steps = [
    ("columnselector", column_selector),
    ("scaler", standard_scaler),
]
if thresholded_classifier != 1:
    ' ##################### Pipeline mit normalem Classifier ###########'
    # Pipeline that ends in the plain SVC.
    pipeline_regular_classifier = Pipeline(
        shared_steps + [("regularclassifier", svc)]
    )
    chosen_pipeline = pipeline_regular_classifier
else:
    ' ##################### Pipeline mit Threshold-Classifier ###########'
    # Pipeline that ends in the threshold-wrapped SVC.
    pipeline_thresholded_classifier = Pipeline(
        shared_steps + [("thresholdclassifier", thresholded_svc)]
    )
    chosen_pipeline = pipeline_thresholded_classifier
' ################ Fitting ###########'
# Fit the selected pipeline on the training data (last expression, so the
# notebook shows the fitted pipeline).
chosen_pipeline.fit(X_train, Y_train)

In [106]:
############# Select the scope of the grid search ###############
# "rough" -> coarse grid over a wide range, "fine" -> small refined grid.
scope_of_gridsearch = "fine"
############################################################
if scope_of_gridsearch == "rough":  # coarse surface
    # Every element of column_combinations is ONE candidate value for the
    # ColumnSelector's `columns` parameter, so a column set has to be wrapped
    # in its own list. A flat list of names would make the search try each
    # name as a separate candidate.
    column_combinations = [["age", "at_least_one_risk"]]
    # Class weight dictionaries: class 0 fixed at 1, class 1 varied.
    y_weights = list(range(1, 3, 1))
    list_weight_classes = [{0: 1, 1: value} for value in y_weights]
    list_of_C_values = np.arange(0.005, 1.05, 0.1)  # coarser C grid
    list_of_gamma_values = [0.01, 0.05, 0.1, 0.15, 0.2]  # coarser gamma grid
    thresholds = np.arange(0.02, 0.49, 0.01)
    grid_search_name = "NOT_CONDUCTED_YET_ROUGH.pkl"
elif scope_of_gridsearch == "fine":  # refined grid
    column_combinations = [["age", "at_least_one_risk"], ["age", "risk_sum"]]
    # Class weight dictionaries: class 0 fixed at 1, class 1 varied.
    y_weights = list(range(1, 3, 1))
    list_weight_classes = [{0: 1, 1: value} for value in y_weights]
    # With a step of 1 this yields exactly two values: 0.005 and 1.005.
    list_of_C_values = np.arange(0.005, 1.05, 1)
    list_of_gamma_values = [0.5, 1]
    thresholds = np.arange(0.02, 0.03, 0.005)
    grid_search_name = "gridsearch_multistep_pipeline.pkl"
else:
    # Fail right here instead of with a NameError further down.
    raise ValueError(
        f"unknown scope_of_gridsearch {scope_of_gridsearch!r}; expected 'rough' or 'fine'"
    )

# Size of each grid dimension (their product is the number of candidates).
print("columns: ", len(column_combinations))
print("thresholds: ", len(thresholds))
print("weights: ", len(list_weight_classes))
print("C: ", len(list_of_C_values))
print("gamma: ", len(list_of_gamma_values))


# Pick the parameter names from the steps the chosen pipeline really has.
# Comparing against `pipeline_thresholded_classifier` would raise a NameError
# whenever only the regular pipeline was built.
if "thresholdclassifier" in chosen_pipeline.named_steps:
    grid_parameters = {
        "columnselector__columns": column_combinations,
        "thresholdclassifier__threshold": thresholds,
        "thresholdclassifier__classifier__C": list_of_C_values,
        "thresholdclassifier__classifier__gamma": list_of_gamma_values,
        "thresholdclassifier__classifier__class_weight": list_weight_classes,
    }
else:
    grid_parameters = {
        "columnselector__columns": column_combinations,
        "regularclassifier__C": list_of_C_values,
        # relevant: large gamma values create small regions and vice versa
        "regularclassifier__gamma": list_of_gamma_values,
        "regularclassifier__class_weight": list_weight_classes,
    }
print(chosen_pipeline)

columns:  2
thresholds:  2
weights:  2
C:  2
gamma:  2
Pipeline(steps=[('columnselector',
                 ColumnSelector(columns=['age', 'at_least_one_risk'])),
                ('scaler', StandardScaler()),
                ('thresholdclassifier',
                 ThresholdedClassifier(classifier=SVC(C=0.005,
                                                      class_weight={0: 1,
                                                                    1: 10},
                                                      decision_function_shape='ovo',
                                                      gamma=0.1,
                                                      probability=True,
                                                      random_state=42),
                                       threshold=0.18))])


In [107]:
# Cross-validated grid search over the chosen pipeline: 6 folds, recall and
# precision are both recorded (train and test), and the final model is refit
# with the candidate that has the best recall.
gridsearch = GridSearchCV(
    estimator=chosen_pipeline,
    param_grid=grid_parameters,
    scoring=["recall", "precision"],
    cv=6,
    return_train_score=True,
    refit="recall",
)

In [108]:
# Ignore warnings: from here on every warning is suppressed for the whole
# kernel session.
warnings.filterwarnings(action="ignore")

In [109]:
# Wall-clock timestamp printed right before the grid search is fitted.
search_started_at = datetime.now()
print(search_started_at)

2025-01-17 17:22:30.507161


In [110]:
# Run the cross-validated search on the training data.
gridsearch.fit(X=X_train, y=Y_train)

In [111]:
# Wall-clock timestamp printed right after the grid search has finished.
search_finished_at = datetime.now()
print(search_finished_at)

2025-01-17 17:23:02.375132


In [112]:
############## Extract the results #############
# Best pipeline (refit on the full training data) and the raw CV table.
final_model = gridsearch.best_estimator_
cv_res = pd.DataFrame(gridsearch.cv_results_)
print(cv_res.keys())

# Mean scores that are kept in the summary table.
score_columns = [
    "mean_test_recall",
    "mean_train_recall",
    "mean_test_precision",
    "mean_train_precision",
]
# The names of the parameter columns depend on which pipeline was searched.
# Hard-coding the thresholded names raises a KeyError for the regular pipeline.
if "thresholdclassifier" in final_model.named_steps:
    classifier_prefix = "param_thresholdclassifier__classifier__"
    threshold_columns = ["param_thresholdclassifier__threshold"]
else:
    classifier_prefix = "param_regularclassifier__"
    threshold_columns = []
param_columns = (
    [classifier_prefix + name for name in ("C", "gamma", "class_weight")]
    + threshold_columns
    + ["param_columnselector__columns"]
)
cv_res_important_values = cv_res[score_columns + param_columns]

# Show every row and column of DataFrames from here on (global pandas option).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Best mean test recall first.
ergebnisse_sortiert = cv_res_important_values.sort_values(by="mean_test_recall", ascending=False)
print(grid_search_name)

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_columnselector__columns',
       'param_thresholdclassifier__classifier__C',
       'param_thresholdclassifier__classifier__class_weight',
       'param_thresholdclassifier__classifier__gamma',
       'param_thresholdclassifier__threshold', 'params', 'split0_test_recall',
       'split1_test_recall', 'split2_test_recall', 'split3_test_recall',
       'split4_test_recall', 'split5_test_recall', 'mean_test_recall',
       'std_test_recall', 'rank_test_recall', 'split0_train_recall',
       'split1_train_recall', 'split2_train_recall', 'split3_train_recall',
       'split4_train_recall', 'split5_train_recall', 'mean_train_recall',
       'std_train_recall', 'split0_test_precision', 'split1_test_precision',
       'split2_test_precision', 'split3_test_precision',
       'split4_test_precision', 'split5_test_precision', 'mean_test_precision',
       'std_test_precision', 'rank_test_precision', 'split0

In [113]:
# Show the summary table (sorted by mean test recall, best first) as the cell's rich output.
ergebnisse_sortiert

Unnamed: 0,mean_test_recall,mean_train_recall,mean_test_precision,mean_train_precision,param_thresholdclassifier__classifier__C,param_thresholdclassifier__classifier__gamma,param_thresholdclassifier__classifier__class_weight,param_thresholdclassifier__threshold,param_columnselector__columns
0,1.0,1.0,0.1195,0.119467,0.005,0.5,"{0: 1, 1: 1}",0.02,"[age, at_least_one_risk]"
1,1.0,1.0,0.1195,0.119467,0.005,0.5,"{0: 1, 1: 1}",0.025,"[age, at_least_one_risk]"
2,1.0,1.0,0.1195,0.119467,0.005,1.0,"{0: 1, 1: 1}",0.02,"[age, at_least_one_risk]"
3,1.0,1.0,0.1195,0.119467,0.005,1.0,"{0: 1, 1: 1}",0.025,"[age, at_least_one_risk]"
4,1.0,1.0,0.1195,0.119467,0.005,0.5,"{0: 1, 1: 2}",0.02,"[age, at_least_one_risk]"
5,1.0,1.0,0.1195,0.119467,0.005,0.5,"{0: 1, 1: 2}",0.025,"[age, at_least_one_risk]"
9,1.0,1.0,0.1195,0.119467,1.005,0.5,"{0: 1, 1: 1}",0.025,"[age, at_least_one_risk]"
8,1.0,1.0,0.1195,0.119467,1.005,0.5,"{0: 1, 1: 1}",0.02,"[age, at_least_one_risk]"
12,1.0,1.0,0.1195,0.119467,1.005,0.5,"{0: 1, 1: 2}",0.02,"[age, at_least_one_risk]"
13,1.0,1.0,0.1195,0.119467,1.005,0.5,"{0: 1, 1: 2}",0.025,"[age, at_least_one_risk]"


In [114]:
############ Persist the results of the currently selected grid search ###########
# Bundle the sorted summary table, the refit best model and the searched grid.
new_grid_search = dict(
    Ergebnisse=ergebnisse_sortiert,
    final_model=final_model,
    grid_parameters=grid_parameters,
)
# Written to the file name chosen together with the grid scope; an existing
# file of that name is overwritten.
with open(grid_search_name, 'wb') as data:
    pickle.dump(new_grid_search, data, protocol=pickle.HIGHEST_PROTOCOL)