# Packages


In [None]:
import os
import datetime

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split,ParameterGrid
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder


from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import(
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier
)
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score,ConfusionMatrixDisplay,roc_auc_score,RocCurveDisplay,precision_score

from imblearn.over_sampling import RandomOverSampler,SMOTENC

import mlflow
import mlflow.sklearn
import os
import datetime
import pickle

# Functions

In [None]:
def split_in_feature(dataset,target):
    """
    Separate a dataset into its feature matrix and its target column.

    Returns a tuple ``(X, y)``: ``X`` is ``dataset`` without the ``target``
    column and ``y`` is the ``target`` column itself. ``dataset`` is not
    modified.
    """
    labels = dataset[target]
    features = dataset.drop(columns=target)
    return features, labels

def split_dataset(X,y,test_size=0.2,val_size=0.5,random_state=180):
    """
    Split features and target into test, fit (train) and validation subsets.

    The data is first split into a test hold-out and a remainder; the
    remainder is then split into the fit and validation subsets. Both splits
    are stratified on the target.

    Parameters
    ----------
    X, y :
        Features and target to split.
    test_size : float, default 0.2
        Fraction of the full data held out for testing.
    val_size : float, default 0.5
        Fraction of the remaining (non-test) data used for validation.
        With the defaults the result is 20% test, 40% fit, 40% validation.
    random_state : int, default 180
        Seed used for both splits.

    Returns
    -------
    tuple
        ``(X_to_test, X_to_fit, X_to_val, y_to_test, y_to_fit, y_to_val)``
    """
    X_train,X_to_test,y_train,y_to_test=train_test_split(
        X,y,test_size=test_size,stratify=y,random_state=random_state
    )
    X_to_fit,X_to_val,y_to_fit,y_to_val=train_test_split(
        X_train,y_train,test_size=val_size,stratify=y_train,random_state=random_state
    )

    return X_to_test,X_to_fit,X_to_val,y_to_test,y_to_fit,y_to_val

def preprocessing_features(X_to_fit):
    """
    Build the feature preprocessor and fit it on ``X_to_fit``.

    Columns of dtype ``object`` are treated as categorical (most-frequent
    imputation followed by one-hot encoding); the ``age`` column is treated as
    numerical (median imputation followed by scaling without centering).
    Columns in neither group are not routed to any transformer here, so with
    ColumnTransformer's default ``remainder`` setting they are dropped.

    Returns the fitted ``ColumnTransformer``.
    """
    categorical_columns = X_to_fit.select_dtypes(include=['object']).columns.to_list()
    numerical_columns = ['age']

    num_pipeline = Pipeline(
        steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler(with_mean=False)),
        ]
    )

    cat_pipeline = Pipeline(
        steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            # The encoder is fitted on the training split only; a category that
            # shows up solely in validation/test data must not abort transform(),
            # so unknown categories are encoded as all-zero columns.
            ('one_hot_enconder', OneHotEncoder(handle_unknown='ignore')),
        ]
    )

    preprocessor = ColumnTransformer(
        [
            ('num_pipeline', num_pipeline, numerical_columns),
            ('cat_pipeline', cat_pipeline, categorical_columns),
        ]
    )

    preprocessor.fit(X_to_fit)
    return preprocessor

def preprocessing_target(y_to_fit):
    """
    Fit a label encoder on the target values.

    The classes learned by the encoder are printed, and the fitted encoder is
    returned so the caller can ``transform`` the other target splits with it.
    """
    encoder = LabelEncoder()
    encoder.fit(y_to_fit)
    print(encoder.classes_)
    return encoder

def _positive_class_scores(model,X):
    """
    Return continuous scores for the positive class (label 1) of a fitted
    binary classifier.

    ``predict_proba`` (second column) is preferred, then ``decision_function``;
    the hard predictions are used only when the model exposes neither.
    """
    if hasattr(model,"predict_proba"):
        return model.predict_proba(X)[:,1]
    if hasattr(model,"decision_function"):
        return model.decision_function(X)
    return model.predict(X)


def _log_split_evaluation(model,model_name,loop,output_dir,X,y_true,*,
                          split_title,accuracy_metric,report_tag,file_tag,cmap):
    """
    Evaluate a fitted model on one data split and log the results to the
    active MLflow run.

    Logs accuracy, per-class precision and ROC AUC as metrics, and the
    classification report (CSV), ROC curve (PNG) and confusion matrix (PNG) as
    artifacts written under ``output_dir``. The naming arguments reproduce the
    metric and file names used for the validation and test splits.
    """
    predictions=model.predict(X)
    accuracy=accuracy_score(y_true,predictions)
    mlflow.log_metric(accuracy_metric,accuracy)
    print(f"{split_title} Dataset Accuracy: {accuracy:0.2f}")

    report=classification_report(y_true,predictions,output_dict=True)
    artifact_path=os.path.join(output_dir,f'report_{report_tag}_{loop}_{model_name}.csv')
    pd.DataFrame(report).transpose().to_csv(artifact_path)
    mlflow.log_artifact(artifact_path)

    for label in (0,1):
        mlflow.log_metric(
            f"{split_title} Precision Score {label}",
            precision_score(y_true,predictions,average='binary',pos_label=label),
        )

    # ROC/AUC need a ranking score; thresholded labels collapse the curve to a
    # single operating point.
    scores=_positive_class_scores(model,X)
    mlflow.log_metric(f"{split_title} AUC Score",roc_auc_score(y_true,scores))

    roc_fig,roc_ax=plt.subplots()
    cm_fig,cm_ax=plt.subplots(figsize=(8, 6))
    try:
        # Draw once on an explicit axes so the saved image is the titled one.
        RocCurveDisplay.from_predictions(y_true,scores,ax=roc_ax)
        roc_ax.set_title(split_title)
        artifact_path=os.path.join(output_dir,f"{model_name}_{loop}_{file_tag}_roc_curve.png")
        roc_fig.savefig(artifact_path)
        mlflow.log_artifact(artifact_path)

        cm=confusion_matrix(y_true,predictions,labels=model.classes_)
        disp=ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=model.classes_)
        disp.plot(ax=cm_ax,values_format='d',cmap=cmap)
        cm_ax.set_title(f'{model_name}: {split_title}')
        artifact_path=os.path.join(output_dir,f"{model_name}_{loop}_{file_tag}_conf_matrix.png")
        cm_fig.savefig(artifact_path)
        mlflow.log_artifact(artifact_path)

        plt.show()
    finally:
        # Release the figures so a large grid does not accumulate open figures.
        plt.close(roc_fig)
        plt.close(cm_fig)


def mlflow_tracking_training(experimental_name,X_test,X_fit,X_val,y_test,y_fit,y_val):
    """
    Train every configured model over its parameter grid and track each
    combination as an MLflow run.

    Reads the module-level ``models`` (name -> estimator) and ``params``
    (name -> parameter grid) mappings. For each parameter combination the
    estimator is configured in place, fitted on ``X_fit``/``y_fit``, logged as
    an MLflow model, and evaluated on the validation and test splits. Targets
    are expected to be binary and encoded as 0/1 (precision is logged for
    ``pos_label`` 0 and 1).

    Artifacts are written to a new folder named
    ``<experimental_name>_<timestamp>``, whose path is also exported through
    the ``MLFLOW_ARTIFACT_ROOT`` environment variable.

    A failure in one combination is printed and the grid continues with the
    next combination; nothing is returned.
    """
    caminho_pasta=f'{experimental_name}_{datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}'
    os.makedirs(caminho_pasta, exist_ok=True)
    os.environ["MLFLOW_ARTIFACT_ROOT"] = caminho_pasta
    print(caminho_pasta)
    mlflow.set_experiment(experimental_name)

    for model_name,model in models.items():
        print(f'--------------------Trainning {model_name} Model-------------------------------')
        param_grid=ParameterGrid(params.get(model_name, {}))
        # `loop` numbers the combinations from 1 and is part of the artifact names.
        for loop,params_combination in enumerate(param_grid,start=1):
            try:
                with mlflow.start_run():
                    mlflow.log_param("model", model_name)
                    mlflow.log_params(params_combination)
                    model.set_params(**params_combination)
                    model.fit(X_fit,y_fit)
                    train_acc=model.score(X_fit,y_fit)
                    mlflow.sklearn.log_model(model,f"{model_name}")
                    print(f'{model_name} Model Train Accuracy: {train_acc:.2f}')
                    mlflow.log_metric("train_acc", train_acc)

                    print(f'--------------------Vailidating {model_name} Model-------------------------------')
                    _log_split_evaluation(
                        model,model_name,loop,caminho_pasta,X_val,y_val,
                        split_title="Validation",
                        accuracy_metric="Validation Accuracy Score",
                        report_tag="validation",
                        file_tag="validation",
                        cmap='GnBu',
                    )

                    print(f'--------------------Testing {model_name} Model-------------------------------')
                    _log_split_evaluation(
                        model,model_name,loop,caminho_pasta,X_test,y_test,
                        split_title="Test",
                        accuracy_metric="Testing Accuracy Score",
                        report_tag="testing",
                        file_tag="test",
                        cmap='BuPu',
                    )

                    log_print=f'Finish {model_name} with prameter: {params_combination}'
                    print(log_print)
                    # The `with` block ends the run; no explicit end_run() needed.

            except Exception as e:
                # Deliberate best-effort: an invalid combination must not stop
                # the rest of the grid, so report it and move on.
                log_print=f'Error in{model_name} with prameter: {params_combination}: {str(e)}'
                print(log_print)

                
                                                      
                   

# Models

In [None]:
from sklearn.base import clone

# Base estimators, keyed by the name used for grid lookup and MLflow logging.
models = {
    "Random_Forest": RandomForestClassifier(random_state=42),
    "Decision_Tree": DecisionTreeClassifier(random_state=42),
    "Gradient_Boosting": GradientBoostingClassifier(random_state=42),
    "Logistic_Regression": LogisticRegression(random_state=42),
    "XGBClassifier": XGBClassifier(),
    "AdaBoost_Classifier": AdaBoostClassifier(random_state=42,),
    "SVM_Classifier": svm.SVC(random_state=42,),
}

# Parameter grids, keyed by model name; a model without an entry (or with an
# empty one) is trained once with its current parameters.
params = {
    "Decision_Tree": {
        'criterion': ['gini', 'log_loss', 'entropy'],
        #'max_features':['auto', 'sqrt', 'log2']
    },
    "Random_Forest": {
        #'n_estimators': [8,16,32,64,128,256],
        'criterion': ['gini', 'entropy', 'log_loss'],
        #'max_features':['sqrt','log2']
    },
    "Gradient_Boosting": {
        'learning_rate': [.1, .01, .05, .001],
        #'subsample':[0.6,0.7,0.75,0.8,0.85,0.9],
        #'n_estimators': [8,16,32,64,128,256]
    },
    "Logistic_Regression": {},
    "XGBClassifier": {
        'learning_rate': [.1, .01, .05, .001],
        #'n_estimators': [8,16,32,64,128,256]
    },
    # No "CatBoosting_Classifier" model is registered in `models` here, so this
    # grid is currently unused. Every grid value must be a sequence, hence [42].
    "CatBoosting_Classifier": {
        'random_state': [42],
        'depth': [6, 8, 10],
        'learning_rate': [0.01, 0.05, 0.1],
        'iterations': [30, 50, 100],
    },
    "AdaBoost_Classifier": {
        'learning_rate': [.1, .01, 0.5, .001],
        'n_estimators': [8, 16, 32, 64, 128, 256],
    },
    "SVM_Classifier": {
        'C': [1, 2, 5, 10],
    },
}


svm_linear = svm.LinearSVC(random_state=42)
# Give the ensemble its own copies of the base estimators: the grid loops call
# set_params() on the instances stored in `models`, and sharing them would make
# the stack silently inherit whatever hyper-parameters were tried last.
stacking_estimators = [(name, clone(estimator)) for name, estimator in models.items()]
stacking_model = StackingClassifier(estimators=stacking_estimators,final_estimator=svm_linear,cv=10,stack_method='auto')
models["Stacking_Classifier"] = stacking_model


In [None]:
def _auc_scores(fitted_model,features):
    """
    Return continuous positive-class (label 1) scores for ROC/AUC: predicted
    probabilities when available, otherwise decision values, otherwise the
    hard predictions.
    """
    if hasattr(fitted_model,"predict_proba"):
        return fitted_model.predict_proba(features)[:,1]
    if hasattr(fitted_model,"decision_function"):
        return fitted_model.decision_function(features)
    return fitted_model.predict(features)


# Same train/validate/test procedure as mlflow_tracking_training, but only
# printing and plotting the results instead of logging them to MLflow.
for model_name,model in models.items():
    print(f'--------------------Trainning {model_name} Model-------------------------------')
    for params_combination in ParameterGrid(params.get(model_name, {})):
        model.set_params(**params_combination)
        model.fit(X_fit,y_fit)
        train_acc=model.score(X_fit,y_fit)
        print(f'{model_name} Model Train Accuracy: {train_acc:.2f}')

        print(f'--------------------Vailidating {model_name} Model-------------------------------')
        pred_val=model.predict(X_val)
        score_val=_auc_scores(model,X_val)
        acc_score_validation=accuracy_score(y_val, pred_val)
        print(f"Validation Dataset Accuracy: {acc_score_validation:0.2f}")
        print(classification_report(y_val, pred_val))
        # AUC is computed on ranking scores, not on thresholded labels.
        auc_score_valid=roc_auc_score(y_val,score_val)
        print("Validation AUC Score",auc_score_valid)

        # One titled ROC figure per split (drawn once on an explicit axes).
        fig, ax = plt.subplots()
        RocCurveDisplay.from_predictions(y_val,score_val,ax=ax)
        ax.set_title("Validation")

        cm=confusion_matrix(y_val,pred_val,labels=model.classes_)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=model.classes_)
        fig, ax = plt.subplots(figsize=(8, 6))
        disp.plot(ax=ax, values_format='d', cmap='GnBu')
        ax.set_title(f'{model_name}: Validation')

        print("Validation Precision Score 0",precision_score(y_val, pred_val, average='binary',pos_label=0))
        print("Validation Precision Score 1",precision_score(y_val, pred_val, average='binary',pos_label=1))

        print(f'--------------------Testing {model_name} Model-------------------------------')
        pred_test=model.predict(X_test)
        score_test=_auc_scores(model,X_test)
        acc_score_test=accuracy_score(y_test, pred_test)
        print(f"Test Dataset Accuracy: {acc_score_test:0.2f}")
        print(classification_report(y_test, pred_test))

        cm=confusion_matrix(y_test,pred_test,labels=model.classes_)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=model.classes_)
        fig, ax = plt.subplots(figsize=(8, 6))
        disp.plot(ax=ax, values_format='d', cmap='BuPu')
        ax.set_title(f'{model_name}: Test')

        print("Test Precision Score 0",precision_score(y_test, pred_test, average='binary',pos_label=0))
        print("Test Precision Score 1",precision_score(y_test, pred_test, average='binary',pos_label=1))

        auc_score_test = roc_auc_score(y_test,score_test)
        print("Test AUC Score",auc_score_test)

        fig, ax = plt.subplots()
        RocCurveDisplay.from_predictions(y_test,score_test,ax=ax)
        ax.set_title("Test")

        # Render this combination's figures before moving on to the next one.
        plt.show()
                    
                

# Without Balance

In [None]:
# Load the cleaned dataset from CSV (path is relative to the working directory).
df=pd.read_csv('../data/cancer_data_cleaned.csv')


In [None]:
# Inspect the column names of the loaded dataset.
df.columns

In [None]:
# Separate the features (X) from the 'lung_cancer' target column (y).
X,y=split_in_feature(df,'lung_cancer')

In [None]:
# Stratified split into test, fit (train) and validation subsets.
X_to_test,X_to_fit,X_to_val,y_to_test,y_to_fit,y_to_val = split_dataset(X,y)

In [None]:
# Rebuild the feature/target split and the three data partitions.
X, y = split_in_feature(df, 'lung_cancer')
X_to_test, X_to_fit, X_to_val, y_to_test, y_to_fit, y_to_val = split_dataset(X, y)

# Fit the label encoder on the training target only, then encode every split.
y_labels = preprocessing_target(y_to_fit)
y_test, y_fit, y_val = (
    y_labels.transform(target_part)
    for target_part in (y_to_test, y_to_fit, y_to_val)
)
y_fit


In [None]:
# Fit the feature preprocessor on the training features and apply it to all splits.
preprocessor = preprocessing_features(X_to_fit)
X_fit, X_test, X_val = (
    preprocessor.transform(feature_part)
    for feature_part in (X_to_fit, X_to_test, X_to_val)
)

# Fit the label encoder on the training target and encode all splits.
y_labels = preprocessing_target(y_to_fit)
y_test, y_fit, y_val = (
    y_labels.transform(target_part)
    for target_part in (y_to_test, y_to_fit, y_to_val)
)
y_fit

In [None]:
# Train and track all models on the non-resampled data under the "Imbalance_Models" experiment.
mlflow_tracking_training("Imbalance_Models",X_test,X_fit,X_val,y_test,y_fit,y_val)

# With Balance

In [None]:
from imblearn.over_sampling import SMOTE

# Split BEFORE oversampling. Resampling the full dataset first would put
# synthetic rows (interpolated from rows that later land in validation/test)
# into the training data and would also evaluate on synthetic samples, which
# inflates every reported metric.
X_to_test,X_to_fit,X_to_val,y_to_test,y_to_fit,y_to_val = split_dataset(X,y)

# Oversample the minority class in the fit partition only; validation and test
# keep their original class distribution.
ros = SMOTE(random_state=42)
X_ros, y_ros = ros.fit_resample(X_to_fit, y_to_fit)

# Fit the preprocessor on the resampled training features, then apply it to all splits.
preprocessor=preprocessing_features(X_ros)
X_fit=preprocessor.transform(X_ros)
X_test=preprocessor.transform(X_to_test)
X_val=preprocessor.transform(X_to_val)

# Fit the label encoder on the resampled training target, then encode all splits.
y_labels=preprocessing_target(y_ros)
y_fit=y_labels.transform(y_ros)
y_test=y_labels.transform(y_to_test)
y_val=y_labels.transform(y_to_val)

In [None]:
# Show the number of rows in the resampled features and target.
len(X_ros),len(y_ros)

In [None]:
# Train and track all models on the SMOTE-prepared splits under their own experiment name.
mlflow_tracking_training("Balance_SMOTE_Models_with_parameters",X_test,X_fit,X_val,y_test,y_fit,y_val)

# Balance with SMOTENC

In [None]:
from sklearn.base import clone

# Fresh base estimators for the SMOTENC experiment, keyed by model name.
models = {
    "Random_Forest": RandomForestClassifier(random_state=42),
    "Decision_Tree": DecisionTreeClassifier(random_state=42),
    "Gradient_Boosting": GradientBoostingClassifier(random_state=42),
    "Logistic_Regression": LogisticRegression(random_state=42),
    "XGBClassifier": XGBClassifier(),
    "AdaBoost_Classifier": AdaBoostClassifier(random_state=42,),
    "SVM_Classifier": svm.SVC(random_state=42,),
}

# Parameter grids, keyed by model name. A value may be a single grid (dict) or
# a list of grids; ParameterGrid accepts both.
params = {
    "Decision_Tree": {
        'criterion': ['gini', 'log_loss', 'entropy'],
        # 'auto' was dropped: for decision trees it is a deprecated alias of
        # 'sqrt' (rejected by recent scikit-learn releases), so it only
        # duplicated the 'sqrt' runs or made them fail.
        'max_features': ['sqrt', 'log2'],
        'max_depth': [3, 5, 10],
    },
    "Random_Forest": {
        'n_estimators': [8, 16, 32],
        'criterion': ['gini', 'entropy', 'log_loss'],
        'max_depth': [3, 5, 10],
    },
    "Gradient_Boosting": {
        'learning_rate': [.1, .01, .05, .001],
        'n_estimators': [8, 16, 32],
        'loss': ['log_loss'],
        'max_depth': [3, 5, 10],
    },
    # Each penalty is paired with a solver that supports it. With the default
    # 'lbfgs' solver the 'l1' and 'elasticnet' combinations always failed, and
    # 'elasticnet' additionally needs an l1_ratio. The solver is set explicitly
    # in every sub-grid because set_params() persists between combinations;
    # the elasticnet grid comes last so l1_ratio is not set for earlier runs.
    "Logistic_Regression": [
        {'penalty': ['l2', None], 'solver': ['lbfgs'], 'C': [1, 2, 5]},
        {'penalty': ['l1'], 'solver': ['liblinear'], 'C': [1, 2, 5]},
        {'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.5], 'C': [1, 2, 5]},
    ],
    "XGBClassifier": {
        'learning_rate': [.1, .01, .05, .001],
        'n_estimators': [8, 16, 32],
    },
    "AdaBoost_Classifier": {
        'learning_rate': [.1, .01, 0.5, .001],
        'n_estimators': [8, 16, 32],
    },
    "SVM_Classifier": {
        'C': [1, 2, 5],
        # 'precomputed' was dropped: it expects a square kernel matrix as
        # input, not a feature matrix, so those combinations could never fit.
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    },
}


svm_linear = svm.LinearSVC(random_state=42)
# Give the ensemble its own copies of the base estimators: the grid loops call
# set_params() on the instances stored in `models`, and sharing them would make
# the stack silently inherit whatever hyper-parameters were tried last.
stacking_estimators = [(name, clone(estimator)) for name, estimator in models.items()]
stacking_model = StackingClassifier(estimators=stacking_estimators,final_estimator=svm_linear,cv=10,stack_method='auto')
models["Stacking_Classifier"] = stacking_model


In [None]:
# Feature columns passed to SMOTENC as `categorical_features` in the next cell.
categorical_columns = ['gender', 'smoking', 'yellow_fingers',
                                  'anxiety', 'peer_pressure', 'chronic_disease',
                                  'fatigue', 'allergy', 'wheezing',
                                  'alcohol_consuming', 'coughing',
                                  'shortness_of_breath',
                                  'swallowing_difficulty', 'chest_pain']

In [None]:
# Reload the data and rebuild the feature/target split.
df = pd.read_csv('../data/cancer_data_cleaned.csv')
X, y = split_in_feature(df, 'lung_cancer')

# Partition first, so that only the fit partition is oversampled.
X_to_test, X_to_fit, X_to_val, y_to_test, y_to_fit, y_to_val = split_dataset(X, y)

ros = SMOTENC(categorical_features=categorical_columns, random_state=42)
X_ros, y_ros = ros.fit_resample(X_to_fit, y_to_fit)

# Fit the preprocessor on the resampled training features; transform all splits.
preprocessor = preprocessing_features(X_ros)
X_fit, X_test, X_val = (
    preprocessor.transform(feature_part)
    for feature_part in (X_ros, X_to_test, X_to_val)
)

# Fit the label encoder on the resampled training target; encode all splits.
y_labels = preprocessing_target(y_ros)
y_fit, y_test, y_val = (
    y_labels.transform(target_part)
    for target_part in (y_ros, y_to_test, y_to_val)
)
y_labels.inverse_transform([0,0])

In [None]:
# Persist the fitted preprocessor; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open('../preprocessor.pkl','wb') as preprocessor_file:
    pickle.dump(preprocessor,preprocessor_file)


In [None]:
# Train and track all models on the SMOTENC-balanced training data under the "SMOTNC_Balance" experiment.
mlflow_tracking_training("SMOTNC_Balance",X_test,X_fit,X_val,y_test,y_fit,y_val)