In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the raw data set; the path is relative to the notebook's working directory.
df=pd.read_csv('../data/Maternal Health Risk Data Set.csv')

# Modeling

## Split

In [None]:
# Features are every column except 'RiskLevel'; 'RiskLevel' itself is the target.
X = df.drop(columns='RiskLevel')
y = df['RiskLevel']

In [None]:
# Display the names of the feature columns.
X.columns

In [None]:
from sklearn.model_selection import train_test_split
def split_dataset(X,y,test_size=0.1,val_size=0.1,random_state=180):
    """Split the data into stratified test, fit and validation parts.

    The data is first split into a training portion and a held-out test
    portion. The training portion is then split again into the part used to
    fit models and a validation part. Both splits are stratified on the
    target and use the same seed.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split``.
    y : target values aligned with ``X``; also used for stratification.
    test_size : ``test_size`` of the first split (default 0.1).
    val_size : ``test_size`` of the second split, i.e. taken from the
        training portion that remains after the first split (default 0.1).
    random_state : seed passed to both splits (default 180).

    Returns
    -------
    tuple
        ``(X_to_test, X_to_fit, X_to_val, y_to_test, y_to_fit, y_to_val)``.
        Note the order: the test part comes first.
    """
    X_train, X_to_test, y_train, y_to_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    X_to_fit, X_to_val, y_to_fit, y_to_val = train_test_split(
        X_train, y_train, test_size=val_size, stratify=y_train, random_state=random_state
    )

    return X_to_test, X_to_fit, X_to_val, y_to_test, y_to_fit, y_to_val

## Preprocessing

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler

def preprocessing_features(X_to_fit):
    """Build the feature preprocessor and fit it on the fitting split.

    Numerical columns are median-imputed and scaled without centering;
    categorical columns are imputed with the most frequent value, one-hot
    encoded and scaled without centering.

    Parameters
    ----------
    X_to_fit : pandas.DataFrame
        Feature table of the split used for fitting. Column dtypes decide
        which pipeline each column goes through.

    Returns
    -------
    ColumnTransformer
        The preprocessor, already fitted on ``X_to_fit``.
    """
    categorical_columns = X_to_fit.select_dtypes(include=['object', 'category']).columns.to_list()
    # 'number' selects integer as well as float columns. Selecting only
    # 'float64' left integer features out of both lists, and the
    # ColumnTransformer (default remainder='drop') then discarded them silently.
    numerical_columns = X_to_fit.select_dtypes(include=['number']).columns.to_list()

    num_pipeline = Pipeline(
        steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler(with_mean=False)),
        ]
    )

    cat_pipeline = Pipeline(
        steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            # handle_unknown='ignore': a category that only appears in the
            # validation/test data is encoded as all zeros instead of raising.
            ('one_hot_enconder', OneHotEncoder(handle_unknown='ignore')),
            ('scaler', StandardScaler(with_mean=False)),
        ]
    )

    preprocessor = ColumnTransformer(
        [
            ('num_pipeline', num_pipeline, numerical_columns),
            ('cat_pipeline', cat_pipeline, categorical_columns),
        ]
    )

    preprocessor.fit(X_to_fit)
    return preprocessor

In [None]:
from sklearn.preprocessing import LabelEncoder

def preprocessing_target(y_to_fit):
    """Fit a label encoder on the target of the fitting split.

    The learned classes are printed so the label-to-integer mapping is
    visible in the notebook output.

    Parameters
    ----------
    y_to_fit : target values of the split used for fitting.

    Returns
    -------
    LabelEncoder
        The encoder fitted on ``y_to_fit``.
    """
    encoder = LabelEncoder()
    encoder.fit(y_to_fit)
    print(encoder.classes_)
    return encoder


# NOTE: the test/validation transforms that used to sit in this cell were
# removed. They used `preprocessor`, `y_labels`, `X_to_test` and `X_to_val`
# before any cell had created them, so the cell raised NameError on a fresh
# kernel (Restart & Run All), and they passed 2-D arrays to the label encoder.
# The same transforms are done in the experiment sections further down, after
# the data has been split and the preprocessors have been fitted.


## Training

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import(
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier
)
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score,ConfusionMatrixDisplay
import matplotlib.pyplot as plt

import warnings
# Suppress FutureWarning messages so they do not clutter the training output.
warnings.simplefilter(action='ignore', category=FutureWarning)


In [None]:
# Initial model registry: display name -> unfitted estimator with default settings.
models = dict(
    Random_Forest=RandomForestClassifier(),
    Decision_Tree=DecisionTreeClassifier(),
    Gradient_Boosting=GradientBoostingClassifier(),
    Logistic_Regression=LogisticRegression(),
    XGBClassifier=XGBClassifier(),
    CatBoosting_Classifier=CatBoostClassifier(),
    AdaBoost_Classifier=AdaBoostClassifier(),
)

# The SVC is registered before the stack is built, so it is one of the base estimators.
svm_model = svm.SVC(random_state=42)
svm_modelo_nu = svm.NuSVC(random_state=42)
models.update(SVM_Classifier=svm_model)

# Stack every model registered so far, with the NuSVC as final estimator.
stacking_model = StackingClassifier(
    estimators=[*models.items()],
    final_estimator=svm_modelo_nu,
    cv=10,
    stack_method='auto',
)
models.update(Stacking_Classifier=stacking_model)


In [None]:
import mlflow
import mlflow.sklearn
import os
import datetime

def _log_split_evaluation(model, model_name, X, y, output_dir, *,
                          metric_label, display_label, report_tag, figure_tag, cmap):
    """Score a fitted model on one data split and log the results to the active MLflow run.

    Logs the accuracy, the classification report (CSV artifact), the
    confusion-matrix figure (PNG artifact) and one precision metric per class.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` and ``classes_``.
    model_name : name used in artifact file names and the figure title.
    X, y : features and encoded targets of the split being evaluated.
    output_dir : existing directory where artifact files are written.
    metric_label : prefix of the MLflow metric names.
    display_label : split name used in the printed message and figure title.
    report_tag : split name used in the report file name.
    figure_tag : split name used in the figure file name.
    cmap : matplotlib colormap name for the confusion matrix.
    """
    predictions = model.predict(X)
    accuracy = accuracy_score(y, predictions)
    mlflow.log_metric(f"{metric_label} Accuracy Score", accuracy)
    print(f"{display_label} Dataset Accuracy: {accuracy:0.2f}")

    report = classification_report(y, predictions, output_dict=True)
    report_path = os.path.join(output_dir, f'report_{report_tag}_{model_name}.csv')
    pd.DataFrame(report).transpose().to_csv(report_path)
    mlflow.log_artifact(report_path)

    cm = confusion_matrix(y, predictions, labels=model.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
    fig, ax = plt.subplots(figsize=(8, 6))
    disp.plot(ax=ax, values_format='d', cmap=cmap)
    ax.set_title(f'{model_name}: {display_label}')

    figure_path = os.path.join(output_dir, f"{model_name}_{figure_tag}_conf_matrix.png")
    fig.savefig(figure_path)
    mlflow.log_artifact(figure_path)
    # Show the figure in the notebook, then release it: two figures are
    # created per model and would otherwise all stay open.
    plt.show()
    plt.close(fig)

    # Rows of the confusion matrix are true labels and columns are predicted
    # labels, so precision is the diagonal divided by the COLUMN totals
    # (dividing by row totals, as before, yields recall).
    predicted_per_class = cm.sum(axis=0)
    for class_idx, predicted in enumerate(predicted_per_class):
        # A class that was never predicted has no defined precision; report 0.0.
        precision = float(cm[class_idx, class_idx] / predicted) if predicted else 0.0
        mlflow.log_metric(f"{metric_label} Precision Score {class_idx}", precision)
        print(f'Precisão para a classe {class_idx}: {precision}')


def mlflow_tracking_training(experimental_name,X_test,X_fit,X_val,y_test,y_fit,y_val,model_registry=None):
    """Fit every registered model and track training, validation and test results in MLflow.

    Each model gets its own MLflow run (named after the model) inside the
    experiment ``experimental_name``. For every model the function fits it on
    the fit split, logs the fitted model and its training accuracy, and then
    logs accuracy, classification report, confusion matrix and per-class
    precision for the validation and test splits.

    Parameters
    ----------
    experimental_name : MLflow experiment name; also the prefix of the
        timestamped local folder that receives the artifact files.
    X_test, X_fit, X_val : preprocessed feature matrices of the three splits.
    y_test, y_fit, y_val : encoded targets of the three splits.
    model_registry : optional mapping ``name -> estimator``. When omitted the
        notebook-level ``models`` dict is used, as before.

    Returns
    -------
    None
    """
    if model_registry is None:
        # Backward-compatible default: the registry defined at notebook level.
        model_registry = models

    caminho_pasta=f'{experimental_name}_{datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}'
    os.makedirs(caminho_pasta, exist_ok=True)
    # NOTE(review): kept from the original; confirm that MLflow actually reads this variable.
    os.environ["MLFLOW_ARTIFACT_ROOT"] = caminho_pasta
    print(caminho_pasta)
    mlflow.set_experiment(experimental_name)

    for model_name, model in model_registry.items():
        # One run per model, so metrics that share a key (e.g. "train_acc")
        # are not mixed between models. Previously a single run was opened
        # and then ended inside the loop after the first model.
        with mlflow.start_run(run_name=model_name):
            print(f'--------------------Trainning {model_name} Model-------------------------------')

            model.fit(X_fit,y_fit)
            # Log the model only after fitting, so the stored artifact is usable for prediction.
            mlflow.sklearn.log_model(model,f"{model_name}")
            train_acc=model.score(X_fit,y_fit)
            print(f'{model_name} Model Train Accuracy: {train_acc:.2f}')
            mlflow.log_metric("train_acc", train_acc)

            print(f'--------------------Vailidating {model_name} Model-------------------------------')
            _log_split_evaluation(
                model, model_name, X_val, y_val, caminho_pasta,
                metric_label="Validation", display_label="Validation",
                report_tag="validation", figure_tag="validation", cmap='GnBu',
            )

            print(f'--------------------Testing {model_name} Model-------------------------------')
            # The test report is now computed from the test split; it used to
            # be built from the validation labels and predictions.
            _log_split_evaluation(
                model, model_name, X_test, y_test, caminho_pasta,
                metric_label="Testing", display_label="Test",
                report_tag="testing", figure_tag="test", cmap='BuPu',
            )

        
        


    

# MLflow

## No Balance

In [None]:
# Split the original (imbalanced) data into test / fit / validation parts.
X_to_test, X_to_fit, X_to_val, y_to_test, y_to_fit, y_to_val = split_dataset(X, y)

# Fit the feature preprocessor on the fit split only, then apply it to every split.
preprocessor = preprocessing_features(X_to_fit)
X_fit, X_test, X_val = (
    preprocessor.transform(part) for part in (X_to_fit, X_to_test, X_to_val)
)

# Same for the target: the encoder is fitted on the fit split and reused for the others.
y_labels = preprocessing_target(y_to_fit)
y_test, y_fit, y_val = (
    y_labels.transform(part) for part in (y_to_test, y_to_fit, y_to_val)
)
y_fit

In [None]:
# Model registry for the run on the imbalanced data: name -> unfitted estimator.
# Every estimator except XGBClassifier is given random_state=42.
models = dict(
    Random_Forest=RandomForestClassifier(random_state=42),
    Decision_Tree=DecisionTreeClassifier(random_state=42),
    Gradient_Boosting=GradientBoostingClassifier(random_state=42),
    Logistic_Regression=LogisticRegression(random_state=42),
    XGBClassifier=XGBClassifier(),
    CatBoosting_Classifier=CatBoostClassifier(random_state=42),
    AdaBoost_Classifier=AdaBoostClassifier(random_state=42),
    SVM_Classifier=svm.SVC(random_state=42),
)

# Stack all models above, with a NuSVC as final estimator, and register the stack too.
svm_modelo_nu = svm.NuSVC(random_state=42)
stacking_model = StackingClassifier(
    estimators=[*models.items()],
    final_estimator=svm_modelo_nu,
    cv=10,
    stack_method='auto',
)
models.update(Stacking_Classifier=stacking_model)


In [None]:
# Train and track every registered model on the imbalanced data.
mlflow_tracking_training(
    experimental_name="Imbalance_Models",
    X_test=X_test,
    X_fit=X_fit,
    X_val=X_val,
    y_test=y_test,
    y_fit=y_fit,
    y_val=y_val,
)

In [None]:
# End the MLflow run that is currently active, if any.
# NOTE(review): presumably a safeguard for a run left open by an interrupted training cell — confirm.
mlflow.end_run()

## Imbalance with Weights

In [None]:
# Per-class weights, keyed by the integer codes produced by the label encoder.
# NOTE(review): which risk level is code 0 depends on the encoder's class
# order — confirm against the classes printed when the encoder was fitted.
class_weights = {0: 0.9, 1: 0.05, 2: 0.05}

# Same model families as before; the weights are passed to the estimators
# that are constructed with a class-weight argument here.
models = {
    "Random Forest": RandomForestClassifier(random_state=42, class_weight=class_weights),
    "Decision Tree": DecisionTreeClassifier(random_state=42, class_weight=class_weights),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42, class_weight=class_weights),
    "XGBClassifier": XGBClassifier(),
    "CatBoosting Classifier": CatBoostClassifier(random_state=42, class_weights=class_weights),
    "AdaBoost Classifier": AdaBoostClassifier(random_state=42),
    "SVM Classifier": svm.SVC(class_weight=class_weights, random_state=42),
}

# Weighted NuSVC as final estimator of the stack built from all models above.
svm_modelo_nu = svm.NuSVC(class_weight=class_weights, random_state=42)
stacking_model = StackingClassifier(
    estimators=list(models.items()),
    final_estimator=svm_modelo_nu,
    cv=10,
    stack_method='auto',
)
models.update({"Stacking Classifier": stacking_model})


In [None]:
# Train and track the class-weighted models on the same splits.
mlflow_tracking_training(
    experimental_name="Imbalance_Models_with_weights",
    X_test=X_test,
    X_fit=X_fit,
    X_val=X_val,
    y_test=y_test,
    y_fit=y_fit,
    y_val=y_val,
)

## Balance Data

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample with SMOTE (seeded for reproducibility), applied to the full data set.
# NOTE(review): if frames resampled from the full data are split into
# train/validation/test afterwards, synthetic rows can carry information
# across the splits — prefer resampling only the training portion.
ros = SMOTE(random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)

In [None]:
# Show the number of rows per class after oversampling.
y_ros.value_counts()

In [None]:
# Split the ORIGINAL data first, so validation and test rows are never used to
# synthesise training samples. Oversampling before the split leaks information
# across the splits and puts synthetic rows into the held-out sets.
X_to_test,X_to_fit,X_to_val,y_to_test,y_to_fit,y_to_val = split_dataset(X,y)

# Oversample only the portion used for fitting; validation and test keep the
# original class distribution.
X_to_fit,y_to_fit = ros.fit_resample(X_to_fit,y_to_fit)

# Fit the feature preprocessor on the (balanced) fit split, then apply it everywhere.
preprocessor=preprocessing_features(X_to_fit)
X_fit=preprocessor.transform(X_to_fit)
X_test=preprocessor.transform(X_to_test)
X_val=preprocessor.transform(X_to_val)

# Encode the targets with an encoder fitted on the fit split.
y_labels=preprocessing_target(y_to_fit)
y_test=y_labels.transform(y_to_test)
y_fit=y_labels.transform(y_to_fit)
y_val=y_labels.transform(y_to_val)

In [None]:
# Fresh, unfitted estimators for the run on the balanced data.
models = {
    "Random_Forest": RandomForestClassifier(random_state=42),
    "Decision_Tree": DecisionTreeClassifier(random_state=42),
    "Gradient_Boosting": GradientBoostingClassifier(random_state=42),
    "Logistic_Regression": LogisticRegression(random_state=42),
    "XGBClassifier": XGBClassifier(),
    "CatBoosting_Classifier": CatBoostClassifier(random_state=42),
    "AdaBoost_Classifier": AdaBoostClassifier(random_state=42),
    "SVM_Classifier": svm.SVC(random_state=42),
}

# The stack uses all models above as base estimators and a NuSVC on top;
# it is registered under its own key once built.
svm_modelo_nu = svm.NuSVC(random_state=42)
stacking_model = models["Stacking_Classifier"] = StackingClassifier(
    estimators=list(models.items()),
    final_estimator=svm_modelo_nu,
    cv=10,
    stack_method='auto',
)


In [None]:
# Train and track every registered model on the SMOTE experiment's splits.
mlflow_tracking_training(
    experimental_name="Balance_Models_by_SMOTE",
    X_test=X_test,
    X_fit=X_fit,
    X_val=X_val,
    y_test=y_test,
    y_fit=y_fit,
    y_val=y_val,
)

In [55]:
import pickle
# Persist the fitted feature preprocessor. The context manager closes the
# file handle deterministically (it was previously left open).
with open('preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)

In [61]:
# Load a model artifact from a local MLflow run, closing the file afterwards.
# NOTE(review): this is an absolute, machine-specific path containing run ids;
# it will not exist on another machine or after a new run — make it configurable.
# pickle.load can execute arbitrary code: only load artifacts you produced yourself.
with open('d:/Romario/Programar/Ferramentas/Python/mlops/maternal/notebook/mlruns/871694342626557443/577b5f2c34b6431eb36fe1112825b8e8/artifacts/Gradient_Boosting/model.pkl','rb') as model_file:
    model=pickle.load(model_file)

In [62]:
# Fit the model that will be exported on the fit split. It was previously
# fitted on the held-out test split, i.e. on the small slice of data that is
# meant for evaluation only.
model.fit(X_fit,y_fit)

In [None]:
# Persist the model one directory above the notebook. The context manager
# closes the file handle deterministically (it was previously left open).
with open('../model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)