# Ohio Clinic Dataset - Advanced ML Pipeline

In [None]:
# Install the extra packages imported below: ydata-profiling (ProfileReport) and optuna-integration (OptunaSearchCV)
!pip install ydata-profiling optuna-integration

## Mount GDrive

In [None]:
# Mount Google Drive under /content/drive (requires the Google Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

## Imports

In [None]:
import joblib

import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns

import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report

import optuna
from optuna.integration import OptunaSearchCV
from optuna.distributions import (
    IntDistribution ,
    CategoricalDistribution,
    FloatDistribution,
)

In [None]:
from sklearn import set_config

# Configure scikit-learn globally so that estimators and pipelines are
# displayed as diagrams instead of their plain-text representation
set_config(display='diagram')

## Constants

In [3]:
# Verbosity level for the optimization: set it to 1 to see its progress
VERBOSE = 1

In [None]:
# Base URL of the remote folder that hosts the dataset CSV files
BASE_URL = "https://raw.githubusercontent.com/serivan/mldmlab/master/Datasets/"

# Full locations of the train and test CSV files, built on top of BASE_URL
TRAIN_DATA_PATH = f"{BASE_URL}mldm_ohio_clinic_train.csv"
TEST_DATA_PATH = f"{BASE_URL}mldm_ohio_clinic_test.csv"

# Fixed random seed (intended to keep runs reproducible)
RANDOM_STATE = 3993

In [None]:
# Define the optuna search hyperparameters
# NOTE(review): the cells that consume these constants are still empty, so the
# per-constant notes below are assumptions to confirm once the search is defined.
N_TRIALS = 20  # presumably the number of optuna trials to run
N_JOBS = 8  # presumably the number of parallel jobs; verify against the available CPU cores
CROSS_VALIDATION = 3  # presumably the number of cross-validation folds

## Load Data

In [None]:
# Load the train and test data from the CSV files using pandas

### Features Exploration

In [None]:
# Print the first few rows of the train data to explore the columns and data

In [None]:
# Print the length of the train and test data

In [None]:
# Print the frequency of each value in the target column

In [None]:
# Print the types of the columns

### Dataset Profiling (ydata-profiling, formerly Pandas Profiling)

In [None]:
# Profile the train data using the ydata_profiling library

In [None]:
# Save the profile report to an HTML file

### Data Visualisation

In [None]:
# Show the bar chart of the Hipertension column

In [None]:
# Plot a box plot of the patients’ age by no-show status

### Data Issues Identification

In [None]:
# Count the number of missing values in each column of the train data

## Data Preprocessing

### Duplicates removal

In [None]:
# Remove duplicates from the train data

In [None]:
# Print the new shapes for train and test data

### X and y split

In [None]:
# Divide the train and test data into features and target

### Convert the target column to binary values

In [None]:
# Convert the target column to binary values

In [None]:
# Count the values for mapping check

## Classification Pipeline Definition

### In-Pipeline Columns Selection and Transformation

In [None]:
# Define the column transformation for the numerical columns considering that
# 1) there are missing values and they have to be imputed
# 2) the age column has to be always positive
# 3) the columns have to be scaled

# Define the numerical columns to be transformed

# Create the numerical pipeline with the SimpleImputer and MinMaxScaler


In [None]:
# Define the column transformation for logical columns considering that
# 1) there are missing values and they have to be imputed

# Define the logical columns to be transformed

# Create the logical pipeline with the SimpleImputer


In [None]:
# Define the column transformation for the string columns considering that
# 1) there are missing values and they have to be imputed
# 2) the columns have to be encoded using numerical encoding

# Define the categorical columns in the string format to be transformed

# Create the string pipeline with the SimpleImputer and OneHotEncoder


### Preprocessor

In [None]:
# Define the preprocessor with the ColumnTransformer and the defined pipelines


In [None]:
# Show the preprocessor diagram


In [None]:
# Show the preprocessor parameters


###  Classification Model

In [None]:
# Define the classifier model to be used


In [None]:
# Show the classifier diagram


In [None]:
# Show the classifier parameters


### Classification Pipeline

In [None]:
# Define the classification pipeline with the preprocessor and the classifier


In [None]:
# Show the pipeline diagram


In [None]:
# Show the pipeline parameters


In [None]:
# Define the scoring metrics for the classification

### Hyperparameter Tuning (Optuna)

In [None]:
# Define the parameter search space for the classifier and the preprocessor for the OptunaSearchCV


In [None]:
# Create the optuna study for the optimization


In [None]:
# Define the OptunaSearchCV with the pipeline, the parameter search space, and the number of trials


In [None]:
# Display the OptunaSearchCV diagram


In [None]:
# Display the OptunaSearchCV parameters


## Run Experiment

###   Pipeline Training

In [None]:
# Fit the OptunaSearchCV with the train data


In [None]:
# Show the best parameters found by the OptunaSearchCV


In [None]:
# Show the best score found by the OptunaSearchCV


In [None]:
# Show the best estimator found by the OptunaSearchCV


### Pipeline Saving

In [None]:
# Save the classification pipeline to a file using joblib


### Pipeline Reloading

In [None]:
# Load the classification pipeline from a file using joblib


### Pipeline Evaluation

In [None]:
# Evaluate the classification pipeline on the test data


In [None]:
# Predict the target values using the classification pipeline


In [None]:
# Print the classification report


# Exercises

1. Find the best scaler among MinMaxScaler, StandardScaler, and RobustScaler
1. Use IterativeImputer instead of SimpleImputer
1. Transform the data using PCA in the Pipeline and find the best N components
1. Replace the classification model with an XGBoost classifier