# Ohio Clinic Dataset - Advanced ML Pipeline

In [3]:
!pip install ydata-profiling optuna-integration

Collecting ydata-profiling
  Downloading ydata_profiling-4.8.3-py2.py3-none-any.whl (359 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/359.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m358.4/359.5 kB[0m [31m13.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m359.5/359.5 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optuna-integration
  Downloading optuna_integration-3.6.0-py3-none-any.whl (93 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.4/93.4 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting visions[type_image_path]<0.7.7,>=0.7.5 (from ydata-profiling)
  Downloading visions-0.7.6-py3-none-any.whl (104 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.8/104.8 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting htmlmin==0.1.12 (from ydata-profiling)
  Downloading htm

## Mount GDrive

In [4]:
# Mount Google Drive at /content/drive. The data paths defined in the Constants
# section (BASE_FOLDER) live under this mount point, so the notebook cannot
# read its CSV files unless this cell succeeds.
# NOTE(review): the recorded output of this cell is a MessageError ("credential
# propagation was unsuccessful") — re-run it and authorize Drive access before
# executing the rest of the notebook.
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

## Imports

In [None]:
import joblib

import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns

import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report

import optuna
from optuna.integration import OptunaSearchCV
from optuna.distributions import (
    IntDistribution ,
    CategoricalDistribution,
    FloatDistribution,
)

In [None]:
from sklearn import set_config

# Ask scikit-learn to display estimators and pipelines as diagrams when they
# are shown as a cell output (used by the "Show ... diagram" cells below).
# NOTE(review): recent scikit-learn releases appear to default to 'diagram'
# already — confirm against the installed version before dropping this cell.
set_config(display='diagram')

## Constants

In [5]:
# Verbosity level for the hyperparameter optimization: 1 shows its progress.
# NOTE(review): not referenced in the visible cells; presumably passed as the
# `verbose` argument of OptunaSearchCV — confirm once that cell is filled in.
VERBOSE = 1

In [6]:
# Directory holding the dataset CSV files; it sits under the /content/drive
# mount point, so Google Drive must be mounted before these paths resolve.
BASE_FOLDER = "/content/drive/MyDrive/Colab Notebooks/mldm_2024/data/"

# Full paths of the train and test CSV files (BASE_FOLDER already ends in "/")
TRAIN_DATA_PATH = f"{BASE_FOLDER}mldm_ohio_clinic_train.csv"
TEST_DATA_PATH = f"{BASE_FOLDER}mldm_ohio_clinic_test.csv"

# Fixed value to use as the random state
RANDOM_STATE = 3993

In [7]:
# Settings for the Optuna hyperparameter search.
# NOTE(review): the cells that consume these are still empty; the roles below are
# inferred from the names and the OptunaSearchCV import — confirm when wiring them in.
N_TRIALS = 20  # presumably the number of trials to run
N_JOBS = 8  # presumably the number of parallel jobs
CROSS_VALIDATION = 3  # presumably the number of cross-validation folds

## Load Data

In [8]:
# Load the train and test data from the CSV files using pandas

### Features Exploration

In [9]:
# Print the first few rows of the train data to explore the columns and data

In [10]:
# Print the length of the train and test data

In [11]:
# Print the frequency of each value in the target column

In [12]:
# Print the types of the columns

### Dataset Profiling (ydata-profiling)

In [13]:
# Profile the train data using the ydata_profiling library

In [14]:
# Save the profile report to an HTML file

### Data Visualisation

In [15]:
# Show the bar chart of the Hipertension column

In [16]:
# Plot a box plot of the patients’ age grouped by no-show status

### Data Issues Identification

In [17]:
# Count the number of missing values in each column of the train data

## Data Preprocessing

### Duplicates removal

In [18]:
# Remove duplicates from the train data

In [19]:
# Print the new shapes for train and test data

### X and y split

In [20]:
# Divide the train and test data into features and target

### Convert the target column to binary values

In [21]:
# Convert the target column to binary values

In [22]:
# Count the values for mapping check

## Classification Pipeline Definition

### In-Pipeline Columns Selection and Transformation

In [23]:
# Define the column transformation for the numerical columns considering that
# 1) there are missing values and they have to be imputed
# 2) the age column has to be always positive
# 3) the columns have to be scaled

# Define the numerical columns to be transformed

# Create the numerical pipeline with the SimpleImputer and MinMaxScaler


In [24]:
# Define the column transformation for logical columns considering that
# 1) there are missing values and they have to be imputed

# Define the logical columns to be transformed

# Create the logical pipeline with the SimpleImputer


In [25]:
# Define the column transformation for the string columns considering that
# 1) there are missing values and they have to be imputed
# 2) the columns have to be encoded using numerical encoding

# Define the categorical columns in the string format to be transformed

# Create the string pipeline with the SimpleImputer and OneHotEncoder


### Preprocessor

In [26]:
# Define the preprocessor with the ColumnTransformer and the defined pipelines


In [27]:
# Show the preprocessor diagram


In [28]:
# Show the preprocessor parameters


### Classification Model

In [29]:
# Define the classifier model to be used


In [30]:
# Show the classifier diagram


In [31]:
# Show the classifier parameters


### Classification Pipeline

In [32]:
# Define the classification pipeline with the preprocessor and the classifier


In [33]:
# Show the pipeline diagram


In [34]:
# Show the pipeline parameters


In [35]:
# Define the scoring metrics for the classification

### Hyperparameter Tuning (Optuna)

In [36]:
# Define the parameter search space for the classifier and the preprocessor for the OptunaSearchCV


In [37]:
# Create the optuna study for the optimization


In [38]:
# Define the OptunaSearchCV with the pipeline, the parameter search space, and the number of trials


In [39]:
# Display the OptunaSearchCV diagram


In [40]:
# Display the OptunaSearchCV parameters


## Run Experiment

### Pipeline Training

In [41]:
# Fit the OptunaSearchCV with the train data


In [42]:
# Show the best parameters found by the OptunaSearchCV


In [43]:
# Show the best score found by the OptunaSearchCV


In [44]:
# Show the best estimator found by the OptunaSearchCV


### Pipeline Saving

In [45]:
# Save the classification pipeline to a file using joblib


### Pipeline Reloading

In [46]:
# Load the classification pipeline from a file using joblib


### Pipeline Evaluation

In [47]:
# Evaluate the classification pipeline on the test data


In [48]:
# Predict the target values using the classification pipeline


In [49]:
# Print the classification report


# Exercises

1. Find the best scaler among MinMaxScaler, StandardScaler, and RobustScaler.
1. Use IterativeImputer instead of SimpleImputer.
1. Transform the data using PCA in the pipeline and find the best number of components.
1. Replace the classification model with XGBoost.