# Classification

## Objectives

*   Fit and evaluate a classification model to predict if a treatment will be successful or not.


## Inputs

* outputs/datasets/cleaned/FertilityTreatmentDataCleaned.csv.gz
* Instructions from the notebooks 02 and 04 on which variables to use for data cleaning and feature engineering.

## Outputs

* Train set (features and target)
* Test set (features and target)
* Data cleaning and Feature Engineering pipeline
* Modeling pipeline
* Feature importance plot


---

## Change working directory

Change the working directory from its current folder to its parent folder
* Access the current directory with os.getcwd()

In [None]:
import os
current_dir = os.getcwd()
current_dir

To make the parent of the current directory the new current directory:
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [None]:
os.chdir(os.path.dirname(current_dir))
print("A new current directory has been set")

Confirm the new current directory

In [None]:
current_dir = os.getcwd()
current_dir

---

## Load Data

In [None]:
import numpy as np
import pandas as pd

# Open dataset
df = pd.read_csv("outputs/datasets/cleaned/FertilityTreatmentDataCleaned.csv.gz")
        
print(df.shape)
df.head(3)

---

## ML Pipeline with all data

#### ML pipeline for Data Cleaning and Feature Engineering

#### Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from feature_engine.selection import SmartCorrelatedSelection
from feature_engine.encoding import OrdinalEncoder
from src.custom_transformers import OrdinalEncoderWithCategories


# Define the natural order for each feature
categories = [
    ["18-34", "35-37", "38-39", "40-42", "43-44", "45-50"],  # Patient age at treatment
    ["0", "1", "2", "3", "4", "5", ">5"],  # Total number of previous IVF cycles
    ["18-34", "35-37", "38-39", "40-42", "43-44", "45-50"],  # Patient/Egg provider age
    [
        "18-34",
        "35-37",
        "38-39",
        "40-42",
        "43-44",
        "45-50",
        "51-55",
        "56-60",
        ">60",
    ],  # Partner/Sperm provider age
    [
        "0",
        "0 - frozen cycle",
        "1-5",
        "6-10",
        "11-15",
        "16-20",
        "21-25",
        "26-30",
        "31-35",
        "36-40",
        ">40",
    ],  # Fresh eggs collected
    [
        "0",
        "0 - frozen cycle",
        "1-5",
        "6-10",
        "11-15",
        "16-20",
        "21-25",
        "26-30",
        "31-35",
        "36-40",
        ">40",
    ],  # Total eggs mixed
    [
        "0",
        "0 - frozen cycle",
        "1-5",
        "6-10",
        "11-15",
        "16-20",
        "21-25",
        "26-30",
        ">30",
    ],  # Total embryos created
    ["0", "1", "1e", "2", "3"],  # Embryos transferred
    [
        "0 - fresh cycle",
        "0 - frozen cycle",
        "1-5",
        "6-10",
        ">10",
    ],  # Total embryos thawed
]

# Define columns that need ordinal encoding
ordinal_columns = [
    "Patient age at treatment",
    "Total number of previous IVF cycles",
    "Patient/Egg provider age",
    "Partner/Sperm provider age",
    "Fresh eggs collected",
    "Total eggs mixed",
    "Total embryos created",
    "Embryos transferred",
    "Total embryos thawed",
]


def PreprocessingPipeline():
    """Build the (unfitted) data cleaning and feature engineering pipeline.

    Steps, in order:
      1. ``ordinal_encoding`` - ``OrdinalEncoderWithCategories`` configured
         with the module-level ``categories`` and ``ordinal_columns`` lists
         (presumably matched by position; confirm against
         ``src.custom_transformers``).
      2. ``ordinal_encoding_arbitrary`` - ``OrdinalEncoder`` with the
         ``arbitrary`` encoding method applied to the remaining categorical
         variables listed below.
      3. ``smart_correlation`` - ``SmartCorrelatedSelection`` using the
         Spearman method with a 0.9 threshold.

    Returns:
        A new ``Pipeline`` instance on every call.
    """
    arbitrary_variables = [
        "Specific treatment type",
        "Egg source",
        "Sperm source",
        "Patient ethnicity",
        "Date of embryo transfer",
    ]

    ordered_encoder = OrdinalEncoderWithCategories(
        categories=categories, columns=ordinal_columns
    )
    arbitrary_encoder = OrdinalEncoder(
        encoding_method='arbitrary',
        variables=arbitrary_variables,
    )
    correlation_selector = SmartCorrelatedSelection(
        method="spearman",
        threshold=0.9,
    )

    steps = [
        ("ordinal_encoding", ordered_encoder),
        ("ordinal_encoding_arbitrary", arbitrary_encoder),
        ("smart_correlation", correlation_selector),
    ]
    return Pipeline(steps)


PreprocessingPipeline()

### ML Pipeline for Modelling and Hyperparameter Optimisation

In [None]:
# Feat Scaling
from sklearn.preprocessing import StandardScaler

# Feat Selection
from sklearn.feature_selection import SelectFromModel

# ML algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier,
)
from xgboost import XGBClassifier


def PipelineClf(model):
    """Wrap ``model`` in a scaling -> feature selection -> estimator pipeline.

    The same ``model`` object is passed both to ``SelectFromModel`` (step
    ``feat_selection``) and used as the final ``model`` step.

    Returns:
        A new, unfitted ``Pipeline``.
    """
    steps = [
        ("scaler", StandardScaler()),
        ("feat_selection", SelectFromModel(model)),
        ("model", model),
    ]
    return Pipeline(steps)

Custom Class for Hyperparameter Optimisation using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV


class HyperparameterOptimizationSearch:
    """Run one ``GridSearchCV`` per candidate model and tabulate the results.

    Attributes:
        models: dict mapping a model name to an estimator instance.
        params: dict mapping the same model names to a parameter grid
            (a dict of lists, or a list of such dicts).
        keys: view over the model names, iterated by ``fit``.
        grid_searches: dict filled by ``fit`` with the fitted
            ``GridSearchCV`` object of every model.
    """

    def __init__(self, models, params):
        # Dictionaries of models and their corresponding hyperparameter grids
        self.models = models
        self.params = params
        # Store the keys (model names) for easy iteration
        self.keys = models.keys()
        # Fitted grid searches, keyed by model name
        self.grid_searches = {}

    @staticmethod
    def _count_param_combinations(param_grid):
        """Return the number of candidates a grid search evaluates.

        ``param_grid`` is either one mapping of parameter name -> list of
        values, or a list of such mappings. An empty mapping counts as one
        candidate (the estimator's default settings).
        """
        from collections.abc import Mapping
        from math import prod

        grids = [param_grid] if isinstance(param_grid, Mapping) else param_grid
        return sum(prod(len(values) for values in grid.values()) for grid in grids)

    def fit(self, X, y, cv, n_jobs=-1, verbose=1, scoring=None, refit="f1"):
        """Fit a ``GridSearchCV`` for every model and store it by name.

        Args:
            X: feature matrix passed to ``GridSearchCV.fit``.
            y: target passed to ``GridSearchCV.fit``.
            cv: cross-validation strategy.
            n_jobs: number of parallel jobs (-1 means using all processors).
            verbose: verbosity level forwarded to ``GridSearchCV``.
            scoring: scoring metrics forwarded to ``GridSearchCV``.
            refit: metric used to refit the best estimator after the search.
        """
        for key in self.keys:
            # Retrieve the corresponding hyperparameters for the model
            params = self.params[key]

            # Report the real number of candidates. ``len(params)`` would only
            # count the hyperparameter names (0 for an empty grid, 6 for a
            # grid with six parameters of three values each instead of 729).
            n_combinations = self._count_param_combinations(params)
            print(f"\nRunning GridSearchCV for {key} with {n_combinations} parameter combinations.\n")

            # Create a model pipeline (PipelineClf)
            model = PipelineClf(self.models[key])

            # Set up the GridSearchCV with the model, parameters, and specified settings
            gs = GridSearchCV(
                model,
                params,
                cv=cv,
                n_jobs=n_jobs,
                verbose=verbose,
                scoring=scoring,
                refit=refit,
            )

            # Fit the search and keep it under the model's name
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by="f1", scoring=None):
        """Summarise the mean cross-validation scores of every candidate.

        Args:
            sort_by: metric name; rows are sorted by ``mean_<sort_by>`` in
                descending order.
            scoring: dict whose keys are the metric names to read from
                ``cv_results_`` (as ``mean_test_<metric>``).

        Returns:
            A tuple ``(summary, grid_searches)`` where ``summary`` is a
            DataFrame with one row per model/parameter combination and
            ``grid_searches`` is the dict of fitted searches.

        Raises:
            ValueError: if ``scoring`` is not provided.
        """
        if scoring is None:
            raise ValueError("Scoring dictionary must be provided")

        rows = []
        for name, grid_search in self.grid_searches.items():
            cv_results = grid_search.cv_results_

            # Mean test score of every candidate, per metric
            mean_scores = {
                metric: cv_results[f"mean_test_{metric}"]
                for metric in scoring
            }

            for i, candidate in enumerate(cv_results["params"]):
                # Model name and the hyperparameters of this candidate
                row_data = {"estimator": name, **candidate}
                # Mean scores for each metric (averaged across CV folds)
                row_data.update(
                    {f"mean_{metric}": scores[i] for metric, scores in mean_scores.items()}
                )
                rows.append(pd.Series(row_data))

        # Best candidates first, according to the requested metric
        df = pd.DataFrame(rows).sort_values([f"mean_{sort_by}"], ascending=False)

        return df, self.grid_searches


### Split Train and Test Set

In [None]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = df.drop(columns=["Live birth occurrence"])  # Drop the target column from features
y = df["Live birth occurrence"]  # Define the target column

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X,               # Only features
    y,               # Target variable
    test_size=0.2,   # 20% of the data for testing
    random_state=0   # Random state for reproducibility
)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

Apply the preprocessing pipeline

In [None]:
X_train.head(3)

In [None]:
pipeline_pre_processing = PreprocessingPipeline()
X_train = pipeline_pre_processing.fit_transform(X_train)

In [None]:
X_train.columns

In [None]:
X_train.head(3)

Apply the pipeline to the test set

In [None]:
X_test = pipeline_pre_processing.transform(X_test)

In [None]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

Check Train Set Target distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

print(y_train.value_counts())

sns.set_style("whitegrid")
y_train.value_counts().plot(kind="bar", title="Train Set Target Distribution")
plt.show()

## Handle Target Imbalance

### Use RandomUnderSampler to balance Train Set target

Various strategies were tested to address target imbalance in the dataset (for more details, please refer to the [README file](https://github.com/pswhdev/ivf-predictor/blob/main/README.md)). Given the large dataset size and the comparable performance of oversampling (SMOTE) and undersampling (RandomUnderSampler), RandomUnderSampler was chosen to handle imbalance for the ensemble ML model.

In [None]:
from imblearn.under_sampling import RandomUnderSampler

undersample = RandomUnderSampler(sampling_strategy='majority', random_state=0)
X_train, y_train = undersample.fit_resample(X_train, y_train)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

Check Train Set Target distribution after resampling

In [None]:
import matplotlib.pyplot as plt

print(y_train.value_counts())

y_train.value_counts().plot(kind='bar', title='Train Set Target Distribution')
plt.show()

## Grid Search CV - Sklearn

### Use standard hyperparameters to find most suitable algorithm 

Set Up the Models and Parameters

In [None]:
models_quick_search = {
    "LogisticRegression": LogisticRegression(random_state=0),
    "XGBClassifier": XGBClassifier(random_state=0),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=0),
    "RandomForestClassifier": RandomForestClassifier(random_state=0),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=0),
    "ExtraTreesClassifier": ExtraTreesClassifier(random_state=0),
    "AdaBoostClassifier": AdaBoostClassifier(random_state=0),
}

params_quick_search = {
    "LogisticRegression": {},
    "XGBClassifier": {},
    "DecisionTreeClassifier": {},
    "RandomForestClassifier": {},
    "GradientBoostingClassifier": {},
    "ExtraTreesClassifier": {},
    "AdaBoostClassifier": {},
}

Define the custom scoring metrics

In [None]:
from sklearn.metrics import make_scorer, accuracy_score, recall_score, precision_score, f1_score, matthews_corrcoef, confusion_matrix

def specificity_score(y_true, y_pred):
    """Return the specificity, TN / (TN + FP), for binary labels 0 and 1.

    Args:
        y_true: true labels (0 = negative class, 1 = positive class).
        y_pred: predicted labels.

    Returns:
        The specificity, or 0.0 when there are no actual negatives.
    """
    # Pin the label set so the matrix is always 2x2. Without it, data that
    # contains a single class yields a 1x1 matrix and the unpacking fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    actual_negatives = tn + fp
    # The ratio is undefined without actual negatives; report 0.0, not NaN.
    return tn / actual_negatives if actual_negatives else 0.0

def npv_score(y_true, y_pred):
    """Return the negative predictive value, TN / (TN + FN), for labels 0/1.

    Args:
        y_true: true labels (0 = negative class, 1 = positive class).
        y_pred: predicted labels.

    Returns:
        The negative predictive value, or 0.0 when nothing was predicted
        as negative.
    """
    # Pin the label set so the matrix is always 2x2. Without it, data that
    # contains a single class yields a 1x1 matrix and the unpacking fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    predicted_negatives = tn + fn
    # The ratio is undefined without negative predictions; report 0.0, not NaN.
    return tn / predicted_negatives if predicted_negatives else 0.0

# Dictionary of all the metrics. The keys are the metric names that
# HyperparameterOptimizationSearch.score_summary looks up in cv_results_
# as `mean_test_<key>`, and that `refit=` / `sort_by=` refer to.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    # Recall of class 0 (pos_label=0), i.e. TN / (TN + FP) when 1 is treated
    # as the positive class.
    # NOTE(review): this is the same quantity specificity_score computes, so
    # 'recall' and 'specificity' are expected to report identical values.
    'recall': make_scorer(recall_score, pos_label=0),
    'specificity': make_scorer(specificity_score),
    # Precision and F1 are computed for class 1.
    'precision': make_scorer(precision_score, pos_label=1),
    'npv': make_scorer(npv_score),
    'mcc': make_scorer(matthews_corrcoef),
    'f1': make_scorer(f1_score, pos_label=1)
}

Quick GridSearch CV - Binary Classifier

In [None]:
search = HyperparameterOptimizationSearch(models=models_quick_search, params=params_quick_search)

search.fit(X_train, y_train, cv=5, n_jobs=-1, scoring=scoring, refit='f1')

Check results

In [None]:
# Get the results and sort by the F1 score
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='f1', scoring=scoring)
grid_search_summary


### Perform extensive search on the most suitable algorithm to find the best hyperparameter configuration using GridSearchCV

Model and hyperparameters for the extensive search: the Gradient Boosting Classifier was chosen as the best model because it showed the highest mean_f1 in the quick search. The hyperparameter ranges were chosen to find the settings that best balance model performance against overfitting. See the [Scikit-learn documentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html).

In [None]:
# Seed the estimator exactly like the quick-search candidates. The grid below
# explores subsample < 1.0 and max_features, which draw random samples, so an
# unseeded estimator makes the search results irreproducible between runs.
models_search = {
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=0),
}

# Parameter names carry the `model__` prefix because they target the "model"
# step of the pipeline built by PipelineClf.
params_search = {
    'GradientBoostingClassifier': {
        'model__n_estimators': [100, 200, 300],
        'model__learning_rate': [0.01, 0.05, 0.1],
        'model__max_depth': [3, 4, 5],
        'model__subsample': [0.8, 0.9, 1.0],
        'model__min_samples_split': [2, 5, 10],
        'model__max_features': [None, 'sqrt', 'log2']
    }
}

Extensive GridSearch CV - Binary Classifier

In [None]:
print(y_train.value_counts())

search = HyperparameterOptimizationSearch(models=models_search, params=params_search)
search.fit(X_train, y_train, cv=5, n_jobs=-1, scoring=scoring, refit='f1')

Check results

In [None]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='f1', scoring=scoring)
grid_search_summary 

Get best model name programmatically

In [None]:
best_model = grid_search_summary.iloc[0,0]
best_model

Parameters for best model

In [None]:
best_parameters = grid_search_pipelines[best_model].best_params_
best_parameters

Define the best clf pipeline

In [None]:
pipeline_clf = grid_search_pipelines[best_model].best_estimator_
pipeline_clf

## Assess feature importance

In [None]:
X_train.head(3)

Assess feature importance on the current model with `.feature_importances_`

In [None]:
# create DataFrame to display feature importance
df_feature_importance = pd.DataFrame(
    data={
        "Feature": X_train.columns[pipeline_clf["feat_selection"].get_support()],
        "Importance": pipeline_clf["model"].feature_importances_,
    }
).sort_values(by="Importance", ascending=False)

# re-assign best_features order
best_features = df_feature_importance["Feature"].to_list()

# Most important features statement and plot
print(
    f"* These are the {len(best_features)} most important features in descending order. "
    f"The model was trained on them: \n{df_feature_importance['Feature'].to_list()}"
)

df_feature_importance.plot(kind="bar", x="Feature", y="Importance")
plt.show()

## Evaluate Pipeline on Train and Test Sets

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import pandas as pd

def confusion_matrix_and_report(X, y, pipeline, label_map):
    """Print the confusion matrix, classification report and F1 scores.

    Args:
        X: features passed to ``pipeline.predict``.
        y: true target values for ``X``.
        pipeline: fitted estimator/pipeline exposing ``predict``.
        label_map: class display names, in the order of the sorted class
            labels (e.g. ``['No Success', 'Success']``).

    Returns:
        None. Everything is printed.
    """
    # Make predictions using the pipeline
    prediction = pipeline.predict(X)

    # scikit-learn returns rows = actual classes and columns = predicted
    # classes. The table below shows predictions as rows and actual classes
    # as columns, so the matrix is transposed to match those axis labels.
    matrix = confusion_matrix(y_true=y, y_pred=prediction).T

    print("---  Confusion Matrix  ---")
    print(
        pd.DataFrame(
            matrix,
            columns=["Actual " + label for label in label_map],
            index=["Prediction " + label for label in label_map]
        )
    )
    print("\n")

    # Print classification report
    print("---  Classification Report  ---")
    print(classification_report(y, prediction, target_names=label_map), "\n")

    # Per-class F1 scores and their unweighted mean
    f1_scores = f1_score(y, prediction, average=None)
    mean_f1_score = f1_scores.mean()

    # Print F1 scores with two decimal places
    f1_scores_rounded = {label: round(score, 2) for label, score in zip(label_map, f1_scores)}
    mean_f1_score_rounded = round(mean_f1_score, 2)

    print(f"F1 Score for each class: {f1_scores_rounded}")
    print(f"Mean F1 Score: {mean_f1_score_rounded}\n")
    

def clf_performance(X_train, y_train, X_test, y_test, pipeline, label_map):
    """Print the evaluation report for the train set, then for the test set.

    Each split is handed to ``confusion_matrix_and_report`` together with
    ``pipeline`` and ``label_map``.
    """
    splits = (
        ("#### Train Set #### \n", X_train, y_train),
        ("#### Test Set ####\n", X_test, y_test),
    )
    for banner, features, target in splits:
        print(banner)
        confusion_matrix_and_report(features, target, pipeline, label_map)


Evaluation:

In [None]:
clf_performance(X_train=X_train, y_train=y_train,
                X_test=X_test, y_test=y_test,
                pipeline=pipeline_clf,
                label_map= ['No Success', 'Success'] 
                )

---

## Refit pipeline with best features

### Refit ML Pipeline and Resampling

### Rewrite ML pipeline for Data Cleaning and Feature Engineering

In [None]:
best_features 

#### Reassign best features names

One-hot encoding creates new columns and appends the value after the name as `_<value>`.
Therefore it is necessary to rename the features to their original names, so that when the pipeline is executed it finds the correct columns in the dataframe.

### Split Train Test Set

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['Live birth occurrence'], axis=1),
    df['Live birth occurrence'],
    test_size=0.2,
    random_state=0,
)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

New Pipeline for Feature Engineering considering only the best features and without Smart Correlation (manually entered).

In [None]:
from sklearn.pipeline import Pipeline
from feature_engine.encoding import OrdinalEncoder
from src.custom_transformers import OrdinalEncoderWithCategories


# Define the correct order for each feature
categories = [
    ["18-34", "35-37", "38-39", "40-42", "43-44", "45-50"],  # Patient/Egg provider age
    [
        "0",
        "0 - frozen cycle",
        "1-5",
        "6-10",
        "11-15",
        "16-20",
        "21-25",
        "26-30",
        ">30",
    ],  # Total embryos created
    ["0", "1", "1e", "2", "3"],  # Embryos transferred
]

# Define columns that need ordinal encoding
ordinal_columns = [
    "Patient/Egg provider age",
    "Total embryos created",
    "Embryos transferred",

]


def PreprocessingPipeline():
    """Build the (unfitted) feature engineering pipeline for the best features.

    Steps, in order:
      1. ``ordinal_encoding`` - ``OrdinalEncoderWithCategories`` configured
         with the module-level ``categories`` and ``ordinal_columns`` lists.
      2. ``ordinal_encoding_arbitrary`` - ``OrdinalEncoder`` with the
         ``arbitrary`` encoding method for "Date of embryo transfer".

    Returns:
        A new ``Pipeline`` instance on every call.
    """
    ordered_encoder = OrdinalEncoderWithCategories(
        categories=categories, columns=ordinal_columns
    )
    arbitrary_encoder = OrdinalEncoder(
        encoding_method='arbitrary',
        variables=["Date of embryo transfer"],
    )

    # No need for smart correlation, since best features are already selected
    steps = [
        ("ordinal_encoding", ordered_encoder),
        ("ordinal_encoding_arbitrary", arbitrary_encoder),
    ]
    return Pipeline(steps)


pipeline_pre_processing = PreprocessingPipeline()
pipeline_pre_processing

### Rewrite ML Pipeline for Modelling

Function for Pipeline optimisation

In [None]:
# Pipeline Optimization: Model
def PipelineClf(model):
    """Wrap ``model`` in a scaling -> estimator pipeline.

    Returns:
        A new, unfitted ``Pipeline`` with the steps ``scaler`` and ``model``.
    """
    # no feature selection needed anymore since it is already known which features to use
    steps = [
        ("scaler", StandardScaler()),
        ("model", model),
    ]
    return Pipeline(steps)

Filter only the most important variables

In [None]:
X_train = X_train.filter(best_features)
X_test = X_test.filter(best_features)

In [None]:
# Apply the preprocessing pipeline
X_train = pipeline_pre_processing.fit_transform(X_train)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
X_train.head(3)

Apply the data cleaning and feature engineering pipeline

Apply the pipeline to the test set

In [None]:
X_test = pipeline_pre_processing.transform(X_test)

Check Train Set Target distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

print(y_train.value_counts())

sns.set_style("whitegrid")
y_train.value_counts().plot(kind="bar", title="Train Set Target Distribution")
plt.show()

### Handle Target Imbalance

Rebalance Train Set target

In [None]:
print("Data types of transformed X_train:")
print(X_train.dtypes)
print(X_train.head(3))

In [None]:
from imblearn.under_sampling import RandomUnderSampler

undersample = RandomUnderSampler(sampling_strategy='majority', random_state=0)
X_train, y_train = undersample.fit_resample(X_train, y_train)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

Check Train Set Target distribution after rebalance

In [None]:
y_train.value_counts().plot(kind='bar',title='Train Set Target Distribution')
plt.show()

### Grid Search CV: Sklearn

Using the most suitable model from the last section and its best hyperparameter configuration.

We are using the same model from  the last GridCV search

In [None]:
models_search 

And the best parameters from the last GridCV search 

In [None]:
best_parameters

Manually type in the hyperparameter values; each value has to be wrapped in a list.

In [None]:
params_search = {
    'GradientBoostingClassifier': {
        'model__learning_rate': [0.01],
        'model__max_depth': [3],
        'model__max_features': [None],
        'model__min_samples_split': [5],
        'model__n_estimators': [100],
        'model__subsample': [0.8],
    }
}
params_search

GridSearch CV

In [None]:
search = HyperparameterOptimizationSearch(models=models_search, params=params_search)
search.fit(X_train, y_train, cv=5, n_jobs=-1, scoring=scoring, refit="f1")

Check results

In [None]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='f1', scoring=scoring)
grid_search_summary

Define the best clf pipeline

In [None]:
best_model = grid_search_summary.iloc[0, 0]
pipeline_clf = grid_search_pipelines[best_model].best_estimator_
pipeline_clf

### Assess feature importance

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re

# Pair each feature the model was trained on with its importance
new_best_features = X_train.columns
df_feature_importance = pd.DataFrame({
    'Feature': new_best_features,
    'Importance': pipeline_clf['model'].feature_importances_
}).sort_values(by='Importance', ascending=False)

# Function to reassign features to their original names
def reassign_best_features(features):
    """Return every feature name with its final ``_<suffix>`` part removed.

    Only the last underscore-delimited segment is stripped (the form one-hot
    encoded columns take); names without such a suffix are returned unchanged.

    Args:
        features: iterable of feature name strings.

    Returns:
        A list of base feature names, in the input order.
    """
    trailing_suffix = re.compile(r'_[^_]+$')
    base_names = []
    for feature in features:
        base_names.append(trailing_suffix.sub('', feature))
    return base_names

# Apply the reassignment to create a new list of base feature names
df_feature_importance['BaseFeature'] = reassign_best_features(df_feature_importance['Feature'])

# Group by the original base feature names and sum the importances
aggregated_importances = df_feature_importance.groupby('BaseFeature')['Importance'].sum().reset_index()

# Sort the aggregated feature importances in descending order
aggregated_importances = aggregated_importances.sort_values(by='Importance', ascending=False)

# Print most important features
print(f"* These are the {len(aggregated_importances)} most important features in descending order. "
      f"The model was trained on them: \n{aggregated_importances['BaseFeature'].to_list()}")

# Plot the aggregated feature importances
aggregated_importances.plot(kind='bar', x='BaseFeature', y='Importance')
plt.title('Aggregated Feature Importances')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.show()


### Evaluate Pipeline on Train and Test Sets

Evaluation: We cross-check with metrics defined in the ML business case.
* 70% Recall for No Success, on train and test set.
* 70% Precision for Success on train and test set. 

In [None]:
clf_performance(X_train=X_train, y_train=y_train,
                X_test=X_test, y_test=y_test,
                pipeline=pipeline_clf,
                label_map= ['No Success', 'Success'] 
                )

## Push files to Repo

Generate the following files
* Train set
* Test set
* Data cleaning and Feature Engineering pipeline
* Modeling pipeline
* features importance plot

In [None]:
import joblib
import os

version = 'v1'
file_path = f'outputs/ml_pipeline/ivf_success_predictor/{version}'

try:
    # Check if the directories exist
    if os.path.exists(file_path):
        print('Old version is already available. Please create a new version.')
    else:
        # Create the directory if it does not exist
        os.makedirs(name=file_path)
        print(f"Directory {file_path} created successfully.")
except Exception as e:
    print(f"An error occurred: {e}")

### Train Set

* note that the variables **are transformed already** in X_train after imbalance handling was applied.

In [None]:
print(X_train.shape)
X_train.head()

In [None]:
X_train.to_csv(f"{file_path}/X_train.csv.gz", index=False, compression='gzip')

In [None]:
y_train

In [None]:
y_train.to_csv(f"{file_path}/y_train.csv.gz", index=False, compression='gzip')

### Test Set

* note that the variables are transformed already in X_test

In [None]:
print(X_test.shape)
X_test.head()

In [None]:
X_test.to_csv(f"{file_path}/X_test.csv.gz", index=False, compression='gzip')

In [None]:
y_test

In [None]:
y_test.to_csv(f"{file_path}/y_test.csv.gz", index=False, compression='gzip')

#### Save Best features (to be used for generating the widgets on the live prediction page)

In [None]:
best_features

In [None]:
# Create a DataFrame
best_features_df = pd.DataFrame(best_features, columns=['feature']) 

best_features_df = best_features_df.drop_duplicates()

best_features_df.to_csv(f'{file_path}/best_features.csv.gz', index=False, compression='gzip')


### ML Pipelines: Data Cleaning and Feat Eng pipeline and Modelling Pipeline

Both pipelines should be used in conjunction to predict Live Data.

* To predict on Train Set, Test Set we use only pipeline_clf, since the data is already processed.

Pipeline responsible for Data Cleaning and Feature Engineering.

In [None]:
pipeline_pre_processing

In [None]:
import joblib
import gzip

with gzip.open(f"{file_path}/clf_pipeline_pre_processing.pkl.gz", 'wb') as f_out:
    joblib.dump(pipeline_pre_processing, f_out)

Pipeline responsible for Feature Scaling and Model

In [None]:
pipeline_clf

### Save the model as pkl compressed

In [None]:
import joblib
import gzip

# Save the model directly into a compressed gzip file
with gzip.open(f"{file_path}/clf_pipeline_model.pkl.gz", 'wb') as f_out:
    joblib.dump(pipeline_clf, f_out)

### Feature Importance plot

In [None]:
aggregated_importances.plot(kind='bar', x='BaseFeature', y='Importance')
plt.title('Feature Importances')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.show()


In [None]:
aggregated_importances.plot(kind='bar', x='BaseFeature', y='Importance')
plt.title('Feature Importances')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.savefig(f'{file_path}/features_importance.png', bbox_inches='tight')

---