# Classification

## Objectives

*   Fit and evaluate a classification model to predict if a treatment will be successful or not.


## Inputs

* outputs/datasets/collection/FertilityTreatmentData.csv.gz
* Instructions from the notebooks 02 and 04 on which variables to use for data cleaning and feature engineering.

## Outputs

* Train set (features and target)
* Test set (features and target)
* Data cleaning and Feature Engineering pipeline
* Modeling pipeline
* Feature importance plot


---

## Change working directory

Change the working directory from its current folder to its parent folder
* Access the current directory with os.getcwd()

In [None]:
import os
current_dir = os.getcwd()
current_dir

To make the parent of the current directory the new current directory:
* os.path.dirname() gets the parent directory
* os.chir() defines the new current directory

In [None]:
os.chdir(os.path.dirname(current_dir))
print("A new current directory has been set")

Confirm the new current directory

In [None]:
current_dir = os.getcwd()
current_dir

---

## Load Data

In [None]:
import numpy as np
import pandas as pd

# Open dataset
df = pd.read_csv("outputs/datasets/cleaned/FertilityTreatmentDataCleaned.csv")
        
print(df.shape)
df.head(3)

---

## ML Pipeline with all data

#### ML pipeline for Data Cleaning and Feature Engineering

#### Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from feature_engine.selection import SmartCorrelatedSelection
from feature_engine.encoding import OneHotEncoder, OrdinalEncoder


def PreprocessingPipeline():
    pipeline_base = Pipeline(
        [
            (
                "ordinal_encoding",
                OrdinalEncoder(
                    encoding_method='arbitrary',
                    variables=[
                        "Patient age at treatment",
                        "Partner/Sperm provider age",
                        "Patient/Egg provider age",
                        "Total number of previous IVF cycles",
                        "Fresh eggs collected",
                        "Total eggs mixed",
                        "Total embryos created",
                        "Embryos transferred",
                        "Total embryos thawed",
                    ]
                ),
            ),
            (
                "one_hot_encoding",
                OneHotEncoder(
                    variables=[
                        "Specific treatment type",
                        "Egg source",
                        "Sperm source",
                        "Patient ethnicity",
                        "Partner ethnicity",
                        "Date of embryo transfer",

                    ]
                ),
            ),
            
            (
                "smart_correlation",
                SmartCorrelatedSelection(
                    method="spearman",
                    threshold=0.6,
                    selection_method="variance"
                ),
            ),
        ]
    )

    return pipeline_base


PreprocessingPipeline()

### ML Pipeline for Modelling and Hyperparameter Optimisation

In [None]:
# Feat Scaling
from sklearn.preprocessing import StandardScaler

# Feat Selection
from sklearn.feature_selection import SelectFromModel

# ML algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier,
)
from xgboost import XGBClassifier


def PipelineClf(model):
    pipeline_base = Pipeline(
        [
            ("scaler", StandardScaler()),
            ("feat_selection", SelectFromModel(model)),
            ("model", model),
        ]
    )

    return pipeline_base

Custom Class for Hyperparameter Optimisation using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV


class HyperparameterOptimizationSearch:
    
    def __init__(self, models, params):
        # Initialize with dictionaries of models and their corresponding hyperparameters
        self.models = models
        self.params = params
        # Store the keys (model names) for easy iteration
        self.keys = models.keys()  
        # Initialize an empty dictionary to store grid search results
        self.grid_searches = {}    

    def fit(self, X, y, cv, n_jobs=-1, verbose=1, scoring=None, refit="f1"):
        # Fit GridSearchCV for each model with provided data and parameters
        
        for key in self.keys:
            # Inform about the model being processed and the number of parameter combinations
            print(f"\nRunning GridSearchCV for {key} with {len(self.params[key])} parameter combinations.\n")

            # Create a model pipeline (PipelineClf)
            model = PipelineClf(self.models[key])
            
            # Retrieve the corresponding hyperparameters for the model
            params = self.params[key]
            
            # Set up the GridSearchCV with the model, parameters, and specified settings
            gs = GridSearchCV(
                model,
                params,
                cv=cv,           # Cross-validation strategy
                n_jobs=n_jobs,   # Number of parallel jobs (-1 means using all processors)
                verbose=verbose, # Verbosity level
                scoring=scoring, # Scoring metrics
                refit=refit,     # Metric to refit the model on after grid search
            )
            
            # Fit the model with the provided data (X and y)
            gs.fit(X, y)
            
            # Store the result of the grid search in the dictionary under the model's name
            self.grid_searches[key] = gs
            
    def score_summary(self, sort_by="f1", scoring=None):
        # Summarize and return the results of the grid search
        
        if scoring is None:
            # Ensure a scoring dictionary is provided; otherwise, raise an error
            raise ValueError("Scoring dictionary must be provided")
        
        # List to store rows for the summary DataFrame
        rows = []
        
        for k in self.grid_searches:
            # For each model, retrieve the grid search results
            
            # Get the hyperparameter combinations tested
            params = self.grid_searches[k].cv_results_["params"]
            
            # Retrieve the mean test scores for each metric
            mean_scores = {
                metric: self.grid_searches[k].cv_results_[f"mean_test_{metric}"]
                for metric in scoring.keys()
            }
            # For each parameter combination, create a dictionary of the results
            for i in range(len(params)):
                # Add model name and hyperparameters
                row_data = {"estimator": k, **params[i]}
                # Add mean scores for each metric (averaged across CV folds)
                row_data.update({f"mean_{metric}": mean_scores[metric][i] for metric in mean_scores})
                # Convert to a Pandas Series and add to the list
                rows.append(pd.Series(row_data))
        
        # Convert the list of Series into a DataFrame and sort it by the specified metric
        df = pd.DataFrame(rows).sort_values([f"mean_{sort_by}"], ascending=False)
        
        # Return the sorted DataFrame and the dictionary of grid search results
        return df, self.grid_searches


### Split Train and Test Set

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    # Do not drop the target column here because it is needed for the pipeline
    df,
    df["Live birth occurrence"],
    test_size=0.2,
    random_state=0,
)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

Apply the data cleaning and feature engineering pipeline

In [None]:
X_train.head(3)

In [None]:
pipeline_data_cleaning_feat_eng = PreprocessingPipeline()
X_train = pipeline_data_cleaning_feat_eng.fit_transform(X_train)

In [None]:
print(X_train.columns)

In [None]:
X_train.head(3)

Apply the pipeline to the test set

In [None]:
X_test = pipeline_data_cleaning_feat_eng.transform(X_test)

Drop the target column from the processed X_train and X_test

In [None]:
X_train = X_train.drop(["Live birth occurrence"], axis=1)
X_test = X_test.drop(["Live birth occurrence"], axis=1)

In [None]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

Check Train Set Target distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

print(y_train.value_counts())

sns.set_style("whitegrid")
y_train.value_counts().plot(kind="bar", title="Train Set Target Distribution")
plt.show()

## Handle Target Imbalance

### Use SMOTE (Synthetic Minority Oversampling TEchnique) to balance Train Set target

In [None]:
from imblearn.over_sampling import SMOTE

oversample = SMOTE(sampling_strategy='minority', random_state=0)
X_train, y_train = oversample.fit_resample(X_train, y_train)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

Check Train Set Target distribution after resampling

In [None]:
import matplotlib.pyplot as plt

print(y_train.value_counts())

y_train.value_counts().plot(kind='bar', title='Train Set Target Distribution')
plt.show()

## Grid Search CV - Sklearn

### Use standard hyperparameters to find most suitable algorithm 

Set Up the Models and Parameters

In [None]:
models_quick_search = {
    "LogisticRegression": LogisticRegression(random_state=0),
    "XGBClassifier": XGBClassifier(random_state=0),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=0),
    "RandomForestClassifier": RandomForestClassifier(random_state=0),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=0),
    "ExtraTreesClassifier": ExtraTreesClassifier(random_state=0),
    "AdaBoostClassifier": AdaBoostClassifier(random_state=0),
    "XGBLogitBoost": XGBClassifier(objective="binary:logistic", random_state=0)
}

params_quick_search = {
    "LogisticRegression": {},
    "XGBClassifier": {},
    "DecisionTreeClassifier": {},
    "RandomForestClassifier": {},
    "GradientBoostingClassifier": {},
    "ExtraTreesClassifier": {},
    "AdaBoostClassifier": {},
    "XGBLogitBoost": {}
}

Define the custom scoring metrics

In [None]:
from sklearn.metrics import make_scorer, accuracy_score, recall_score, precision_score, f1_score, matthews_corrcoef, confusion_matrix

def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fp)

def npv_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fn)

# Dictionary of all the metrics
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'recall': make_scorer(recall_score, pos_label=0),
    'specificity': make_scorer(specificity_score),
    'precision': make_scorer(precision_score, pos_label=1),
    'npv': make_scorer(npv_score),
    'mcc': make_scorer(matthews_corrcoef),
    'f1': make_scorer(f1_score, pos_label=1)
}

Quick GridSearch CV - Binary Classifier

In [None]:
search = HyperparameterOptimizationSearch(models=models_quick_search, params=params_quick_search)

search.fit(X_train, y_train, cv=5, n_jobs=-1, scoring=scoring, refit='f1')

Check results

In [None]:
# Get the results and sort by the F1 score
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='f1', scoring=scoring)
grid_search_summary


### Perform extensive and comprehensive search on the most suitable algorithm to find the best hyperparameter configuration using GridSearchCV

Define model and parameters, for Extensive Search

In [None]:
models_search = {
    'RandomForestClassifier': RandomForestClassifier (),
}

params_search = {
    'RandomForestClassifier': {
        'model__n_estimators': [100, 200, 300],
        'model__max_depth': [10, 20, 30],
        'model__min_samples_leaf': [2, 5, 10],
        'model__min_samples_split': [10, 15, 20],
        'model__max_features': ['sqrt', 'log2', 0.75],
        'model__class_weight': [{0: 1, 1: 2}, "balanced", None], 
    }
}

Extensive GridSearch CV - Binary Classifier

In [None]:
print(y_train.value_counts())

search = HyperparameterOptimizationSearch(models=models_search, params=params_search)
search.fit(X_train, y_train, cv=5, n_jobs=-1, scoring=scoring, refit='f1')

Check results

In [None]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='f1', scoring=scoring)
grid_search_summary 

Get best model name programmatically

In [None]:
best_model = grid_search_summary.iloc[0,0]
best_model

Parameters for best model

In [None]:
best_parameters = grid_search_pipelines[best_model].best_params_
best_parameters

Define the best clf pipeline

In [None]:
pipeline_clf = grid_search_pipelines[best_model].best_estimator_
pipeline_clf

## Assess feature importance

In [None]:
X_train.head(3)

Assess feature importance on the current model with `.features_importances_`

In [None]:
# create DataFrame to display feature importance
df_feature_importance = pd.DataFrame(
    data={
        "Feature": X_train.columns[pipeline_clf["feat_selection"].get_support()],
        "Importance": pipeline_clf["model"].feature_importances_,
    }
).sort_values(by="Importance", ascending=False)

# re-assign best_features order
best_features = df_feature_importance["Feature"].to_list()

# Most important features statement and plot
print(
    f"* These are the {len(best_features)} most important features in descending order. "
    f"The model was trained on them: \n{df_feature_importance['Feature'].to_list()}"
)

df_feature_importance.plot(kind="bar", x="Feature", y="Importance")
plt.show()

## Evaluate Pipeline on Train and Test Sets

In [None]:
from sklearn.metrics import classification_report, confusion_matrix


def confusion_matrix_and_report(X, y, pipeline, label_map):

    prediction = pipeline.predict(X)

    print("---  Confusion Matrix  ---")
    print(
        pd.DataFrame(
            confusion_matrix(y_true=prediction, y_pred=y),
            columns=[["Actual " + sub for sub in label_map]],
            index=[["Prediction " + sub for sub in label_map]],
        )
    )
    print("\n")

    print("---  Classification Report  ---")
    print(classification_report(y, prediction, target_names=label_map), "\n")


def clf_performance(X_train, y_train, X_test, y_test, pipeline, label_map):
    print("#### Train Set #### \n")
    confusion_matrix_and_report(X_train, y_train, pipeline, label_map)

    print("#### Test Set ####\n")
    confusion_matrix_and_report(X_test, y_test, pipeline, label_map)

Evaluation:

In [None]:
clf_performance(X_train=X_train, y_train=y_train,
                X_test=X_test, y_test=y_test,
                pipeline=pipeline_clf,
                label_map= ['No Success', 'Success'] 
                )

---

## Refit pipeline with best features

### Refit ML Pipeline and Resampling

### Rewrite ML pipeline for Data Cleaning and Feature Engineering

In [None]:
best_features 

In [None]:
# Mapping dictionary for encoded columns to original names
encoded_to_original = {
    # Specific treatment type
    "Specific treatment type_IVF": "Specific treatment type",
    "Specific treatment type_ICSI:IVF": "Specific treatment type",
    "Specific treatment type_ICSI:Unknown": "Specific treatment type",
    "Specific treatment type_IVF:Unknown": "Specific treatment type",

    # Egg source
    "Egg source_Donor": "Egg source",

    # Sperm source
    "Sperm source_Partner": "Sperm source",

    # Patient ethnicity
    "Patient ethnicity_Other": "Patient ethnicity",
    "Patient ethnicity_Asian": "Patient ethnicity",
    "Patient ethnicity_Black": "Patient ethnicity",
    "Patient ethnicity_Mixed": "Patient ethnicity",

    # Partner ethnicity
    "Partner ethnicity_White": "Partner ethnicity",
    "Partner ethnicity_Mixed": "Partner ethnicity",
    "Partner ethnicity_Any other ethnicity": "Partner ethnicity",

    # Date of embryo transfer
    "Date of embryo transfer_5 - fresh": "Date of embryo transfer",
    "Date of embryo transfer_3 - fresh": "Date of embryo transfer",
    "Date of embryo transfer_2 - fresh": "Date of embryo transfer",
    "Date of embryo transfer_NT": "Date of embryo transfer",
    "Date of embryo transfer_4 - fresh": "Date of embryo transfer",
    "Date of embryo transfer_6 - fresh": "Date of embryo transfer",
    "Date of embryo transfer_1 - frozen": "Date of embryo transfer",
    "Date of embryo transfer_Missing": "Date of embryo transfer",
    "Date of embryo transfer_4 - frozen": "Date of embryo transfer",
    "Date of embryo transfer_3 - frozen": "Date of embryo transfer",
    "Date of embryo transfer_2 - frozen": "Date of embryo transfer",
    "Date of embryo transfer_6 - frozen": "Date of embryo transfer",
    "Date of embryo transfer_0 - fresh": "Date of embryo transfer",
    "Date of embryo transfer_1 - fresh": "Date of embryo transfer",
    "Date of embryo transfer_5 - frozen": "Date of embryo transfer",
    "Date of embryo transfer_7 - fresh": "Date of embryo transfer",
    "Date of embryo transfer_2 - Mixed fresh/frozen": "Date of embryo transfer",
    "Date of embryo transfer_5 - Mixed fresh/frozen": "Date of embryo transfer",
    "Date of embryo transfer_6 - Mixed fresh/frozen": "Date of embryo transfer",
    "Date of embryo transfer_7 - frozen": "Date of embryo transfer",
}

def reassign_best_features(features, mapping, df=None):
    """
    Reassigns encoded feature names to their original names and removes duplicated columns in DataFrame.
    """
    # Reassign encoded feature names to original names
    reassigned_features = [mapping.get(feature, feature) for feature in features]

    # If a DataFrame is provided, check for duplicated columns
    if df is not None:
        duplicated_columns = df.columns[df.columns.duplicated()].tolist()
        if duplicated_columns:
            print(f"Duplicated columns detected: {duplicated_columns}")
            # Remove duplicated columns (keep first occurrence)
            df = df.loc[:, ~df.columns.duplicated()]
            print("Duplicated columns removed.")
        else:
            print("No duplicated columns detected.")
        return reassigned_features, df
    
    return reassigned_features




## Split Train Test Set, considering only best features

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['Live birth occurrence'], axis=1),
    df['Live birth occurrence'],
    test_size=0.2,
    random_state=0,
)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

Reassign best features to match the encoded columns used in the pipeline

In [None]:
reassigned_best_features = reassign_best_features(best_features, encoded_to_original)

New Pipeline for DataCleaning And FeatureEngineering considering only the best features and without Smart Correlation.

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from feature_engine.encoding import OneHotEncoder, OrdinalEncoder


def PipelineBestFeatures():
    pipeline_base = Pipeline(
        [
            (
                "ordinal_encoding",
                OrdinalEncoder(
                    encoding_method="arbitrary",
                    variables=[
                        "Patient age at treatment",
                        "Partner/Sperm provider age",
                        "Total number of previous IVF cycles",
                        "Fresh eggs collected",
                        "Embryos transferred",
                    ],
                ),
            ),
            (
                "one_hot_encoding",
                OneHotEncoder(
                    variables=[
                        "Date of embryo transfer",
                        "Partner ethnicity"
                    ]
                ),
            ),
        ]
    )

    return pipeline_base

PipelineBestFeatures()

### Rewrite ML Pipeline for Modelling

Function for Pipeline optmisation

In [None]:
# Pipeline Optmization: Model
def PipelineClf(model):
    pipeline_base = Pipeline([
        ("scaler", StandardScaler()),
        # no feature selection needed anymore since it is already known which features to use
        ("model", model),
    ])

    return pipeline_base


Filter only the most important variables

In [None]:
# Apply the data cleaning and feature engineering pipeline
pipeline_data_cleaning_feat_eng = PipelineBestFeatures()
X_train = pipeline_data_cleaning_feat_eng.fit_transform(X_train)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
X_train.head(3)

### Handle Target Imbalance

Apply the data cleaning and feature engineering pipeline

Apply the pipeline to the test set

In [None]:
X_test = pipeline_data_cleaning_feat_eng.transform(X_test)


Remove duplicates after transformation

In [None]:

reassigned_best_features, X_train = reassign_best_features(reassigned_best_features, encoded_to_original, X_train)

Filter the DataFrame to match the best features

In [None]:

X_train = X_train.filter(reassigned_best_features)
X_test = X_test.filter(reassigned_best_features)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
X_train.head(3)

In [None]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

Check Train Set Target distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

print(y_train.value_counts())

sns.set_style("whitegrid")
y_train.value_counts().plot(kind="bar", title="Train Set Target Distribution")
plt.show()

Use SMOTE to balance Train Set target

In [None]:
from imblearn.over_sampling import SMOTE

oversample = SMOTE(sampling_strategy='minority', random_state=0)
X_train, y_train = oversample.fit_resample(X_train, y_train)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

Check Train Set Target distribution after SMOTE

In [None]:
y_train.value_counts().plot(kind='bar',title='Train Set Target Distribution')
plt.show()

### Grid Search CV: Sklearn

Using the most suitable model from the last section and its best hyperparameter configuration.

We are using the same model from  the last GridCV search

In [None]:
models_search 

And the best parameters from the last GridCV search 

In [None]:
best_parameters

You will need to type in manually since the hyperparameter values have to be a list. The previous dictionary is not in this format.

In [None]:
params_search = {
    "RandomForestClassifier": {
        "model__class_weight": [{0: 1, 1: 2}],
        "model__max_depth": [20],
        "model__max_features": [0.75],
        "model__min_samples_leaf": [2],
        "model__min_samples_split": [10],
        "model__n_estimators": [300],
    }
}
params_search

GridSearch CV

In [None]:
search = HyperparameterOptimizationSearch(models=models_search, params=params_search)
search.fit(X_train, y_train, cv=5, n_jobs=-1, scoring=scoring, refit="f1")

Check results

In [None]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='f1', scoring=scoring)
grid_search_summary

Define the best clf pipeline

In [None]:
best_model = grid_search_summary.iloc[0, 0]
pipeline_clf = grid_search_pipelines[best_model].best_estimator_
pipeline_clf

### Assess feature importance

In [None]:
best_features = X_train.columns

# create DataFrame to display feature importance
df_feature_importance = (pd.DataFrame(data={
    'Feature': best_features,
    'Importance': pipeline_clf['model'].feature_importances_})
    .sort_values(by='Importance', ascending=False)
)


# Most important features statement and plot
print(f"* These are the {len(best_features)} most important features in descending order. "
      f"The model was trained on them: \n{df_feature_importance['Feature'].to_list()}")

df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.show()


### Evaluate Pipeline on Train and Test Sets

Evaluation: We cross-check with metrics defined in the ML business case.
* 70% Recall for No Success, on train and test set.
* 70% Precision for Success on train and test set. 

In [None]:
clf_performance(X_train=X_train, y_train=y_train,
                X_test=X_test, y_test=y_test,
                pipeline=pipeline_clf,
                label_map= ['No Success', 'Success'] 
                )

## Push files to Repo

Generate the following files
* Train set
* Test set
* Data cleaning and Feature Engineering pipeline
* Modeling pipeline
* features importance plot

In [None]:
import joblib
import os

version = 'v1'
file_path = f'outputs/ml_pipeline/ivf_success_predictor/{version}'

try:
    # Check if the directories exist
    if os.path.exists(file_path):
        print('Old version is already available. Please create a new version.')
    else:
        # Create the directory if it does not exist
        os.makedirs(name=file_path)
        print(f"Directory {file_path} created successfully.")
except Exception as e:
    print(f"An error occurred: {e}")

### Train Set

* note that the variables **are transformed already** in X_train after SMOTE was applied.

In [None]:
print(X_train.shape)
X_train.head()

In [None]:
X_train.to_csv(f"{file_path}/X_train.csv", index=False)

In [None]:
y_train

In [None]:
y_train.to_csv(f"{file_path}/y_train.csv", index=False)

### Test Set

* note that the variables are transformed already in X_test

In [None]:
print(X_test.shape)
X_test.head()

In [None]:
X_test.to_csv(f"{file_path}/X_test.csv", index=False)

In [None]:
y_test

In [None]:
y_test.to_csv(f"{file_path}/y_test.csv", index=False)

#### Save Best features (to be used for generating the widgets on the live predition page)

In [None]:
best_features

In [152]:
reassigned_best_features

['Date of embryo transfer',
 'Partner/Sperm provider age',
 'Embryos transferred',
 'Total number of previous IVF cycles',
 'Patient age at treatment',
 'Fresh eggs collected',
 'Total number of previous pregnancies - IVF and DI',
 'Embryos transferred from eggs micro-injected',
 'Partner ethnicity',
 'Causes of infertility - male factor',
 'Causes of infertility - patient unexplained',
 'Date of embryo transfer']

In [153]:
# Create a DataFrame
reassigned_best_features_df = pd.DataFrame(reassigned_best_features, columns=['feature']) 

reassigned_best_features_df = reassigned_best_features_df.drop_duplicates()

reassigned_best_features_df.to_csv(f'{file_path}/best_features.csv', index=False)

print(f"Best features saved to {file_path}/best_features.csv")

Best features saved to outputs/ml_pipeline/ivf_success_predictor/v1/best_features.csv


### ML Pipelines: Data Cleaning and Feat Eng pipeline and Modelling Pipeline

Both pipelines should be used in conjunction to predict Live Data.

* To predict on Train Set, Test Set we use only pipeline_clf, since the data is already processed.

Pipeline responsible for Data Cleaning and Feature Engineering.

In [None]:
pipeline_data_cleaning_feat_eng

In [None]:
joblib.dump(value=pipeline_data_cleaning_feat_eng ,
            filename=f"{file_path}/clf_pipeline_data_cleaning_feat_eng.pkl")

  Pipeline responsible for Feature Scaling, and Model

In [None]:
pipeline_clf

### Save the model as pkl compressed

In [None]:
import joblib
import gzip

# Save the model directly into a compressed gzip file
with gzip.open(f"{file_path}/clf_pipeline_model.pkl.gz", 'wb') as f_out:
    joblib.dump(pipeline_clf, f_out)

print(f"Model compressed and saved at: {file_path}/clf_pipeline_model.pkl.gz")

### Feature Importance plot

In [None]:
df_feature_importance.plot(kind='bar',x='Feature',y='Importance')
plt.show()

In [None]:
df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.savefig(f'{file_path}/features_importance.png', bbox_inches='tight')

---