# Import

Import all necessary packages

In [1]:
import pandas as pd
import numpy as np
import joblib

import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
import time


from sklearn.model_selection import train_test_split

from scipy.stats import zscore

from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, plot_roc_curve, plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import xgboost as xgb

# Load & Split Dataset 

Load and split dataset:
<li>- Make each processing step as a function</li>
<li>- Make main funtion</li>
<li>- Create parameters to load & split the data</li>

In [2]:
def read_data(path,
              set_index = None):
    '''
    Read data from data folder in csv format.
    
    Parameters
    ----------
    path: str
          path to data
    
    '''
    
    data = pd.read_csv(path, index_col = set_index)

    return data

def split_input_output(dataset,
                       target_column):
    '''
    Function to split input and output data
    '''
    output_df = dataset[target_column]
    input_df = dataset.drop([target_column],
                            axis = 1)

    return output_df, input_df

In [3]:
def exclude_di(input_data):
    '''
    Function to load only IVF data
    '''
    data = input_data.copy()
    data = data[data['Type of treatment - IVF or DI'] != 'DI']
    
    return data

def select_feats(input_data, selected_cols):
    '''
    Function to select only the necessary feature
    '''
    data = input_data.copy()
    data = data[selected_cols]
    
    return data

def main_data(input_data, selected_cols):
    '''
    Main function to load data
    '''
    data = exclude_di(input_data)
    data = select_feats(data, selected_cols)
    
    return data

def impute_target(input_data, target_col):
    '''
    Function to impute target variable
    '''
    data = input_data.copy()
    data[target_col] = data[target_col].fillna(0)
    
    return data

In [4]:
def split_train_test(x, y, TEST_SIZE):
    # Do not forget to stratify if classification
    x_train, x_test,\
        y_train, y_test = train_test_split(x,
                                           y,
                                           test_size=TEST_SIZE,
                                           random_state=123,
                                           stratify=y)

    return x_train, x_test, y_train, y_test


def split_data(data_input, data_ouput, TEST_SIZE=0.2):

    x_train, x_test, \
        y_train, y_test = split_train_test(
            data_input,
            data_ouput,
            TEST_SIZE)

    x_train, x_valid, \
        y_train, y_valid = split_train_test(
            x_train,
            y_train,
            TEST_SIZE)

    return x_train, y_train, \
        x_valid, y_valid, \
        x_test, y_test

In [5]:
params = {'file_loc1': '../data/ar-2010-2014-csv.csv',
          'file_loc2': '../data/ar-2015-2016-csv.csv',
          'target_column': 'Live Birth Occurrence',
         'feats': ['Patient Age at Treatment',
        'Total Number of Previous IVF cycles',
        'Total number of IVF pregnancies',
        'Total number of live births - conceived through IVF',
        'Type of Infertility - Female Primary',
        'Type of Infertility - Female Secondary',
        'Type of Infertility - Male Primary',
        'Type of Infertility - Male Secondary',
        'Type of Infertility -Couple Primary',
        'Type of Infertility -Couple Secondary',
        'Cause  of Infertility - Tubal disease',
        'Cause of Infertility - Ovulatory Disorder',
        'Cause of Infertility - Male Factor',
        'Cause of Infertility - Patient Unexplained',
        'Cause of Infertility - Endometriosis',
        'Cause of Infertility - Cervical factors',
        'Cause of Infertility - Female Factors',
        'Cause of Infertility - Partner Sperm Concentration',
        'Cause of Infertility -  Partner Sperm Morphology',
        'Causes of Infertility - Partner Sperm Motility',
        'Cause of Infertility -  Partner Sperm Immunological factors',
        'Stimulation used',
        'Egg Source',
        'Sperm From', 
        'Fresh Cycle', 
        'Frozen Cycle', 
        'Eggs Thawed',
        'Fresh Eggs Collected', 
        'Eggs Mixed With Partner Sperm',
        'Embryos Transfered',
        'Live Birth Occurrence'],
        'test_size': 0.2,
        'out_path': '../output/'
         }

## Main Function

Main function for loading and splitting dataset:
<li>- Input & output data split</li>
<li>- Train-Valid-Test split with stratification</li>

In [6]:
def main_load(params):
    df1 = read_data(params['file_loc1'])
    df2 = read_data(params['file_loc2'])
    data = pd.concat([df1, df2])
    data = main_data(data,params['feats'])
    data = impute_target(data,params['target_column'])

    output_df, input_df = split_input_output(data,
                                         params['target_column'])

    X_train, y_train, X_valid, y_valid, X_test, y_test = split_data(input_df,
                                                                output_df,
                                                                params['test_size'])
    
    joblib.dump(X_train, params["out_path"]+"x_train.pkl")
    joblib.dump(y_train, params["out_path"]+"y_train.pkl")
    joblib.dump(X_valid, params["out_path"]+"x_valid.pkl")
    joblib.dump(y_valid, params["out_path"]+"y_valid.pkl")
    joblib.dump(X_test, params["out_path"]+"x_test.pkl")
    joblib.dump(y_test, params["out_path"]+"y_test.pkl")
    
    return X_train, y_train, X_valid, y_valid, X_test, y_test

In [7]:
x_train, y_train, x_valid, y_valid, x_test, y_test = main_load(params)

In [8]:
y_test.shape

(92672,)

In [9]:
# check if stratify is correct
print(y_train.value_counts(normalize = True))
print('--------------')
print(y_valid.value_counts(normalize = True))
print('--------------')
print(y_test.value_counts(normalize = True))

0.0    0.751784
1.0    0.248216
Name: Live Birth Occurrence, dtype: float64
--------------
0.0    0.751787
1.0    0.248213
Name: Live Birth Occurrence, dtype: float64
--------------
0.0    0.751791
1.0    0.248209
Name: Live Birth Occurrence, dtype: float64


# Data Preprocessing & Engineering

<b>Data preprocessing:</b>

<li>- Make each processing step as a function</li>
<li>- Make main function</li>
<li>- Create parameters for preprocessing</li>
<li></li>
<b>Feature engineering:</b>

<li>- Generate dummy variable</li>
<li>- Normalize the data using zscore</li>
<li>- Undersampling for training data</li>

In [10]:
def to_numeric(input_data, do=True):
    '''
    Function to convert string to numerical data
    '''
    data = input_data.copy()
    
    # replace '> 50' with 51
    data['Fresh Eggs Collected'] = data['Fresh Eggs Collected'].replace(['> 50'],[51])
    data['Eggs Mixed With Partner Sperm'] = data['Eggs Mixed With Partner Sperm'].replace(['> 50'],[51])
    
    # replace '>=5' with 6
    data['Total Number of Previous IVF cycles'] = data['Total Number of Previous IVF cycles'].replace(['>=5'],[6])
    data['Total number of IVF pregnancies'] = data['Total number of IVF pregnancies'].replace(['>=5'],[6])
    
    # convert to numerical data
    data['Fresh Eggs Collected'] = pd.to_numeric(data['Fresh Eggs Collected'])
    data['Eggs Mixed With Partner Sperm'] = pd.to_numeric(data['Eggs Mixed With Partner Sperm'])
    data['Total Number of Previous IVF cycles'] = pd.to_numeric(data['Total Number of Previous IVF cycles'])
    data['Total number of IVF pregnancies'] = pd.to_numeric(data['Total number of IVF pregnancies'])
    
    return data

def replace_age(input_data, cats, do=True):
    '''
    Function to categorize age input
    '''
    data = input_data.copy()
    data.drop(data[data['Patient Age at Treatment'] == '999'].index, inplace = True)
    data['Patient Age at Treatment'] = data['Patient Age at Treatment'].replace(cats)
    
    return data

def get_dummies(input_data, col, do=True):
    '''
    Function to generate dummy variable
    '''
    data = input_data.copy()
    if 'Sperm From' in input_data.columns:
        data = pd.get_dummies(data, columns=col, prefix=col)
    
    else:
        input_data
    
    return data

def replace_eggsrc(input_data, do=True):
    '''
    Function to categorize egg source input
    '''
    data = input_data.copy()
    data['Egg Source'] = data['Egg Source'].replace(['Patient','Donor'],[0,1])
    
    return data

def remove_cols(input_data, cols, do=True):
    '''
    Function to remove unecessary columns
    '''
    data = input_data.copy()
    data = data.drop(columns=cols)
    
    return data

def undersampling(x_train, y_train):
    '''
    Function to undersampling train data
    '''
    undersample = RandomUnderSampler(sampling_strategy='majority')
    X_train, y_train = undersample.fit_resample(x_train, y_train)
    
    return X_train, y_train

In [11]:
# compile all dict & lists needed
params_preprocessing = {'age_replace': {'18 - 34':0, '35-37':1, '38-39':2, '40-42':3, '43-44':4, '45-50':5},
                        'to_dummy': ['Sperm From'],
                        'to_remove': ['Cause of Infertility - Female Factors',
                                      'Cause of Infertility -  Partner Sperm Immunological factors',
                                      'Type of Infertility -Couple Primary', 
                                      'Type of Infertility - Male Primary', 
                                      'Frozen Cycle', 'Fresh Cycle', 
                                      'Sperm From_Partner', 
                                      'Total number of live births - conceived through IVF', 
                                      'Eggs Mixed With Partner Sperm'],
                       'to_numeric': True,
                        'replace_age': True,
                        'get_dummies': True,
                        'replace_eggsrc': True,
                        'remove_cols': True,
                        'undersampling': True,
                       'out_path': '../output/'}

In [12]:
def preprocess(input_data,params):
    """
    A function to execute the preprocessing steps.
    
    Args:
    - df_in(DataFrame): Input dataframe
    - params(dict): preprocessing parameters
    
    Return:
    - df(DataFrame): preprocessed data
    """
    data = input_data.copy()
    data = to_numeric(data, params['to_numeric'])
    data = replace_age(data, params['age_replace'], params['replace_age'])
    data = get_dummies(data, params['to_dummy'], params['get_dummies'])
    data = replace_eggsrc(data, params['replace_eggsrc'])
    data = remove_cols(data, params['to_remove'], params['remove_cols'])
    
    return data

In [13]:
def main_eng(x_train, y_train, x_valid, y_valid, x_test, y_test, params):
    
    # concat data
    df_train = pd.concat([x_train, pd.DataFrame(y_train)], axis = 1)
    df_valid = pd.concat([x_valid, pd.DataFrame(y_valid)], axis = 1)
    df_test = pd.concat([x_test, pd.DataFrame(y_test)], axis = 1)
    
    df_list = [df_train, df_valid, df_test]
    df_preprocessed = []
    
    for x in df_list:
        temp = preprocess(x, params)
        df_preprocessed.append(temp)
        
    X_train_ready = df_preprocessed[0].drop(columns=['Live Birth Occurrence'], axis=1)
    y_train_ready = df_preprocessed[0]['Live Birth Occurrence']
    X_valid_ready = df_preprocessed[1].drop(columns=['Live Birth Occurrence'], axis=1)
    y_valid_ready = df_preprocessed[1]['Live Birth Occurrence']
    X_test_ready = df_preprocessed[2].drop(columns=['Live Birth Occurrence'], axis=1)
    y_test_ready = df_preprocessed[2]['Live Birth Occurrence']
    
    X_train_ready = zscore(X_train_ready)
    X_valid_ready = zscore(X_valid_ready)
    X_test_ready = zscore(X_test_ready)
    X_train_ready, y_train_ready = undersampling(X_train_ready, y_train_ready)
    
    
    joblib.dump(X_train_ready, params["out_path"]+"X_train_ready.pkl")
    joblib.dump(y_train_ready, params["out_path"]+"y_train_ready.pkl")
    joblib.dump(X_valid_ready, params["out_path"]+"X_valid_ready.pkl")
    joblib.dump(y_valid_ready, params["out_path"]+"y_valid_ready.pkl")
    joblib.dump(X_test_ready, params["out_path"]+"X_test_ready.pkl")
    joblib.dump(y_test_ready, params["out_path"]+"y_test_ready.pkl")
    
    return X_train_ready, y_train_ready, X_valid_ready, y_valid_ready, X_test_ready, y_test_ready 

In [14]:
X_train_ready, y_train_ready, X_valid_ready, y_valid_ready, X_test_ready, y_test_ready = main_eng(x_train, 
                                                                                                  y_train, 
                                                                                                  x_valid, 
                                                                                                  y_valid, 
                                                                                                  x_test, 
                                                                                                  y_test,
                                                                                                 params_preprocessing)

In [15]:
print('x_train_ready: ',X_train_ready.shape)
print('x_valid_ready: ',X_valid_ready.shape)
print('x_test_ready: ',X_test_ready.shape)

x_train_ready:  (146466, 24)
x_valid_ready:  (73057, 24)
x_test_ready:  (91401, 24)


# Modeling

Searching for the best model and hyperparameter:
<li>- Generate baseline model (Logistic Regression, Random Forest, Decision Tree, XGBoost)</li>
<li>- Select the best model</li>
<li>- Hyperparameter tuning</li>

In [16]:
def select_model(X_train,y_train,X_valid,y_valid, params):
    
    logreg = LogisticRegression
    rf = RandomForestClassifier
    tree = DecisionTreeClassifier
    XGB_ = xgb.XGBClassifier
    
    train_log_dict = {'model': [logreg(), rf(), tree(), XGB_()],
                      'for_tuning': [logreg, rf, tree, XGB_], 
                      'model_name': [],
                      'model_fit': [],
                      'model_score': []}
    
    #try
    for model in train_log_dict['model']:
        base_model = model
        train_log_dict['model_name'].append(base_model.__class__.__name__)
    
    for model in train_log_dict['model']:
        base_model = model
        train_log_dict['model_fit'].append(base_model.fit(X_train,y_train))
    
    for model in train_log_dict['model_fit']:
        fitted_model = model
        train_log_dict['model_score'].append((2*(roc_auc_score(y_train, fitted_model.predict_proba(X_train)[:, 1])))-1)
        
    best_model_index = train_log_dict['model_score'].index(max(train_log_dict['model_score']))
    best_model = train_log_dict['model'][best_model_index]
    best_model_ = train_log_dict['for_tuning'][best_model_index]
                                           
    
    print("Gini Performance Evaluation\n")
    print(f"Logistic Regression Gini : {train_log_dict['model_score'][0]}")
    print(f"Random Forest Gini       : {train_log_dict['model_score'][1]}")
    print(f"Decision Tree Gini       : {train_log_dict['model_score'][2]}")
    print(f"XGBoost Gini : {train_log_dict['model_score'][3]}")
    print('')
    print(f"Best Model : {best_model}")
   
    #hyperparameter tuning
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=69)

    # Define search space
    space_tree = dict()

    space_tree['max_depth'] = [2, 3, 5, 10, 20] #DT
    space_tree['min_samples_leaf'] = [5, 10, 20, 50, 100] #DT
    space_tree['criterion'] = ["gini", "entropy"] #DT

    # Define search
    search_tree = RandomizedSearchCV(tree(), space_tree, n_iter=30, scoring='roc_auc', n_jobs=30, cv=cv, random_state=69)

    # Execute search
    result_tree = search_tree.fit(X_train, y_train)
    
    best_params = {'max_depth': result_tree.best_params_['max_depth'],
                  'min_samples_leaf': result_tree.best_params_['min_samples_leaf'],
                  'criterion': result_tree.best_params_['criterion']}

    print('Best Score tree: %s' % ((result_tree.best_score_ * 2) - 1))
    print('Best Hyperparameters: %s' % result_tree.best_params_)
    
    model_ = best_model_(max_depth= best_params['max_depth'], min_samples_leaf = best_params['min_samples_leaf'], criterion = best_params['criterion']).fit(X_train,y_train)
    
    def evaluate(true,predicted):
        f1 = f1_score(true,predicted)
        roc_auc = roc_auc_score(true,predicted)
    
        return f1,roc_auc
    
    f1, roc_auc = evaluate(y_valid, model_.predict(X_valid))
    
    print("F1 Score: ", f1)
    print("ROC AUC Score: ", roc_auc)
    
    joblib.dump(model_, params['out_path']+'best_model.pkl')
    joblib.dump(train_log_dict, params['out_path']+'train_log.pkl')
    
    return model_

In [17]:
param_model={'out_path': "../model/"}

In [18]:
best_model = select_model(X_train_ready,y_train_ready,X_valid_ready,y_valid_ready, param_model)

Gini Performance Evaluation

Logistic Regression Gini : 0.3515170835691861
Random Forest Gini       : 0.6278679455629468
Decision Tree Gini       : 0.6325256821463663
XGBoost Gini : 0.45786356768610026

Best Model : DecisionTreeClassifier()
Best Score tree: 0.41790244315588154
Best Hyperparameters: {'min_samples_leaf': 100, 'max_depth': 10, 'criterion': 'gini'}
F1 Score:  0.48928915210546065
ROC AUC Score:  0.6529007299702863


# Prediction

Prediction function with dataframe as an input:

In [19]:
def df_constructor(input):
    df = pd.DataFrame(input, index=[0])
    return df

In [20]:
data = {'Patient Age at Treatment': '18 - 34',
        'Total Number of Previous IVF cycles': '0',
        'Total number of IVF pregnancies': 0,                               
        'Total number of live births - conceived through IVF': 0,    
        'Type of Infertility - Female Primary':0,                         
        'Type of Infertility - Female Secondary':0,                        
        'Type of Infertility - Male Primary':0,                          
        'Type of Infertility - Male Secondary':0,                      
        'Type of Infertility -Couple Primary':0,                        
        'Type of Infertility -Couple Secondary':0,                     
        'Cause  of Infertility - Tubal disease':1,                     
        'Cause of Infertility - Ovulatory Disorder':0,                 
        'Cause of Infertility - Male Factor':0,                         
        'Cause of Infertility - Patient Unexplained':0,                  
        'Cause of Infertility - Endometriosis':0,                        
        'Cause of Infertility - Cervical factors':0,                     
        'Cause of Infertility - Female Factors':0,                      
        'Cause of Infertility - Partner Sperm Concentration':0,        
        'Cause of Infertility -  Partner Sperm Morphology':0,          
        'Causes of Infertility - Partner Sperm Motility':0,              
        'Cause of Infertility -  Partner Sperm Immunological factors':0,   
        'Stimulation used':0,                                         
        'Egg Source': 'Patient',                                                                                           
        'Fresh Cycle': 0,                                               
        'Frozen Cycle':1,                                                
        'Eggs Thawed':0,                                            
        'Fresh Eggs Collected': '14',                                       
        'Eggs Mixed With Partner Sperm':'14',                              
        'Embryos Transfered':2,  
        'Sperm From_Donor':1,                                   
        'Sperm From_Partner & Donor':0,                          
        'Sperm From_not assigned':0}

In [21]:
# compile all dict & lists needed
params_predict = {'age_replace': {'18 - 34':0, '35-37':1, '38-39':2, '40-42':3, '43-44':4, '45-50':5},
                        'to_dummy': ['Sperm From'],
                        'to_remove': ['Cause of Infertility - Female Factors',
                                      'Cause of Infertility -  Partner Sperm Immunological factors',
                                      'Type of Infertility -Couple Primary', 
                                      'Type of Infertility - Male Primary', 
                                      'Frozen Cycle', 'Fresh Cycle', 
                                      'Total number of live births - conceived through IVF', 
                                      'Eggs Mixed With Partner Sperm'],
                       'to_numeric': True,
                        'replace_age': True,
                        'get_dummies': True,
                        'replace_eggsrc': True,
                        'remove_cols': True,
                        'undersampling': True,
                       'out_path': '../output/'}

In [22]:
def main_predict(data, model, params_preprocess):
    df = df_constructor(data)
    df_preprocessed = preprocess(df, params_preprocess)
    
    code2rel = {0: 'Not Occured', 1: 'Occured'}
    proba = model.predict_proba(df_preprocessed)[:,1]
    predict = 1 if proba > 0.5 else 0
    
    return code2rel[predict], proba

In [23]:
model = joblib.load('../model/best_model.pkl')

In [24]:
predict, proba = main_predict(data,model,params_predict)

In [25]:
predict

'Occured'

In [26]:
proba

array([0.60487805])

In [27]:
y_train[0]

1.0