In [None]:
def tree_binning(df, train, target, var, model='clf', max_depth=3, add_missing=True):
    """
    Bin a numerical variable with a decision tree fitted on ``train`` and plot
    the binned variable against the target so the monotonic relation can be
    checked visually. Change ``max_depth`` (e.g. 2, 3, 4) to look for a depth
    that yields a monotonic relation.

    Parameters:
    -----------
    df : Dataframe
    The dataset in which the variable is binned (e.g. train or test).
    A ``<var>_bin`` column is added to this dataframe in place.

    train : Dataframe
    The training data used to fit the tree and derive the bins.

    target : str
    The target variable name.

    var : str
    The name of the numerical variable to be binned.

    model : str , default 'clf'
    Type of problem - only affects the target scaling and the plot labels.
    For classification related analysis use 'classification' or 'clf'.
    For regression related analysis use 'regression' or 'reg'.
    NOTE(review): a DecisionTreeClassifier is fitted whatever this value is.

    max_depth : int , default 3
    The depth of the decision tree.
    After binning, if the relation is not monotonic, try 2 or 4.

    add_missing : boolean , default True
    If True, rows without a bin are labelled 'Missing' in the plotted summary
    (the returned dataframe keeps them as NaN).

    Returns:
    --------
        ``df`` with the additional ``<var>_bin`` categorical column.

    Notes:
    ------
    The bins are the closed ``[min, max]`` ranges of ``var`` observed in
    ``train`` for each distinct predicted probability, with the upper limit of
    the last bin set to infinity. Values of ``df[var]`` that fall outside every
    range (below the training minimum or in a gap between two ranges) are not
    binned. ``pd.cut`` rejects overlapping intervals, so ranges that overlap
    raise ``ValueError``.
    """
    tree_col = var + '_tree'
    min_col = var + '_min'
    max_col = var + '_max'
    bin_col = var + '_bin'

    # Fit the tree on the complete cases and keep the predicted probability
    # of class 1 for every training row.
    tree_model = DecisionTreeClassifier(max_depth=max_depth)
    df1 = train[[target, var]].copy()
    df1.dropna(inplace=True)
    features = df1[var].to_frame()
    tree_model.fit(features, df1[target])
    df1[tree_col] = tree_model.predict_proba(features)[:, 1]
    print('\n\n The Probability of events are : \n')
    print(df1[tree_col].unique())
    tree.plot_tree(tree_model)

    # Creating the bins based on the predicted probability values
    print('\n \n The Bins created are : \n')
    df_bin = df1.groupby(tree_col)[var].agg(['min', 'max'])
    df_bin.columns = [min_col, max_col]
    df_bin = df_bin.sort_values(by=min_col)
    print(df_bin)

    # Open the last bin to +inf. The column is cast to float first so that inf
    # is representable, and the cell is written through the frame itself: a
    # chained ``df_bin[col].iloc[-1] = ...`` may only modify a temporary copy.
    df_bin[max_col] = df_bin[max_col].astype(float)
    df_bin.iloc[-1, df_bin.columns.get_loc(max_col)] = np.inf
    print('\n\n The Bins after including right limit as infinity : \n')
    print(df_bin)

    # closed='both' keeps the observed minimum of every bin inside that bin.
    # With right-closed intervals those rows would be left without a bin, and
    # pd.cut does not apply include_lowest when given an IntervalIndex.
    bins = pd.IntervalIndex.from_tuples(
        list(zip(df_bin[min_col], df_bin[max_col])), closed='both'
    )
    print('\n\nThe bins created are : \n')
    print(bins)
    df[bin_col] = pd.cut(df[var], bins)

    # Checking for the monotonic relation with the target variable after binning
    eda_df = df.copy(deep=True)

    if add_missing:
        eda_df[bin_col] = eda_df[bin_col].cat.add_categories('Missing').fillna('Missing')

    # The helper's result must provide the bin column, the target column and a
    # 'perc' column, all of which are read below.
    plot_df = calculate_mean_target_per_category(eda_df, bin_col, target)
    plot_df.sort_index(axis=0, inplace=True)
    cat_order = list(plot_df[bin_col])

    # Tuple membership: ``model in ('clf' or 'classification')`` collapses to a
    # substring test against 'clf' and never matches the long names.
    is_classification = model in ('clf', 'classification')
    is_regression = model in ('reg', 'regression')

    if is_classification:
        # Express the event rate as a percentage.
        plot_df[target] = 100 * plot_df[target]

    fig, ax = plt.subplots(figsize=(12, 4))
    plt.xticks(plot_df.index, plot_df[bin_col], rotation=90)

    # Grey bars: the 'perc' value of each bin.
    ax.bar(plot_df.index, plot_df['perc'], align='center', color='lightgrey')

    # Green points on a secondary axis: target value per bin.
    ax2 = ax.twinx()
    ax2 = sns.pointplot(data=plot_df, x=bin_col, y=target, order=cat_order,
                        color='green', ax=ax2)

    if is_classification:
        ax.set_title(f'Event Rate of target ({target}) within each category of variable ({var})', fontsize=17)
        ax2.set_ylabel("Perc of Events within Category", fontsize=14)
        ax.set_ylabel('Perc of Categories', fontsize=14)
    elif is_regression:
        ax.set_title(f'Mean value of target ({target}) within each category of variable ({var})', fontsize=17)
        ax2.set_ylabel('Mean Target Value', fontsize=14)
        ax.set_ylabel('Perc of Categories', fontsize=14)

    plt.show()

    display_all(plot_df.set_index(bin_col).transpose())

    return df

In [None]:
# Bin 'D1' in the training sample, deriving the bins from that same sample.

train_sample_raw = tree_binning(
    df=train_sample_raw,
    train=train_sample_raw,
    target='isFraud',
    var='D1',
)

In [None]:
# Bin 'D1' in the test sample, deriving the bins from the training sample.

test_sample_raw = tree_binning(
    df=test_sample_raw,
    train=train_sample_raw,
    target='isFraud',
    var='D1',
)