The code below calculates a feature score from the feedback data for the values of a categorical field. It is very similar to Weight of Evidence (WOE) and Information Value (IV), but uses a custom implementation. It provides a sample graph and a table that recommend whether a field value is suitable to be used as a feature.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import clear_output, display

# Clear this cell's output area once the imports have run
# (presumably to hide import-time warnings/messages — TODO confirm).
clear_output()


# This function expects target column values to be in 0/1 format.
# A value is "critical" when it accounts for >= min_criticality % of the rows in the data.
# A value is "certain" when the difference between its two target counts is >= min_certainty % of its own row count.
def cat_value_significance(df, cat_column, target_column, min_criticality=5, min_certainty=60, min_feature_score=5):
  """Score each value of a categorical column against a 0/1 target and display the result.

  For every distinct value of ``cat_column`` the function computes:

  * ``Criticality`` - share of all rows in ``df`` that carry the value, in percent.
  * ``Certainty`` - absolute difference between the value's target==1 and
    target==0 counts, relative to the value's own row count, in percent.
  * ``Feature Value Score`` - ``Certainty * Criticality / 100``.

  Only values whose criticality is at least ``min_criticality`` are kept. For
  those, a stacked histogram per target class is drawn and a summary table is
  displayed, sorted by ascending feature value score.

  Args:
    df: DataFrame holding both columns.
    cat_column: Name of the categorical column to analyse.
    target_column: Name of the target column; only rows whose target equals
      0 or 1 are counted.
    min_criticality: Minimum criticality (percent) for a value to be reported.
    min_certainty: Accepted for backward compatibility. It previously only fed
      a plot that is disabled, so it does not influence the displayed output.
    min_feature_score: Minimum ``Feature Value Score`` for a value to be
      flagged as a ``Possible Feature``.

  Returns:
    None. The plot and the table are emitted through IPython ``display``.
  """
  counts = df.groupby(by=[cat_column, target_column]).size().reset_index(name="count")

  positives = counts[counts[target_column] == 1]
  negatives = counts[counts[target_column] == 0]
  diff_df = positives.merge(negatives, on=[cat_column], how="outer", suffixes=['_1', '_0'])
  diff_df = diff_df.drop(columns=[target_column + "_0", target_column + "_1"])
  # A value seen with only one target class has no count for the other class
  # after the outer merge. Fill just the count columns so the key column is
  # never touched.
  diff_df[['count_1', 'count_0']] = diff_df[['count_1', 'count_0']].fillna(0)

  value_total = diff_df['count_0'] + diff_df['count_1']
  diff_df['Criticality'] = (value_total / len(df)) * 100
  diff_df['Certainty'] = ((diff_df['count_1'] - diff_df['count_0']).abs() / value_total) * 100

  is_critical = diff_df['Criticality'] >= min_criticality
  critical_values = diff_df.loc[is_critical, cat_column].tolist()

  critical_df = df[df[cat_column].isin(critical_values)]
  if not critical_df.empty:
    # displot is figure-level and creates its own figure, so no plt.figure()
    # call is made beforehand (it would only leave an empty extra figure).
    g = sns.displot(data=critical_df, x=cat_column, hue=target_column, multiple="stack")
    g.set_xticklabels(rotation=90)
    display(g)

  # Work on an explicit copy: the new columns must not be written into a
  # slice of diff_df (chained assignment / SettingWithCopyWarning).
  result = diff_df[is_critical].copy()
  result['Focus'] = (result['count_1'] > result['count_0']).map({True: 'Risky', False: 'Not Risky'})
  result['Feature Value Score'] = (result['Certainty'] * result['Criticality']) / 100
  result = result.sort_values(by=['Feature Value Score'])
  result['Possible Feature'] = result['Feature Value Score'] >= min_feature_score

  qualifying_total = result.loc[result['Possible Feature'], 'Feature Value Score'].sum()
  # Average the qualifying scores over all reported values; guard the case
  # where no value passed the criticality filter.
  result['Feature Score'] = qualifying_total / len(result) if len(result) else 0.0
  result = result.rename(columns={'Certainty': 'Certainty(%)', 'Criticality': 'Criticality(%)'})

  # Drop whatever has been written to the output area so far before showing
  # the summary table.
  clear_output()

  display(result)

In [None]:
# Example invocation: 'column name' and 'target column name' are placeholders
# for the categorical column and the 0/1 target column. input_df is not
# defined in this snippet - presumably a DataFrame loaded elsewhere; TODO confirm.
cat_value_significance(input_df, 'column name', 'target column name')