**ANALYSIS QUESTIONS**

1. Discover relationship between Total Complexity Points (as of May 2021) and subsequent PRODUCTION, OUTFLOW, and NET. 9 months is probably preferred, but explore whether there are differences between 3 and 6 months.

2. Same as above, but for each individual Complexity input (also including the data on the "Other Data" tab as separate variables).

3. Create a simple model that tries to predict which clients will contribute over $1,000,000 in production in 9 months based on Complexity data only. Is there a "formula" for complexity that indicates whether a client WILL or WON'T ever contribute significant production?

4. Explore link between Complexity Points and # of Meetings, Zooms, calls in subsequent 9 months.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTEN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, chi2, f_classif, RFE, VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.calibration import CalibrationDisplay
from IPython.display import display

In [None]:
# Configure how pandas renders frames in this notebook:
# up to 500 rows / 50 columns, untruncated cell text, floats with two decimals
display_options = {
    'display.max_rows': 500,
    'display.max_columns': 50,
    'display.max_colwidth': None,
    'display.float_format': '{:.2f}'.format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
# Read in excel spreadsheets
# The workbook is opened once and every sheet is parsed from the same handle,
# instead of re-reading the file from disk for each sheet.
DATA_PATH = '../data/RohitCapstoneDataApr2022.xlsx'

# Source column name -> name used throughout the notebook (also the set of columns kept)
prod_outflow_columns = {
    'rel_id': 'RelID', 
    'CurrentRelValue': 'CurrentValue',
    'Beginning Rel Value': 'BeginningValue',
    'ThreeMonthProd': '3MonthProd',
    'SixMonthProd': '6MonthProd',
    'NineMonthProd': '9MonthProd',
    'ThreeMonthOutflow': '3MonthOutflow',
    'SixMonthOutflow': '6MonthOutflow',
    'NineMonthOutflow': '9MonthOutflow'
}

# 'AsOf' periods (YYYYMM strings) of interaction data to keep.
# NOTE(review): this spans 11 months (2021-06 .. 2022-04) while the analysis questions
# talk about the "subsequent 9 months" -- confirm the intended window.
date_filter = ['202106', '202107', '202108', '202109', '202110', '202111', '202112', '202201', '202202', '202203', '202204']

with pd.ExcelFile(DATA_PATH) as workbook:
    complexity = pd.read_excel(workbook, sheet_name = 'Complexity Data') \
        .rename(columns = {'rel_id': 'RelID'})

    # Select with an explicit list of the renamed columns rather than a dict_values view
    prod_outflow = pd.read_excel(workbook, sheet_name = 'Prod-Outflow Data', skiprows = 1) \
        .rename(columns = prod_outflow_columns)[list(prod_outflow_columns.values())]

    other = pd.read_excel(workbook, sheet_name = 'Other Data as of 202105') \
        .drop(columns = 'AsOf')

    # 'AsOf' is read as text so it can be matched against date_filter and sliced below
    meetings = pd.read_excel(workbook, sheet_name = 'Subsequent Mtgs', converters = {'AsOf': str})

# .copy() gives an independent frame, so adding columns below does not write
# into a slice of the original (avoids SettingWithCopyWarning)
meetings = meetings.loc[meetings['AsOf'].isin(date_filter)].copy()
meetings['Year'] = meetings['AsOf'].str[0:4]
meetings['Month'] = meetings['AsOf'].str[4:6]
meetings['Interactions'] = meetings['Call'] + meetings['Meeting'] + meetings['Zoom']

In [None]:
# Show complexity rules
# This notebook refers to the rule identifier both as 'RuleID' and as 'ComplexityRuleID';
# use whichever column the sheet actually provides so this cell cannot fail with a KeyError.
rule_id_col = 'RuleID' if 'RuleID' in complexity.columns else 'ComplexityRuleID'

# One row per distinct (rule id, type, points, name) combination, ordered by rule id
complexity_table = (
    complexity[[rule_id_col, 'RuleType', '# Pts', 'Name']]
    .sort_values(rule_id_col)
    .drop_duplicates()
    .reset_index(drop = True)
)

# '# RelIDs' is the number of rows in the complexity data that carry each rule
rule_counts = complexity.value_counts(rule_id_col).to_frame('# RelIDs')
complexity_table = complexity_table.merge(rule_counts, on = rule_id_col)
complexity_table
# Optional left-aligned rendering:
# complexity_table.style.set_properties(**{'text-align': 'left'}).set_table_styles([dict(selector = 'th', props=[('text-align', 'left')])])

In [None]:
# Keep only accounts whose BeginningValue is at least $500,000.
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not trigger SettingWithCopyWarning.
prod_outflow = prod_outflow[prod_outflow['BeginningValue'] >= 500000].copy()

HORIZONS_IN_MONTHS = (3, 6, 9)

# Net = production + outflow for each horizon
# (presumably outflows are stored as negative amounts -- verify against the source sheet)
for months in HORIZONS_IN_MONTHS:
    prod_outflow[f'{months}MonthNet'] = prod_outflow[f'{months}MonthProd'] + prod_outflow[f'{months}MonthOutflow']

# Net as a percentage of the beginning value (never zero here because of the filter above)
for months in HORIZONS_IN_MONTHS:
    prod_outflow[f'{months}MonthNetPct'] = prod_outflow[f'{months}MonthNet'] / prod_outflow['BeginningValue'] * 100

In [None]:
# Total complexity points for each RelID
complexity_pts = (
    complexity
    .groupby('RelID', as_index = False)['# Pts']
    .sum()
)

# Total interactions for each RelID, counting only 'Client Review' rows
is_client_review = meetings['CategoryName'] == 'Client Review'
interactions = (
    meetings.loc[is_client_review]
    .groupby(['RelID'], as_index = False)['Interactions']
    .sum()
)

# Combine points with production/outflow, then the "Other Data" sheet, then interactions.
# The first two merges are inner joins; interactions are left-joined so RelIDs without
# client-review rows are kept. fillna(0) then zero-fills every missing value in the
# merged frame (not only the Interactions column).
pts_prod_outflow = complexity_pts.merge(prod_outflow, on = 'RelID').set_index('RelID')
pts_prod_outflow_other = pts_prod_outflow.merge(other, on = 'RelID').set_index('RelID')
pts_prod_outflow_other_interactions = (
    pts_prod_outflow_other
    .merge(interactions, how = 'left', on = 'RelID')
    .set_index('RelID')
    .fillna(0)
)

In [None]:
# Count how often each complexity rule occurs per RelID (one column per rule id, as a string)
rule_dummies = pd.get_dummies(complexity.set_index('RelID')['ComplexityRuleID'].astype(str))
complexity_rules = rule_dummies.groupby('RelID').sum().reset_index()

# Ordinal complexity rule variables: each listed rule contributes its 1-based
# position within the group (first rule * 1, second rule * 2, ...)
ordinal_groups = {
    '2-3': ['2', '3'],
    '4-5': ['4', '5'],
    '6-7': ['6', '7'],
    '8-9': ['8', '9'],
    '10-13': ['10', '11', '12', '13'],
    '14-16': ['14', '15', '16'],
    '41-43': ['41', '42', '43'],
    '49-51': ['49', '50', '51'],
    '52-53': ['52', '53'],
}
for ordinal_name, member_rules in ordinal_groups.items():
    weighted_terms = [complexity_rules[rule] * level for level, rule in enumerate(member_rules, start = 1)]
    ordinal_value = weighted_terms[0]
    for term in weighted_terms[1:]:
        ordinal_value = ordinal_value + term
    complexity_rules[ordinal_name] = ordinal_value

# '65' is 1 minus the combined count of rules 4, 5 and 60
complexity_rules['65'] = 1 - complexity_rules[['4', '5', '60']].sum(axis = 1)

# Combine rule variables with production/outflow, then the "Other Data" sheet, then interactions.
# Interactions are left-joined; fillna(0) zero-fills every missing value in the merged frame.
rules_prod_outflow = complexity_rules.merge(prod_outflow, on = 'RelID').set_index('RelID')
rules_prod_outflow_other = rules_prod_outflow.merge(other, on = 'RelID').set_index('RelID')
rules_prod_outflow_other_interactions = (
    rules_prod_outflow_other
    .merge(interactions, how = 'left', on = 'RelID')
    .set_index('RelID')
    .fillna(0)
)

In [None]:
# Feature sets used by the models below (column names are rule ids stored as strings)

# One column per individual complexity rule
all_rules = (
    '1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 '
    '16 17 18 19 20 21 22 23 24 26 27 28 29 30 '
    '31 32 33 37 38 39 40 41 42 43 45 49 50 51 '
    '52 53 54 55 56 57 58 60 62 63 64'
).split()

# Same idea, but with related rules replaced by their ordinal 'a-b' columns
# and the derived '65' column included
collapsed_rules = (
    '1 2-3 4-5 6-7 8-9 10-13 14-16 17 18 19 '
    '20 21 22 23 24 26 27 28 29 30 31 '
    '32 33 37 38 39 40 41-43 45 49-51 52-53 '
    '54 55 56 57 58 62 63 65'
).split()

In [None]:
# Logistic LASSO pipeline: standardize -> drop zero-variance features -> L1-penalized logistic regression.
# Despite the name, this pipeline contains no resampling step; the training cells apply
# SMOTEN / RandomUnderSampler to the training data before fitting it.
lr_l1_smote_pipeline = Pipeline(
    steps = [
        ('scaler', StandardScaler()),
        ('vt', VarianceThreshold()),
        # random_state pins the data shuffling done by the 'saga' solver so repeated runs give
        # the same coefficients (321 matches the seed used for the splits and samplers)
        ('logistic', LogisticRegression(penalty = 'l1', class_weight = 'balanced', solver = 'saga', max_iter = 1000, random_state = 321))
    ]
)

In [None]:
# Train/test split (all complexity rules).
# Target: 1 when the client produced at least $1,000,000 within 9 months, else 0.
X = rules_prod_outflow_other_interactions
y = (X['9MonthProd'] >= 1000000).astype(int)
(
    X_train_all_rules,
    X_test_all_rules,
    y_train_all_rules,
    y_test_all_rules,
) = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 321)

# Rebalance the training data only: SMOTEN oversampling on the rule columns first,
# followed by random undersampling
X_train_all_rules, y_train_all_rules = SMOTEN(
    sampling_strategy = 0.1, k_neighbors = 5, random_state = 321
).fit_resample(X_train_all_rules[all_rules], y_train_all_rules)
X_train_all_rules, y_train_all_rules = RandomUnderSampler(
    sampling_strategy = 0.55, random_state = 321
).fit_resample(X_train_all_rules, y_train_all_rules)

# Grid search over the regularization strength C of the logistic LASSO pipeline.
# NOTE(review): the data handed to GridSearchCV has already been resampled, so the
# validation folds contain resampled rows and the cross-validated F1 used to pick C is
# probably optimistic -- consider moving the samplers into an imblearn pipeline.
grid_search_smote_all_rules = GridSearchCV(
    estimator = lr_l1_smote_pipeline,
    param_grid = {'logistic__C': [1, 0.5, 0.1, 0.05, 0.01]},
    scoring = 'f1',
    cv = 3,
)
grid_search_smote_all_rules.fit(X_train_all_rules, y_train_all_rules)
grid_search_smote_all_rules.best_estimator_

In [None]:
# Output logistic LASSO SMOTE model results (all complexity rules)
fitted_model = grid_search_smote_all_rules.best_estimator_
X_eval = X_test_all_rules[all_rules]

print(classification_report(y_test_all_rules, fitted_model.predict(X_eval), zero_division = 0))
# from_estimator draws the matrix itself; plt.show() is not meant to receive the display object
ConfusionMatrixDisplay.from_estimator(fitted_model, X_eval, y_test_all_rules)
plt.show()

# One row per test client. Values are passed positionally (X_test/y_test come from the same
# split and share row order) and the RelID index is attached explicitly.
ventiles_df = pd.DataFrame(
    {
        'Probability': fitted_model.predict_proba(X_eval)[:, 1],
        'Avg Prod': X_test_all_rules['9MonthProd'].to_numpy(),
        'Median Prod': X_test_all_rules['9MonthProd'].to_numpy(),
        '$1M+ Prod': y_test_all_rules.to_numpy(),
        '$1M+ Prod %': y_test_all_rules.to_numpy(),  # placeholder, recomputed after aggregation
    },
    index = X_test_all_rules.index,
)

agg_dict = {
    'Avg Prod': 'mean',
    'Median Prod': 'median',
    '$1M+ Prod': 'sum',
    '$1M+ Prod %': 'sum',
    'Probability': 'mean',
}

# Split the predicted probabilities into 20 quantile bins: 'Interval' is the bin number (1-20),
# 'Ventile' is the probability range of the same bin
ventiles_df['Interval'] = pd.qcut(ventiles_df['Probability'], q = 20, labels = list(range(1, 21)), precision = 0)
ventiles_df['Ventile'] = pd.qcut(ventiles_df['Probability'], q = 20, precision = 0)
# observed = True keeps only the (Interval, Ventile) pairs that occur, instead of building
# every combination of the two categoricals and dropping the empty ones afterwards
ventiles_df = ventiles_df.groupby(['Interval', 'Ventile'], observed = True).agg(agg_dict)
ventiles_df['Odds'] = ventiles_df['Probability'] / (1 - ventiles_df['Probability'])
# Share of all $1M+ test clients that fall in this ventile or a higher one
ventiles_df['$1M+ Prod %'] = ventiles_df['$1M+ Prod'][::-1].cumsum()[::-1] / ventiles_df['$1M+ Prod'].sum() * 100
display(ventiles_df)

# Rule id -> rule name lookup, read from the same column the feature names were built from.
# (complexity_table is keyed by 'RuleID', so selecting 'ComplexityRuleID' from it raised a KeyError.)
rule_names = (
    complexity[['ComplexityRuleID', 'Name']]
    .drop_duplicates(subset = 'ComplexityRuleID')
    .astype(str)
    .rename(columns = {'ComplexityRuleID': 'variable'})
)

# Features that survived the variance threshold, in the order the model was fit on
selected_features = np.array(all_rules)[fitted_model['vt'].get_support()]
coefficients = pd.DataFrame({
    'variable': ['intercept'] + list(selected_features),
    'coefficient': [fitted_model['logistic'].intercept_[0]] + list(fitted_model['logistic'].coef_[0])
}).merge(rule_names, how = 'left', on = 'variable')
# Top 25 terms by absolute coefficient
display(coefficients.sort_values('coefficient', ascending = False, key = abs).fillna('').head(25))

In [None]:
# Perform train/test split (all complexity rules, indeterminate set filter)
# Target: 1 when the client produced at least $1,000,000 within 9 months, else 0.
X = rules_prod_outflow_other_interactions
y = (rules_prod_outflow_other_interactions['9MonthProd'] >= 1000000).astype(int)
X_train_all_rules_indeterminate_set, X_test_all_rules_indeterminate_set, y_train_all_rules_indeterminate_set, y_test_all_rules_indeterminate_set = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 321)

# Filter out indeterminate set: training clients with 250,000 <= 9MonthProd < 1,000,000.
# The upper bound is exclusive so that clients at exactly $1,000,000 -- labelled positive by
# `y` above -- stay in the training data (Series.between would have dropped them, since it
# includes both endpoints by default).
nine_month_prod = X_train_all_rules_indeterminate_set['9MonthProd']
is_indeterminate = (nine_month_prod >= 250000) & (nine_month_prod < 1000000)
indeterminate_filter = X_train_all_rules_indeterminate_set.loc[~is_indeterminate].index
X_train_all_rules_indeterminate_set, y_train_all_rules_indeterminate_set = X_train_all_rules_indeterminate_set.loc[indeterminate_filter], y_train_all_rules_indeterminate_set.loc[indeterminate_filter]

# Oversample using SMOTE, then randomly undersample (training data only)
X_train_smote_all_rules_indeterminate_set, y_train_all_rules_indeterminate_set = SMOTEN(sampling_strategy = 0.1, k_neighbors = 5, random_state = 321).fit_resample(X_train_all_rules_indeterminate_set[all_rules], y_train_all_rules_indeterminate_set)
X_train_all_rules_indeterminate_set, y_train_all_rules_indeterminate_set = RandomUnderSampler(sampling_strategy = 0.55, random_state = 321).fit_resample(X_train_smote_all_rules_indeterminate_set, y_train_all_rules_indeterminate_set)

# Run grid search using logistic LASSO SMOTE pipeline
# NOTE(review): the data handed to GridSearchCV has already been resampled, so the
# cross-validated F1 used to pick C is probably optimistic.
grid_search_smote_all_rules_indeterminate_set = GridSearchCV(estimator = lr_l1_smote_pipeline, param_grid = {'logistic__C': [1, 0.5, 0.1, 0.05, 0.01]}, scoring = 'f1', cv = 3)
grid_search_smote_all_rules_indeterminate_set.fit(X_train_all_rules_indeterminate_set, y_train_all_rules_indeterminate_set)
grid_search_smote_all_rules_indeterminate_set.best_estimator_

In [None]:
# Output logistic LASSO SMOTE model results (all complexity rules, indeterminate set filter)
# Report the model trained with the indeterminate-set filter. (This cell previously read
# grid_search_smote_all_rules throughout, so it re-displayed the unfiltered model's results.)
fitted_model = grid_search_smote_all_rules_indeterminate_set.best_estimator_
X_eval = X_test_all_rules_indeterminate_set[all_rules]

print(classification_report(y_test_all_rules_indeterminate_set, fitted_model.predict(X_eval), zero_division = 0))
# from_estimator draws the matrix itself; plt.show() is not meant to receive the display object
ConfusionMatrixDisplay.from_estimator(fitted_model, X_eval, y_test_all_rules_indeterminate_set)
plt.show()

# One row per test client. Values are passed positionally (X_test/y_test come from the same
# split and share row order) and the RelID index is attached explicitly.
ventiles_df = pd.DataFrame(
    {
        'Probability': fitted_model.predict_proba(X_eval)[:, 1],
        'Avg Prod': X_test_all_rules_indeterminate_set['9MonthProd'].to_numpy(),
        'Median Prod': X_test_all_rules_indeterminate_set['9MonthProd'].to_numpy(),
        '$1M+ Prod': y_test_all_rules_indeterminate_set.to_numpy(),
        '$1M+ Prod %': y_test_all_rules_indeterminate_set.to_numpy(),  # placeholder, recomputed after aggregation
    },
    index = X_test_all_rules_indeterminate_set.index,
)

agg_dict = {
    'Avg Prod': 'mean',
    'Median Prod': 'median',
    '$1M+ Prod': 'sum',
    '$1M+ Prod %': 'sum',
    'Probability': 'mean',
}

# Split the predicted probabilities into 20 quantile bins: 'Interval' is the bin number (1-20),
# 'Ventile' is the probability range of the same bin
ventiles_df['Interval'] = pd.qcut(ventiles_df['Probability'], q = 20, labels = list(range(1, 21)), precision = 0)
ventiles_df['Ventile'] = pd.qcut(ventiles_df['Probability'], q = 20, precision = 0)
# observed = True keeps only the (Interval, Ventile) pairs that occur, instead of building
# every combination of the two categoricals and dropping the empty ones afterwards
ventiles_df = ventiles_df.groupby(['Interval', 'Ventile'], observed = True).agg(agg_dict)
ventiles_df['Odds'] = ventiles_df['Probability'] / (1 - ventiles_df['Probability'])
# Share of all $1M+ test clients that fall in this ventile or a higher one
ventiles_df['$1M+ Prod %'] = ventiles_df['$1M+ Prod'][::-1].cumsum()[::-1] / ventiles_df['$1M+ Prod'].sum() * 100
display(ventiles_df)

# Rule id -> rule name lookup, read from the same column the feature names were built from.
# (complexity_table is keyed by 'RuleID', so selecting 'ComplexityRuleID' from it raised a KeyError.)
rule_names = (
    complexity[['ComplexityRuleID', 'Name']]
    .drop_duplicates(subset = 'ComplexityRuleID')
    .astype(str)
    .rename(columns = {'ComplexityRuleID': 'variable'})
)

# Features that survived the variance threshold, in the order the model was fit on
selected_features = np.array(all_rules)[fitted_model['vt'].get_support()]
coefficients = pd.DataFrame({
    'variable': ['intercept'] + list(selected_features),
    'coefficient': [fitted_model['logistic'].intercept_[0]] + list(fitted_model['logistic'].coef_[0])
}).merge(rule_names, how = 'left', on = 'variable')
# Top 25 terms by absolute coefficient
display(coefficients.sort_values('coefficient', ascending = False, key = abs).fillna('').head(25))

In [None]:
# Train/test split (collapsed complexity rules).
# Target: 1 when the client produced at least $1,000,000 within 9 months, else 0.
X = rules_prod_outflow_other_interactions
y = (X['9MonthProd'] >= 1000000).astype(int)
(
    X_train_collapsed_rules,
    X_test_collapsed_rules,
    y_train_collapsed_rules,
    y_test_collapsed_rules,
) = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 321)

# Rebalance the training data only: SMOTEN oversampling on the collapsed rule columns first,
# followed by random undersampling
X_train_smote_collapsed_rules, y_train_smote_collapsed_rules = SMOTEN(
    sampling_strategy = 0.1, k_neighbors = 5, random_state = 321
).fit_resample(X_train_collapsed_rules[collapsed_rules], y_train_collapsed_rules)
X_train_final_collapsed_rules, y_train_final_collapsed_rules = RandomUnderSampler(
    sampling_strategy = 0.55, random_state = 321
).fit_resample(X_train_smote_collapsed_rules, y_train_smote_collapsed_rules)

# Grid search over the regularization strength C of the logistic LASSO pipeline.
# NOTE(review): the data handed to GridSearchCV has already been resampled, so the
# validation folds contain resampled rows and the cross-validated F1 used to pick C is
# probably optimistic -- consider moving the samplers into an imblearn pipeline.
grid_search_smote_collapsed_rules = GridSearchCV(
    estimator = lr_l1_smote_pipeline,
    param_grid = {'logistic__C': [1, 0.5, 0.1, 0.05, 0.01]},
    scoring = 'f1',
    cv = 3,
)
grid_search_smote_collapsed_rules.fit(X_train_final_collapsed_rules, y_train_final_collapsed_rules)
grid_search_smote_collapsed_rules.best_estimator_

In [None]:
# Output logistic LASSO SMOTE model results (collapsed complexity rules)
fitted_model = grid_search_smote_collapsed_rules.best_estimator_
X_eval = X_test_collapsed_rules[collapsed_rules]

print(classification_report(y_test_collapsed_rules, fitted_model.predict(X_eval), zero_division = 0))
# from_estimator draws the matrix itself; plt.show() is not meant to receive the display object
ConfusionMatrixDisplay.from_estimator(fitted_model, X_eval, y_test_collapsed_rules)
plt.show()

# One row per test client. Values are passed positionally (X_test/y_test come from the same
# split and share row order) and the RelID index is attached explicitly.
ventiles_df = pd.DataFrame(
    {
        'Probability': fitted_model.predict_proba(X_eval)[:, 1],
        'Avg Prod': X_test_collapsed_rules['9MonthProd'].to_numpy(),
        'Median Prod': X_test_collapsed_rules['9MonthProd'].to_numpy(),
        '$1M+ Prod': y_test_collapsed_rules.to_numpy(),
        '$1M+ Prod %': y_test_collapsed_rules.to_numpy(),  # placeholder, recomputed after aggregation
    },
    index = X_test_collapsed_rules.index,
)

agg_dict = {
    'Avg Prod': 'mean',
    'Median Prod': 'median',
    '$1M+ Prod': 'sum',
    '$1M+ Prod %': 'sum',
    'Probability': 'mean',
}

# Split the predicted probabilities into 20 quantile bins: 'Interval' is the bin number (1-20),
# 'Ventile' is the probability range of the same bin
ventiles_df['Interval'] = pd.qcut(ventiles_df['Probability'], q = 20, labels = list(range(1, 21)), precision = 0)
ventiles_df['Ventile'] = pd.qcut(ventiles_df['Probability'], q = 20, precision = 0)
# observed = True keeps only the (Interval, Ventile) pairs that occur, instead of building
# every combination of the two categoricals and dropping the empty ones afterwards
ventiles_df = ventiles_df.groupby(['Interval', 'Ventile'], observed = True).agg(agg_dict)
ventiles_df['Odds'] = ventiles_df['Probability'] / (1 - ventiles_df['Probability'])
# Share of all $1M+ test clients that fall in this ventile or a higher one
ventiles_df['$1M+ Prod %'] = ventiles_df['$1M+ Prod'][::-1].cumsum()[::-1] / ventiles_df['$1M+ Prod'].sum() * 100
display(ventiles_df)

# Rule id -> rule name lookup, read from the same column the feature names were built from.
# (complexity_table is keyed by 'RuleID', so selecting 'ComplexityRuleID' from it raised a KeyError.)
# Collapsed columns such as '2-3' have no single rule name and are shown with a blank Name.
rule_names = (
    complexity[['ComplexityRuleID', 'Name']]
    .drop_duplicates(subset = 'ComplexityRuleID')
    .astype(str)
    .rename(columns = {'ComplexityRuleID': 'variable'})
)

# Features that survived the variance threshold, in the order the model was fit on
selected_features = np.array(collapsed_rules)[fitted_model['vt'].get_support()]
coefficients = pd.DataFrame({
    'variable': ['intercept'] + list(selected_features),
    'coefficient': [fitted_model['logistic'].intercept_[0]] + list(fitted_model['logistic'].coef_[0])
}).merge(rule_names, how = 'left', on = 'variable')
# Top 25 terms by absolute coefficient
display(coefficients.sort_values('coefficient', ascending = False, key = abs).fillna('').head(25))

In [None]:
# Perform train/test split (collapsed complexity rules, indeterminate set filter)
# Target: 1 when the client produced at least $1,000,000 within 9 months, else 0.
X = rules_prod_outflow_other_interactions
y = (rules_prod_outflow_other_interactions['9MonthProd'] >= 1000000).astype(int)
X_train_collapsed_rules_indeterminate_set, X_test_collapsed_rules_indeterminate_set, y_train_collapsed_rules_indeterminate_set, y_test_collapsed_rules_indeterminate_set = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 321)

# Filter out indeterminate set: training clients with 250,000 <= 9MonthProd < 1,000,000.
# The upper bound is exclusive so that clients at exactly $1,000,000 -- labelled positive by
# `y` above -- stay in the training data (Series.between would have dropped them, since it
# includes both endpoints by default).
nine_month_prod = X_train_collapsed_rules_indeterminate_set['9MonthProd']
is_indeterminate = (nine_month_prod >= 250000) & (nine_month_prod < 1000000)
indeterminate_filter = X_train_collapsed_rules_indeterminate_set.loc[~is_indeterminate].index
X_train_collapsed_rules_indeterminate_set, y_train_collapsed_rules_indeterminate_set = X_train_collapsed_rules_indeterminate_set.loc[indeterminate_filter], y_train_collapsed_rules_indeterminate_set.loc[indeterminate_filter]

# Oversample using SMOTE, then randomly undersample (training data only)
X_train_smote_collapsed_rules_indeterminate_set, y_train_smote_collapsed_rules_indeterminate_set = SMOTEN(sampling_strategy = 0.1, k_neighbors = 5, random_state = 321).fit_resample(X_train_collapsed_rules_indeterminate_set[collapsed_rules], y_train_collapsed_rules_indeterminate_set)
X_train_final_collapsed_rules_indeterminate_set, y_train_final_collapsed_rules_indeterminate_set = RandomUnderSampler(sampling_strategy = 0.55, random_state = 321).fit_resample(X_train_smote_collapsed_rules_indeterminate_set, y_train_smote_collapsed_rules_indeterminate_set)

# Run grid search using logistic LASSO SMOTE pipeline
# NOTE(review): the data handed to GridSearchCV has already been resampled, so the
# cross-validated F1 used to pick C is probably optimistic.
grid_search_smote_collapsed_rules_indeterminate_set = GridSearchCV(estimator = lr_l1_smote_pipeline, param_grid = {'logistic__C': [1, 0.5, 0.1, 0.05, 0.01]}, scoring = 'f1', cv = 3)
grid_search_smote_collapsed_rules_indeterminate_set.fit(X_train_final_collapsed_rules_indeterminate_set, y_train_final_collapsed_rules_indeterminate_set)
grid_search_smote_collapsed_rules_indeterminate_set.best_estimator_

In [None]:
# Output logistic LASSO SMOTE model results (collapsed complexity rules, indeterminate set filter)
fitted_model = grid_search_smote_collapsed_rules_indeterminate_set.best_estimator_
X_eval = X_test_collapsed_rules_indeterminate_set[collapsed_rules]

print(classification_report(y_test_collapsed_rules_indeterminate_set, fitted_model.predict(X_eval), zero_division = 0))
# from_estimator draws the matrix itself; plt.show() is not meant to receive the display object
ConfusionMatrixDisplay.from_estimator(fitted_model, X_eval, y_test_collapsed_rules_indeterminate_set)
plt.show()

# One row per test client. Values are passed positionally (X_test/y_test come from the same
# split and share row order) and the RelID index is attached explicitly.
ventiles_df = pd.DataFrame(
    {
        'Probability': fitted_model.predict_proba(X_eval)[:, 1],
        'Avg Prod': X_test_collapsed_rules_indeterminate_set['9MonthProd'].to_numpy(),
        'Median Prod': X_test_collapsed_rules_indeterminate_set['9MonthProd'].to_numpy(),
        '$1M+ Prod': y_test_collapsed_rules_indeterminate_set.to_numpy(),
        '$1M+ Prod %': y_test_collapsed_rules_indeterminate_set.to_numpy(),  # placeholder, recomputed after aggregation
    },
    index = X_test_collapsed_rules_indeterminate_set.index,
)

agg_dict = {
    'Avg Prod': 'mean',
    'Median Prod': 'median',
    '$1M+ Prod': 'sum',
    '$1M+ Prod %': 'sum',
    'Probability': 'mean',
}

# Split the predicted probabilities into 20 quantile bins: 'Interval' is the bin number (1-20),
# 'Ventile' is the probability range of the same bin
ventiles_df['Interval'] = pd.qcut(ventiles_df['Probability'], q = 20, labels = list(range(1, 21)), precision = 0)
ventiles_df['Ventile'] = pd.qcut(ventiles_df['Probability'], q = 20, precision = 0)
# observed = True keeps only the (Interval, Ventile) pairs that occur, instead of building
# every combination of the two categoricals and dropping the empty ones afterwards
ventiles_df = ventiles_df.groupby(['Interval', 'Ventile'], observed = True).agg(agg_dict)
ventiles_df['Odds'] = ventiles_df['Probability'] / (1 - ventiles_df['Probability'])
# Share of all $1M+ test clients that fall in this ventile or a higher one
ventiles_df['$1M+ Prod %'] = ventiles_df['$1M+ Prod'][::-1].cumsum()[::-1] / ventiles_df['$1M+ Prod'].sum() * 100
display(ventiles_df)

# Rule id -> rule name lookup, read from the same column the feature names were built from.
# (complexity_table is keyed by 'RuleID', so selecting 'ComplexityRuleID' from it raised a KeyError.)
# Collapsed columns such as '2-3' have no single rule name and are shown with a blank Name.
rule_names = (
    complexity[['ComplexityRuleID', 'Name']]
    .drop_duplicates(subset = 'ComplexityRuleID')
    .astype(str)
    .rename(columns = {'ComplexityRuleID': 'variable'})
)

# Features that survived the variance threshold, in the order the model was fit on
selected_features = np.array(collapsed_rules)[fitted_model['vt'].get_support()]
coefficients = pd.DataFrame({
    'variable': ['intercept'] + list(selected_features),
    'coefficient': [fitted_model['logistic'].intercept_[0]] + list(fitted_model['logistic'].coef_[0])
}).merge(rule_names, how = 'left', on = 'variable')
# Top 25 terms by absolute coefficient
display(coefficients.sort_values('coefficient', ascending = False, key = abs).fillna('').head(25))

In [None]:
# Calibration curves of the four fitted models on their test sets, drawn on one shared axis
fig, ax = plt.subplots(1, 1, figsize = (8, 5), dpi = 300)

# (legend label, grid search, test features, test target) for each curve, in plotting order
calibration_curves = [
    ('Logistic LASSO SMOTE | All Rules', grid_search_smote_all_rules, X_test_all_rules[all_rules], y_test_all_rules),
    ('Logistic LASSO SMOTE | All Rules | Indeterminate Set', grid_search_smote_all_rules_indeterminate_set, X_test_all_rules_indeterminate_set[all_rules], y_test_all_rules_indeterminate_set),
    ('Logistic LASSO SMOTE | Collapsed Rules', grid_search_smote_collapsed_rules, X_test_collapsed_rules[collapsed_rules], y_test_collapsed_rules),
    ('Logistic LASSO SMOTE | Collapsed Rules | Indeterminate Set', grid_search_smote_collapsed_rules_indeterminate_set, X_test_collapsed_rules_indeterminate_set[collapsed_rules], y_test_collapsed_rules_indeterminate_set),
]
for curve_name, search, features_eval, target_eval in calibration_curves:
    CalibrationDisplay.from_estimator(
        estimator = search.best_estimator_,
        X = features_eval,
        y = target_eval,
        n_bins = 20,
        strategy = 'quantile',
        name = curve_name,
        markersize = 3,
        ax = ax,
    )

plt.legend(loc = 'upper left')
plt.show()