### Import libraries

In [1]:
import pandas as pd
from constants import target_col as target_name, index_col as index_name, dt_algo, rf_algo, lr_algo, svm_algo
from utilities import do_modeling, get_evaluation_report

### Configure defaults

In [2]:
# Lower-case the configured index column name; it is passed to set_index() when the CSVs are loaded.
index_col = index_name.lower()
# Wrap the target name in a list; this list is what drop(columns=...) receives when the features are split off.
target_col = [target_name]

### Read data

In [3]:
# Load the four splits in a fixed order (std train/test, then min_max train/test)
# and index every frame by `index_col`.
train_std, test_std, train_mm, test_mm = (
    pd.read_csv(csv_path).set_index(index_col)
    for csv_path in (
        './outputs/std/train.csv',
        './outputs/std/test.csv',
        './outputs/min_max/train.csv',
        './outputs/min_max/test.csv',
    )
)

### Data check

In [4]:
# Preview the first rows of the standardized training frame.
train_std.head()

Unnamed: 0_level_0,attrition_flag,gender_F,gender_M,customer_age,dependent_count,months_on_book,total_relationship_count,months_inactive_12_mon,contacts_count_12_mon,credit_limit,total_revolving_bal,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio
clientnum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
713796033,1,0,1,0.085971,-0.271041,0.761682,1.409797,0.661411,0.498335,-0.427523,-1.423834,-0.541924,-0.736571,-1.228105,-0.892927,-0.997729
771204108,0,1,0,-1.290336,-0.271041,-1.623249,-0.527633,-1.331855,-2.214038,-0.734047,0.023954,0.489055,-0.149068,0.092368,0.025604,1.201739
721399833,0,1,0,-0.414504,-0.271041,-0.242499,-0.527633,-1.331855,0.498335,-0.725459,-0.186031,0.543798,0.098177,0.90169,0.49751,0.811452
778601733,1,1,0,1.086921,0.499385,1.389296,0.763987,1.658044,0.498335,-0.653342,0.650224,-0.208908,-0.598231,-1.313297,0.324759,1.296575
714047733,0,1,0,0.836684,0.499385,1.13825,-1.173443,-0.335222,-1.309914,-0.405063,1.178255,-0.190661,0.89996,0.859094,-0.454728,0.567066


In [5]:
# Preview the first rows of the min-max scaled training frame.
train_mm.head()

Unnamed: 0_level_0,attrition_flag,gender_F,gender_M,customer_age,dependent_count,months_on_book,total_relationship_count,months_inactive_12_mon,contacts_count_12_mon,credit_limit,total_revolving_bal,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio
clientnum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
713796033,1,0,1,0.446809,0.4,0.674419,1.0,0.5,0.5,0.099756,0.0,0.188696,0.076306,0.20155,0.140017,0.0
771204108,0,1,0,0.212766,0.4,0.232558,0.4,0.166667,0.0,0.015591,0.468415,0.255225,0.187479,0.44186,0.201064,0.603604
721399833,0,1,0,0.361702,0.4,0.488372,0.4,0.166667,0.5,0.017949,0.400477,0.258758,0.234265,0.589147,0.232428,0.496496
778601733,1,1,0,0.617021,0.6,0.790698,0.8,0.666667,0.5,0.037751,0.671037,0.210185,0.102484,0.186047,0.220947,0.62963
714047733,0,1,0,0.574468,0.6,0.744186,0.2,0.333333,0.166667,0.105923,0.841875,0.211363,0.385986,0.581395,0.16914,0.429429


In [6]:
# Report (rows, columns) for the train/test pair of each scaling variant.
for shape_label, train_frame, test_frame in (
    ('Standardized train set shape:', train_std, test_std),
    ('MinMax scaled train set shape:', train_mm, test_mm),
):
    print(shape_label, train_frame.shape, 'test set shape:', test_frame.shape)

Standardized train set shape: (8607, 16) test set shape: (1520, 16)
MinMax scaled train set shape: (8607, 16) test set shape: (1520, 16)


In [7]:
print('NA count by column and dataset')

# Build the report from a dict of per-dataset NA counts so that the rows of NA_df
# are the union of all column names. Assigning the Series one at a time to an
# empty frame aligns every later Series to the first frame's columns, which would
# silently drop a column that only exists in one of the other datasets and hide
# its missing values from this check.
NA_df = pd.DataFrame({
    'train_standardized': train_std.isna().sum(),
    'test_standardized': test_std.isna().sum(),
    'train_minmax': train_mm.isna().sum(),
    'test_minmax': test_mm.isna().sum(),
})
# Transposed for display: one row per dataset, one column per feature.
# A NaN in this table means the column is absent from that dataset.
NA_df.T

NA count by column and dataset


Unnamed: 0,attrition_flag,gender_F,gender_M,customer_age,dependent_count,months_on_book,total_relationship_count,months_inactive_12_mon,contacts_count_12_mon,credit_limit,total_revolving_bal,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio
train_standardized,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
test_standardized,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
train_minmax,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
test_minmax,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Separate target

In [8]:
# Features are every column except the target; the target is selected by name.
X_train_std = train_std.drop(columns=target_col)
y_train_std = train_std[target_name]
X_test_std = test_std.drop(columns=target_col)
y_test_std = test_std[target_name]

# Same split for the min-max scaled frames.
X_train_mm = train_mm.drop(columns=target_col)
y_train_mm = train_mm[target_name]
X_test_mm = test_mm.drop(columns=target_col)
y_test_mm = test_mm[target_name]

In [9]:
# Run the project's modeling helper once per scaling variant, on training data only.
# The displayed result is a dict of estimators keyed by algorithm name;
# presumably they are already fitted — confirm in utilities.do_modeling.
std_models, mm_models = (
    do_modeling(X_train_std, y_train_std),
    do_modeling(X_train_mm, y_train_mm),
)

In [10]:
# Display both model dictionaries together (rendered as a tuple: standardized first, min-max second).
std_models, mm_models

({'Decision Tree': DecisionTreeClassifier(class_weight='balanced', random_state=42),
  'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42),
  'Logistic Regression': LogisticRegression(class_weight='balanced', random_state=42),
  'Support Vector Machine': SVC(class_weight='balanced', random_state=42)},
 {'Decision Tree': DecisionTreeClassifier(class_weight='balanced', random_state=42),
  'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42),
  'Logistic Regression': LogisticRegression(class_weight='balanced', random_state=42),
  'Support Vector Machine': SVC(class_weight='balanced', random_state=42)})

In [11]:
# Score each model set against the test split that uses the same scaling as its training data.
std_eval_report, mm_eval_report = (
    get_evaluation_report(std_models, X_test_std, y_test_std),
    get_evaluation_report(mm_models, X_test_mm, y_test_mm),
)

In [12]:
# Full per-class evaluation table for the models trained on the standardized features.
std_eval_report

Unnamed: 0,algorithm,class,precision,recall,f1,support
0,Decision Tree,0,0.971383,0.95768,0.964483,1276
1,Decision Tree,1,0.793893,0.852459,0.822134,244
2,Random Forest,0,0.968364,0.983542,0.975894,1276
3,Random Forest,1,0.90625,0.831967,0.867521,244
4,Logistic Regression,0,0.963866,0.836207,0.89551,1276
5,Logistic Regression,1,0.493947,0.836066,0.621005,244
6,Support Vector Machine,0,0.983939,0.912226,0.946726,1276
7,Support Vector Machine,1,0.667656,0.922131,0.774527,244


In [13]:
# Keep only the class-1 rows of the standardized report, then shade the table with a YlGnBu gradient.
std_class1_report = std_eval_report.query('`class` == 1')
std_class1_report.style.background_gradient(cmap="YlGnBu")

Unnamed: 0,algorithm,class,precision,recall,f1,support
1,Decision Tree,1,0.793893,0.852459,0.822134,244
3,Random Forest,1,0.90625,0.831967,0.867521,244
5,Logistic Regression,1,0.493947,0.836066,0.621005,244
7,Support Vector Machine,1,0.667656,0.922131,0.774527,244


In [14]:
# Keep only the class-1 rows of the min-max report, then shade the table with a YlGnBu gradient.
mm_class1_report = mm_eval_report.query('`class` == 1')
mm_class1_report.style.background_gradient(cmap="YlGnBu")

Unnamed: 0,algorithm,class,precision,recall,f1,support
1,Decision Tree,1,0.796935,0.852459,0.823762,244
3,Random Forest,1,0.90625,0.831967,0.867521,244
5,Logistic Regression,1,0.492788,0.840164,0.621212,244
7,Support Vector Machine,1,0.627119,0.909836,0.742475,244


In [15]:
# NOTE(review): scratch cell — every line below is commented out, so running it does nothing.
# It refers to helpers (print_evaluation_summary, plot_feature_importance,
# model_and_evaluate) and sklearn classes that the import cell of this notebook
# does not import; consider deleting the cell once these experiments are no longer needed.

# print('\nStandardized train set')
# print(train_std.isna().sum())
# print('\nStandardized test set')
# print(test_std.isna().sum())
# print('\nMinMax scaled train set')
# print(train_mm.isna().sum())
# print('\nMinMax scaled test set')
# print(test_mm.isna().sum())

# dt_std = DecisionTreeClassifier().fit(X_train_std, y_train_std)
# dt_std_pred = dt_std.predict(X_test_std)

# print_evaluation_summary(y_test_std, dt_std_pred, dt_algo)

# plot_feature_importance(dt_std.feature_importances_, dt_algo, X_train_std.columns)


# dt_mm = DecisionTreeClassifier().fit(X_train_mm, y_train_mm)
# dt_mm_pred = dt_mm.predict(X_test_mm)

# print_evaluation_summary(y_test_mm, dt_mm_pred, dt_algo)

# plot_feature_importance(dt_mm.feature_importances_, dt_algo, X_train_mm.columns)

# type(X_test_mm)
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from utilities import print_evaluation_summary, plot_feature_importance

# print('{} with standardized input\n'.format(dt_algo))
# dt_std = model_and_evaluate(X_train_std, y_train_std, X_test_std, y_test_std, dt_algo)

# print('{} with minmax scaled input'.format(dt_algo))
# dt_mm = model_and_evaluate(X_train_mm, y_train_mm, X_test_mm, y_test_mm, dt_algo)

# print('{} with standardized input\n'.format(rf_algo))
# dt_std = model_and_evaluate(X_train_std, y_train_std, X_test_std, y_test_std, rf_algo)

# import matplotlib.pyplot as plt
# plt.style.use('fivethirtyeight')

# _ = mm_eval_report.pivot(index='algorithm', columns='class')
# _.columns = pd.MultiIndex.from_product([['precision', 'recall', 'f1', 'support'],[0, 1]], names=['metric', 'class'])
# _