In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import *
from sklearn.ensemble import *
from sklearn.metrics import *

In [2]:
#np.random.seed(0) # Set a fixed seed to compare results across experiments and reproduce the same results.

In [3]:
# Read the customer node table. BIRTH_DT and CUST_ADD_DT are parsed as
# datetimes so that their `.dt.year` can be used when deriving features later.
date_columns = ['BIRTH_DT', 'CUST_ADD_DT']
data = pd.read_csv('UofT_nodes.csv', parse_dates=date_columns)

# Drop NaN
Note: I also tried filling the missing values with the column mean instead; however, the results did not change.

In [4]:
# Remove every row that contains at least one missing value (mutates `data`).
data.dropna(axis='index', inplace=True)

In [5]:
# Transaction aggregates (sums and counts, incoming and outgoing, for cash and
# wires) that receive the +1 shift applied before the ratio features are built.
col = [
    'CASH_CNT_IN',
    'CASH_SUM_OUT',
    'CASH_SUM_IN',
    'CASH_CNT_OUT',
    'WIRES_SUM_IN',
    'WIRES_CNT_IN',
    'WIRES_SUM_OUT',
    'WIRES_CNT_OUT',
]

In [6]:
# Some of these features contain zeros, which give inf or NaN once they are
# used as divisors, so every column listed in `col` is shifted up by one.
# NOTE(review): the *_SUM_* columns are shifted as well, although only the
# *_CNT_* columns are used as denominators in this notebook - confirm intended.
data[col] = data[col] + 1

## Some generic, intuition-based ideas for features describing a person's cash flow, such as:


1. The wired amount per month is exceedingly high.
2. Large cash/wire deposits in every transaction.
3. Any regular monthly pattern in money wired in or out, etc.
4. Trying to send more money than is being received.
5. Financial fraud may be more likely to be committed by people aged between 25 and 40, etc.

Precision improved by only 0.02 compared with the original score obtained without these features.

- Recall and precision have an inverse relationship.
- New features could improve both of these scores.



In [7]:
# Derived cash-flow and demographic features.
# The two values below used to be repeated inline as magic numbers; they are
# named here so they can be reviewed and updated in one place.
REFERENCE_YEAR = 2023   # year against which age and customer tenure are measured
MONTHS_PER_YEAR = 12    # assumes the *_SUM_* columns cover one year - TODO confirm

# Net flow (incoming minus outgoing), for amounts and for transaction counts.
data['cash_diff'] = data['CASH_SUM_IN'] - data['CASH_SUM_OUT']
data['wire_diff'] = data['WIRES_SUM_IN'] - data['WIRES_SUM_OUT']
data['cash_cnt_diff'] = data['CASH_CNT_IN'] - data['CASH_CNT_OUT']
data['wire_cnt_diff'] = data['WIRES_CNT_IN'] - data['WIRES_CNT_OUT']

# Average amount per transaction (sum divided by the matching count).
data['avg_cash_transaction_out'] = data['CASH_SUM_OUT'] / data['CASH_CNT_OUT']
data['avg_wire_transaction_out'] = data['WIRES_SUM_OUT'] / data['WIRES_CNT_OUT']
data['avg_cash_transaction_in'] = data['CASH_SUM_IN'] / data['CASH_CNT_IN']
data['avg_wire_transaction_in'] = data['WIRES_SUM_IN'] / data['WIRES_CNT_IN']

# Amounts spread evenly over the months of a year.
data['monthly_cash_in'] = data['CASH_SUM_IN'] / MONTHS_PER_YEAR
data['monthly_cash_out'] = data['CASH_SUM_OUT'] / MONTHS_PER_YEAR
data['monthly_wire_in'] = data['WIRES_SUM_IN'] / MONTHS_PER_YEAR
data['monthly_wire_out'] = data['WIRES_SUM_OUT'] / MONTHS_PER_YEAR

# Year-based features: only the calendar year is used, so months and days are
# ignored and each value can be off by up to one year.
data['diff_year'] = data['CUST_ADD_DT'].dt.year - data['BIRTH_DT'].dt.year
data['age'] = REFERENCE_YEAR - data['BIRTH_DT'].dt.year
data['cust_age_bank'] = REFERENCE_YEAR - data['CUST_ADD_DT'].dt.year

# Total incoming minus total outgoing across cash and wires.
data['total_money_left'] = data['CASH_SUM_IN'] + data['WIRES_SUM_IN'] - data['CASH_SUM_OUT'] - data['WIRES_SUM_OUT']

In [8]:
# Count the missing values in each column.
data.isna().sum()

BIRTH_DT                    0
CUST_ADD_DT                 0
OCPTN_NM                    0
RES_CNTRY_CA                0
CNTRY_OF_INCOME_CA          0
PEP_FL                      0
CASH_SUM_IN                 0
CASH_CNT_IN                 0
CASH_SUM_OUT                0
CASH_CNT_OUT                0
WIRES_SUM_IN                0
WIRES_CNT_IN                0
WIRES_SUM_OUT               0
WIRES_CNT_OUT               0
COUNTRY_RISK_INCOME         0
COUNTRY_RISK_RESIDENCY      0
RISK                        0
NAME                        0
GENDER                      0
CUSTOMER_ID                 0
cash_diff                   0
wire_diff                   0
cash_cnt_diff               0
wire_cnt_diff               0
avg_cash_transaction_out    0
avg_wire_transaction_out    0
avg_cash_transaction_in     0
avg_wire_transaction_in     0
monthly_cash_in             0
monthly_cash_out            0
monthly_wire_in             0
monthly_wire_out            0
diff_year                   0
age       

In [9]:
# Preview the first rows of the frame.
data.head()

Unnamed: 0,BIRTH_DT,CUST_ADD_DT,OCPTN_NM,RES_CNTRY_CA,CNTRY_OF_INCOME_CA,PEP_FL,CASH_SUM_IN,CASH_CNT_IN,CASH_SUM_OUT,CASH_CNT_OUT,...,avg_cash_transaction_in,avg_wire_transaction_in,monthly_cash_in,monthly_cash_out,monthly_wire_in,monthly_wire_out,diff_year,age,cust_age_bank,total_money_left
0,1981-09-01,2007-07-05,89.0,1,1,0.0,2578.785,3.0,2852.663,8.0,...,859.595,1123.412698,214.89875,237.721917,5897.916667,2020.416667,26,42,16,46256.122
1,1994-02-21,2019-05-19,89.0,1,1,0.0,3036.502,3.0,4806.997,16.0,...,1012.167333,3674.415584,253.041833,400.583083,23577.5,16370.0,25,29,4,84719.505
2,1962-11-16,2011-08-02,89.0,1,1,0.0,1618.571,2.0,3483.809,12.0,...,809.2855,1090.428571,134.880917,290.317417,636.083333,2799.916667,49,61,12,-27831.238
3,1998-06-20,2001-08-15,89.0,1,1,0.0,3588.042,6.0,1941.943,9.0,...,598.007,2072.972222,299.0035,161.828583,12437.833333,8824.75,3,25,22,45003.099
4,1942-01-24,2012-10-28,89.0,1,1,0.0,1726.524,3.0,13198.169,44.0,...,575.508,951.742268,143.877,1099.847417,7693.25,15617.0,70,81,11,-106556.645


In [10]:
# List the column names currently in the frame.
data.columns

Index(['BIRTH_DT', 'CUST_ADD_DT', 'OCPTN_NM', 'RES_CNTRY_CA',
       'CNTRY_OF_INCOME_CA', 'PEP_FL', 'CASH_SUM_IN', 'CASH_CNT_IN',
       'CASH_SUM_OUT', 'CASH_CNT_OUT', 'WIRES_SUM_IN', 'WIRES_CNT_IN',
       'WIRES_SUM_OUT', 'WIRES_CNT_OUT', 'COUNTRY_RISK_INCOME',
       'COUNTRY_RISK_RESIDENCY', 'RISK', 'NAME', 'GENDER', 'CUSTOMER_ID',
       'cash_diff', 'wire_diff', 'cash_cnt_diff', 'wire_cnt_diff',
       'avg_cash_transaction_out', 'avg_wire_transaction_out',
       'avg_cash_transaction_in', 'avg_wire_transaction_in', 'monthly_cash_in',
       'monthly_cash_out', 'monthly_wire_in', 'monthly_wire_out', 'diff_year',
       'age', 'cust_age_bank', 'total_money_left'],
      dtype='object')

In [11]:
# Drop the raw date columns and the customer identifier/name columns so they
# are not part of the feature set. Mutates `data`; running this cell a second
# time raises KeyError because the columns are already gone.
columns_to_drop = ['BIRTH_DT', 'CUST_ADD_DT', 'CUSTOMER_ID', 'NAME']
data.drop(columns=columns_to_drop, inplace=True)

In [12]:
# Load the occupation-risk lookup table.
oc=pd.read_csv('UofT_occupation_risk.csv')

In [13]:
# Replace each occupation code with its risk label from the lookup table.
# Codes that are absent from the lookup become NaN.
occupation_risk_by_code = oc.set_index("code")["occupation_risk"]
data["OCPTN_NM"] = data["OCPTN_NM"].map(occupation_risk_by_code)

In [14]:
# Class distribution of the RISK target.
data['RISK'].value_counts()


low       593204
medium    346153
high       49463
Name: RISK, dtype: int64

## Imbalanced: only about 5% of the samples are high risk

## One hot encode

I guess it does not really matter whether you encode before or after the split, since you are just converting categorical variables into binary indicators, unlike normalization and standardization, which learn statistics from the data.

In [15]:
# Target vector: the RISK label.
y = data['RISK']
# Feature matrix: every column except RISK.
x = data.drop(columns='RISK')

In [16]:
# (rows, columns) of the feature matrix before encoding.
x.shape

(988820, 31)

In [17]:
# One-hot encode the categorical feature columns.
# NOTE: this rebinds `data` to the encoded feature matrix; from here on `data`
# no longer contains the RISK column (the target lives in `y`).
data=pd.get_dummies(x)

In [18]:
# (rows, columns) of the encoded feature matrix.
data.shape

(988820, 38)

In [19]:
# Preview the first rows of the encoded feature matrix.
data.head()

Unnamed: 0,RES_CNTRY_CA,CNTRY_OF_INCOME_CA,PEP_FL,CASH_SUM_IN,CASH_CNT_IN,CASH_SUM_OUT,CASH_CNT_OUT,WIRES_SUM_IN,WIRES_CNT_IN,WIRES_SUM_OUT,...,OCPTN_NM_Low,OCPTN_NM_Moderate,COUNTRY_RISK_INCOME_High,COUNTRY_RISK_INCOME_Low,COUNTRY_RISK_INCOME_Moderate,COUNTRY_RISK_RESIDENCY_High,COUNTRY_RISK_RESIDENCY_Low,COUNTRY_RISK_RESIDENCY_Moderate,GENDER_Female,GENDER_Male
0,1,1,0.0,2578.785,3.0,2852.663,8.0,70775.0,63.0,24245.0,...,0,0,0,1,0,0,1,0,1,0
1,1,1,0.0,3036.502,3.0,4806.997,16.0,282930.0,77.0,196440.0,...,0,0,0,1,0,0,1,0,0,1
2,1,1,0.0,1618.571,2.0,3483.809,12.0,7633.0,7.0,33599.0,...,0,0,0,1,0,0,1,0,0,1
3,1,1,0.0,3588.042,6.0,1941.943,9.0,149254.0,72.0,105897.0,...,0,0,0,1,0,0,1,0,1,0
4,1,1,0.0,1726.524,3.0,13198.169,44.0,92319.0,97.0,187404.0,...,0,0,0,1,0,0,1,0,1,0


## Split in train-test

In [20]:
# Hold out 30% of the rows as the test split; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.30,
    random_state=42,
)

In [21]:
# Show the shapes of the four split parts on a single line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(692174, 38) (296646, 38) (692174,) (296646,)


In [22]:
# Class distribution in the test split.
y_test.value_counts()

low       178064
medium    103704
high       14878
Name: RISK, dtype: int64

In [23]:
# Class distribution in the training split.
y_train.value_counts()

low       415140
medium    242449
high       34585
Name: RISK, dtype: int64

## From now on, work only with X_train and y_train, and extract only the high and medium classes.

1st subset: undersample to get a balanced dataset.

## Undersample the x_train

In [24]:
from imblearn.under_sampling import RandomUnderSampler

# Balance the three classes by randomly dropping rows of the larger classes.
# The sampler is seeded so that the balanced subset, and every metric computed
# from models trained on it, is the same on each Restart & Run All.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

In [25]:
# Preview the first rows of the undersampled training features.
X_resampled.head()

Unnamed: 0,RES_CNTRY_CA,CNTRY_OF_INCOME_CA,PEP_FL,CASH_SUM_IN,CASH_CNT_IN,CASH_SUM_OUT,CASH_CNT_OUT,WIRES_SUM_IN,WIRES_CNT_IN,WIRES_SUM_OUT,...,OCPTN_NM_Low,OCPTN_NM_Moderate,COUNTRY_RISK_INCOME_High,COUNTRY_RISK_INCOME_Low,COUNTRY_RISK_INCOME_Moderate,COUNTRY_RISK_RESIDENCY_High,COUNTRY_RISK_RESIDENCY_Low,COUNTRY_RISK_RESIDENCY_Moderate,GENDER_Female,GENDER_Male
0,1,1,0.0,31046.093,18.0,44605.0,53.0,5628865.0,135.0,1943135.0,...,1,0,0,1,0,0,1,0,1,0
1,1,1,1.0,2755.352,3.0,45838.0,46.0,4630133.0,164.0,2272641.0,...,0,0,0,1,0,0,1,0,1,0
2,1,1,1.0,808.439,1.0,1850.0,8.0,7127475.0,134.0,3572961.0,...,0,0,0,1,0,0,1,0,1,0
3,1,1,1.0,15417.038,7.0,38640.0,30.0,2569453.0,147.0,2446838.0,...,1,0,0,1,0,0,1,0,1,0
4,1,1,0.0,34726.599,24.0,73759.0,88.0,1028779.0,76.0,418566.0,...,0,0,0,1,0,0,1,0,1,0


In [26]:
# Class distribution after undersampling.
y_resampled.value_counts()

high      34585
low       34585
medium    34585
Name: RISK, dtype: int64

Default parameters of `HistGradientBoostingClassifier`, for reference: loss='log_loss', *, learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0.0, max_bins=255, categorical_features=None, monotonic_cst=None, interaction_cst=None, warm_start=False, early_stopping='auto', scoring='loss', validation_fraction=0.1, n_iter_no_change=10, tol=1e-07, verbose=0, random_state=None, class_weight=None

In [27]:
# Stage-1 model: three-class classifier trained on the balanced training data.
# random_state is fixed because the estimator draws a random validation split
# for early stopping (enabled automatically on training sets of this size), so
# an unseeded fit gives different models and metrics on each run.
model_1 = HistGradientBoostingClassifier(random_state=42)
model_1.fit(X_resampled, y_resampled)

HistGradientBoostingClassifier()

## Evaluate model 1 on x_test

In [28]:
# Score the held-out test split with model_1 and print the per-class
# precision / recall / F1 report.
y_pred_model_1 = model_1.predict(X_test)
model_1_report = classification_report(y_test, y_pred_model_1)
print(model_1_report)

              precision    recall  f1-score   support

        high       0.42      0.91      0.57     14878
         low       1.00      1.00      1.00    178064
      medium       0.98      0.82      0.89    103704

    accuracy                           0.93    296646
   macro avg       0.80      0.91      0.82    296646
weighted avg       0.97      0.93      0.94    296646



## 2nd subset of data is high-medium

In [29]:
# Keep only the training rows whose label is 'high' or 'medium'.
is_high_or_medium = y_train.isin(['high', 'medium'])
high_medium_y_train = y_train[is_high_or_medium]
high_medium_index = high_medium_y_train.index
# Select the matching feature rows by index label.
high_medium_x_train = X_train.loc[high_medium_index]
print(high_medium_x_train.shape, high_medium_y_train.shape)

(277034, 38) (277034,)


In [30]:
# Sampler for the high/medium subset; seeded so that the rows kept from the
# larger class are the same on every run.
rus2 = RandomUnderSampler(random_state=42)

In [31]:
# Undersample the high/medium training subset with rus2.
x_resampled_high, y_resampled_high=rus2.fit_resample(high_medium_x_train,high_medium_y_train)

In [32]:
# Class distribution of the high/medium subset after undersampling.
y_resampled_high.value_counts()

high      34585
medium    34585
Name: RISK, dtype: int64

## Split dataset into train-test

In [33]:
# Hold out 20% of the balanced high/medium subset for evaluating model_2,
# stratified on the label so both parts keep the same class proportions.
x_med_high_train, x_med_high_test, y_med_high_train, y_med_high_test = train_test_split(
    x_resampled_high,
    y_resampled_high,
    test_size=0.20,
    random_state=42,
    stratify=y_resampled_high,
)

In [34]:
# Show the shapes of the four high/medium split parts on a single line.
print(*(part.shape for part in (x_med_high_train, x_med_high_test, y_med_high_train, y_med_high_test)))

(55336, 38) (13834, 38) (55336,) (13834,)


In [35]:
# Class distribution in the high/medium training part.
y_med_high_train.value_counts()

medium    27668
high      27668
Name: RISK, dtype: int64

In [36]:
# Stage-2 model: binary high-vs-medium classifier trained on the balanced
# high/medium subset. random_state is fixed because the estimator draws a
# random validation split for early stopping (enabled automatically on
# training sets of this size), so an unseeded fit is not reproducible.
model_2 = HistGradientBoostingClassifier(random_state=42)
model_2.fit(x_med_high_train, y_med_high_train)

HistGradientBoostingClassifier()

## Evaluate model 2 on x_med_high_test

In [37]:
# Score the held-out high/medium part with model_2 and print the per-class
# precision / recall / F1 report.
y_pred_2 = model_2.predict(x_med_high_test)
model_2_report = classification_report(y_med_high_test, y_pred_2)
print(model_2_report)

              precision    recall  f1-score   support

        high       0.83      0.92      0.87      6917
      medium       0.91      0.82      0.86      6917

    accuracy                           0.87     13834
   macro avg       0.87      0.87      0.87     13834
weighted avg       0.87      0.87      0.87     13834



In [38]:
# ## if model 1 gives medium run the results from model 2 and take its output and append to y_pred
# ## Else just append model 1 results to y_pred
# y_pred=[]
# for idx, row in (tqdm(X_test.iterrows())):
#     row_data = row.values.reshape(1, -1)
#     row_df = pd.DataFrame(row_data, columns=row.index)
#     output=model_1.predict(row_df)[0]
#     if output=='medium':
#       output_2=model_2.predict(row_df)[0]
#       y_pred.append(output_2)
#     else:
#       y_pred.append(output)

In [39]:
# Stage-1 predictions for the full test split. This repeats the prediction
# already computed when model_1 was evaluated, so the name is simply rebound.
y_pred_model_1=model_1.predict(X_test)

In [40]:
# Two-stage prediction: every test row that model_1 labels 'medium' is
# re-scored by model_2 (high vs. medium); all other rows keep model_1's label.

# Stage-1 labels, indexed like X_test.
out = pd.DataFrame({'model_1': y_pred_model_1}, index=X_test.index)
# Select only the rows where model_1 predicts 'medium'.
med = out[out['model_1'] == 'medium']
# Pull the matching feature rows from X_test.
idx = med.index
med_x_test = X_test.loc[idx]
# Pass these rows through model_2.
y_pred_model_2 = model_2.predict(med_x_test)
out_2 = pd.DataFrame({'model_2': y_pred_model_2}, index=idx)
# Outer join on the index: rows not re-scored by model_2 get NaN in 'model_2'.
df = out.merge(out_2, how='outer', left_index=True, right_index=True)
# Fill those gaps with model_1's label, since model_2 only ever outputs
# 'medium' or 'high'. The result is assigned back rather than filled with
# `inplace=True` on the selected column: that chained in-place form emits a
# FutureWarning in pandas 2.2 and leaves `df` unchanged under Copy-on-Write,
# which would leave NaN in the final predictions.
df['model_2'] = df['model_2'].fillna(df['model_1'])
# Sort by index so the predictions line up with the sorted labels below.
df = df.sort_index()
# Final combined predictions.
y_pred = df['model_2']
# Sort the true labels the same way. Rebinding, rather than sorting in place,
# keeps this cell free of hidden mutation of the split result.
y_test = y_test.sort_index()

## Check if two dataframe index are equal

In [41]:
# Check that the true labels and the combined predictions share the same
# index in the same order, so they can be compared position by position.
y_test.index.equals(df.index)

True

In [42]:
# Per-class precision / recall / F1 for the combined two-stage predictions.
final_report = classification_report(y_test, y_pred)
print(final_report)

              precision    recall  f1-score   support

        high       0.40      0.93      0.56     14878
         low       1.00      1.00      1.00    178064
      medium       0.99      0.80      0.88    103704

    accuracy                           0.93    296646
   macro avg       0.80      0.91      0.81    296646
weighted avg       0.97      0.93      0.94    296646



## Recall improves at the cost of precision - is this the inverse relationship?

## Understand where is the model going wrong.

In [43]:
# Confusion matrix of the combined predictions.
# Rows (true class) are ordered high, low, medium.
# Columns (predicted class) are ordered high, low, medium.
final_confusion = confusion_matrix(y_test, y_pred)
print(final_confusion)

[[ 13802     31   1045]
 [   123 177941      0]
 [ 20713     43  82948]]


## Notes on confusion matrix

* The model puts around 21k medium-risk samples into the high-risk class, which is what reduces the precision for high risk.
* Only 31 samples from the high-risk class are misclassified as low (which is good).
* 1045 high-risk samples are misclassified as medium, which is still preferable, since classifying a high-risk sample as low is much worse than classifying it as medium.



In [44]:
# Fit a random forest on the balanced training data in order to read off its
# feature importances. random_state is fixed because the forest is built from
# random bootstrap samples and random feature subsets, so an unseeded fit
# yields a different importance ranking on every run.
test_model = RandomForestClassifier(random_state=42)
test_model.fit(X_resampled, y_resampled)
importances = test_model.feature_importances_
# One row per feature: its column name and its importance score.
feature = pd.DataFrame({'feature_name': X_resampled.columns, 'score': importances})

In [45]:
# Order the features from least to most important.
feature = feature.sort_values(by='score', ascending=True)

In [46]:
# Display the importance table (sorted by ascending score).
feature

Unnamed: 0,feature_name,score
32,COUNTRY_RISK_INCOME_Moderate,3.8e-05
35,COUNTRY_RISK_RESIDENCY_Moderate,5.8e-05
30,COUNTRY_RISK_INCOME_High,0.000172
31,COUNTRY_RISK_INCOME_Low,0.000201
1,CNTRY_OF_INCOME_CA,0.000306
34,COUNTRY_RISK_RESIDENCY_Low,0.000399
33,COUNTRY_RISK_RESIDENCY_High,0.000418
29,OCPTN_NM_Moderate,0.00051
0,RES_CNTRY_CA,0.000693
37,GENDER_Male,0.001186


In [47]:
# out=pd.DataFrame({'true':y_test.values,'pred':y_pred_model_1},index=y_test.index)

In [48]:
# misclassified=out[out['pred']!=out['true']]

In [49]:
# correct=out[out['pred']==out['true']]

In [50]:
# correct.value_counts()

In [51]:
# y_test.value_counts()

In [52]:
# out['pred'].value_counts()

In [53]:
# l=(180212-180145)
# m=(104916-86163)
# h=(14872-33692)
# print(l,m,h)

When Y_true is low what are the misclassified ones.

In [54]:
# y_true_low=misclassified[misclassified['true']=='low']
# y_true_low['pred'].value_counts()

## when y_true is medium what are the misclassified ones.

In [55]:
# y_true_high=misclassified[misclassified['true']=='medium']
# y_true_high['pred'].value_counts()

This means most of medium risk people are getting classified into high risk. Which is still okay and better than low risk.

In [56]:
# y_true_high=misclassified[misclassified['true']=='high']
# y_true_high['pred'].value_counts()

So the model is still doing reasonably well, as it assigns more of these samples to medium than to low.

In [57]:
# misclassified['true'].value_counts()

In [58]:
# misclassified['pred'].value_counts()

In [59]:
# outliers=misclassified.index

In [60]:
# df=data.loc[outliers]

In [61]:
# df['RISK'].value_counts()

In [62]:
# conf_matrix=confusion_matrix(y_test,y_pred)
# print(conf_matrix)

# ##high low medium

In [63]:
# import numpy as np
# def extract_tp_fp_fn_tn_for_each_class(confusion_matrix):
#     num_classes = confusion_matrix.shape[0]
#     tp = np.zeros(num_classes, dtype=int)
#     fp = np.zeros(num_classes, dtype=int)
#     fn = np.zeros(num_classes, dtype=int)
#     tn = np.zeros(num_classes, dtype=int)
#     for i in range(num_classes):
#         # tp[i] = confusion_matrix[i][i]
#         fp[i] = np.sum(confusion_matrix[:, i]) - tp[i]
#         fn[i] = np.sum(confusion_matrix[i, :]) - tp[i]
#         tn[i] = np.sum(confusion_matrix) - tp[i] - fp[i] - fn[i]
#     return tp, fp, fn, tn


In [64]:
# tp,fp,fn,tn=extract_tp_fp_fn_tn_for_each_class(conf_matrix)

In [65]:
# tp

In [66]:
# fp

In [67]:
# fn

In [68]:
# tn