In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the fraud-detection CSV from the Kaggle input directory into `df`.
DATA_PATH = '/kaggle/input/fraud-detection-bank-dataset-20k-records-binary/fraud_detection_bank_dataset.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Remove the 'Unnamed: 0' column so it is not used as a feature.
# NOTE(review): presumably this is the row index saved into the CSV — confirm.
# errors='ignore' keeps the cell re-runnable: once the column is gone, a second
# run is a no-op instead of raising KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Summary statistics per column, using pandas' default column selection.
df.describe(include=None)

# 1. Choose the model with PyCaret

In [None]:
pip install pycaret

In [None]:
# Import only the PyCaret entry points this notebook calls rather than `import *`,
# so the origin of each name is explicit and no unrelated names enter the namespace.
from pycaret.classification import (
    compare_models,
    create_model,
    interpret_model,
    plot_model,
    setup,
    tune_model,
)

In [None]:
# Configure the PyCaret classification experiment on the full frame with
# 'targets' as the label column.
# session_id fixes the experiment's random seed so results are repeatable across
# kernel restarts (10 matches the random_state of the manual split later on).
# NOTE(review): `silent=True` is kept from the original; confirm the installed
# PyCaret release still accepts it.
exp1 = setup(df, target='targets', ignore_features=None, silent=True, session_id=10)

In [None]:
# Run PyCaret's model comparison on the experiment configured above; the
# CatBoost choice noted below is based on this cell's output.
compare_models()

# I chose the CatBoost Classifier as the model

# 2. Create the model

In [None]:
# Build the model PyCaret identifies as 'catboost'; kept as `cb` so it can be tuned next.
cb = create_model('catboost')

In [None]:
# Tune the CatBoost model; `tuned_cb` is the estimator every later cell reuses
# (plots, refit on the manual split, threshold sweep).
tuned_cb = tune_model(cb)

In [None]:
# Draw PyCaret's default plot for the tuned model (no `plot=` argument is given).
plot_model(tuned_cb)

In [None]:
# Show PyCaret's interpretation output for the tuned model.
# NOTE(review): presumably SHAP-based and requires the `shap` package — confirm.
interpret_model(tuned_cb)

# 3. Tune the threshold to reach precision above 95% with the highest possible recall

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the label column ('targets') from the remaining feature columns.
y = df['targets']
X = df.drop(columns='targets')

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split repeatable.
# NOTE(review): the split is not stratified — consider `stratify=y` if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=10,
)

In [None]:
def make_cm(matrix, columns):
    """Wrap a confusion matrix in a DataFrame with two-level axis labels.

    Parameters
    ----------
    matrix : array-like
        Square table of counts, one row and one column per entry of `columns`.
    columns : list
        Class labels, used for both the row and the column axis.

    Returns
    -------
    pandas.DataFrame
        Rows are keyed by ('correct data', label) and columns by
        ('predict data', label).
    """
    size = len(columns)

    # Each axis is a pair of parallel lists: the repeated group title and the class labels.
    row_axis = [['correct data'] * size, columns]
    col_axis = [['predict data'] * size, columns]

    return pd.DataFrame(matrix, index=row_axis, columns=col_axis)

In [None]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

# Refit the tuned model on the training split, then predict labels for the
# hold-out rows with the model's own `predict`.
tuned_cb.fit(X_train, y_train)
y_pred = tuned_cb.predict(X_test)

# Confusion matrix of true vs. predicted labels, with axis titles added by make_cm.
df_matrix = make_cm(confusion_matrix(y_test, y_pred), ['0', '1'])
display(df_matrix)

# Precision / recall / F-score computed with average='binary'; the fourth
# returned value is not used.
precision, recall, fscore, _ = precision_recall_fscore_support(
    y_test, y_pred, average='binary'
)
print(f' precision: {precision:.4f}  recall: {recall: .4f}  Fscore:  {fscore:.4f}')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Column 1 of predict_proba for every hold-out row, taken as the class-1 score.
y_proba0 = tuned_cb.predict_proba(X_test)[:, 1]

# Split the scores by true label so the two distributions can be overlaid.
y0 = y_proba0[y_test == 0]
y1 = y_proba0[y_test == 1]

# sns.distplot is deprecated; histplot(stat='density') is the replacement for
# distplot(kde=False, norm_hist=True). alpha=0.4 keeps the bars translucent so
# the overlap between the two classes stays visible.
fig, ax = plt.subplots(figsize=(6, 6))
sns.histplot(y1, stat='density', bins=50, color='b', alpha=0.4, label='1', ax=ax)
sns.histplot(y0, stat='density', bins=50, color='k', alpha=0.4, label='0', ax=ax)
ax.set_title('Bank Analysis')
ax.set_xlabel('thres')
ax.legend()
plt.show()

In [None]:
def pred(tuned_cb,X,thres):
  """Turn class-1 probabilities into 0/1 labels using a custom cut-off.

  Parameters
  ----------
  tuned_cb : object exposing `predict_proba(X)`; column 1 of its result is
      used as the class-1 score.
  X : feature rows passed straight through to `predict_proba`.
  thres : a row is labelled 1 only when its score is strictly greater than this value.

  Returns
  -------
  Integer array with one 0/1 label per row.
  """
  positive_scores = tuned_cb.predict_proba(X)[:, 1]
  return (positive_scores > thres).astype(int)


# Candidate decision thresholds: 0.80 down to 0.05 in steps of 0.05
# (the stop value 0 itself is excluded by np.arange).
thres_list = np.arange(0.8, 0, -0.05)

# Report hold-out precision / recall / F-score at each threshold.
for thres in thres_list:
  y_pred = pred(tuned_cb, X_test, thres)
  pred_sum = y_pred.sum()  # number of rows labelled 1 at this threshold
  # zero_division=0 reports 0 without a warning when a threshold labels no row as 1.
  precision, recall, fscore, _ = precision_recall_fscore_support(
      y_test, y_pred, average='binary', zero_division=0)
  # The original format string ended with a stray ')' after the F-score; removed.
  print(f' thres : {thres:.2f} pred_sum : {pred_sum} precision: {precision:.4f}  recall: {recall: .4f}  Fscore:  {fscore:.4f}')

# With the CatBoost Classifier, a threshold of 0.75 gives a precision of 0.95 and the highest recall (0.75)