In [27]:
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from lightgbm import LGBMClassifier


# import seaborn as sns




warnings.filterwarnings('ignore')


In [14]:
df = pd.read_csv('./kaggle/credit_card_fraud_detection/creditcard.csv')

def get_preprocessed_df(df=None):
    df_copy = df.copy()
    df_copy.drop('Time', axis=1, inplace=True)
    return df_copy

def get_train_test_dataset(df=None):
    df_copy = get_preprocessed_df(df)
    X = df_copy.iloc[:, :-1]
    Y = df_copy.iloc[:, -1]
    X_train, X_test, Y_train, Y_test = train_test_split(X ,Y, test_size=0.3, stratify=Y)
    return X_train, X_test, Y_train, Y_test

def get_clf_eval(Y_test, pred, pred_proba=None):
    confusion = confusion_matrix(Y_test, pred)
    acc = accuracy_score(Y_test, pred)
    precision = precision_score(Y_test, pred)
    recall = recall_score(Y_test, pred)
    f1 = f1_score(Y_test, pred)
    roc_auc = roc_auc_score(Y_test, pred_proba)
    print('Confusion matrix')
    print(confusion)
    print('acc: {:.4f}, precision: {:.4f}, recall: {:.4f}, F1: {:.4f}, roc_auc: {:.4f}'.format(acc, precision, recall, f1, roc_auc))
    # print(f'acc: {acc:.4f}, precision: {precision:.4f}, recall: {recall:.4f}, F1: {f1:.4f}')

In [16]:
X_train, X_test, Y_train, Y_test = get_train_test_dataset(df)

In [18]:
lr_clf = LogisticRegression(max_iter=1000)
lr_clf.fit(X_train, Y_train)
lr_pred = lr_clf.predict(X_test)
lr_pred_proba = lr_clf.predict_proba(X_test)[:, 1]
get_clf_eval(Y_test, lr_pred, lr_pred_proba)

Confusion matrix
[[85282    13]
 [   54    94]]
acc: 0.9992, precision: 0.8785, recall: 0.6351, F1: 0.7373, roc_auc: 0.9734


In [24]:
def get_model_train_eval(model, X_train, X_test, Y_train, Y_test):
    model.fit(X_train, Y_train)
    pred = model.predict(X_test)
    pred_proba = model.predict_proba(X_test)[:, -1]
    get_clf_eval(Y_test, pred, pred_proba)

In [20]:
get_model_train_eval(lr_clf, X_train, X_test, Y_train, Y_test)

Confusion matrix
[[85282    13]
 [   54    94]]
acc: 0.9992, precision: 0.8785, recall: 0.6351, F1: 0.7373, roc_auc: 0.9734


In [25]:
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=1, boost_from_average=False)
get_model_train_eval(lgbm_clf, X_train, X_test, Y_train, Y_test)

Confusion matrix
[[85287     8]
 [   32   116]]
acc: 0.9995, precision: 0.9355, recall: 0.7838, F1: 0.8529, roc_auc: 0.9806


In [None]:
# sns.histplot(df['Amount', bins=100, kde=True])

In [30]:
def get_preprocessed_df(df=None):
    from sklearn.preprocessing import StandardScaler
    df_copy = df.copy()
    scaler = StandardScaler()
    amount_n = scaler.fit_transform(df_copy['Amount'].values.reshape(-1, 1))
    df_copy.insert(0, 'Amount_Scaled', amount_n)
    df_copy.drop('Time', axis=1, inplace=True)
    return df_copy
X_train, X_test, Y_train, Y_test = get_train_test_dataset(df)

In [33]:
lr_clf = LogisticRegression(max_iter=1000)
get_model_train_eval(lr_clf, X_train, X_test, Y_train, Y_test)

Confusion matrix
[[85282    13]
 [   54    94]]
acc: 0.9992, precision: 0.8785, recall: 0.6351, F1: 0.7373, roc_auc: 0.9732


In [34]:
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=1, boost_from_average=False)
get_model_train_eval(lgbm_clf, X_train, X_test, Y_train, Y_test)

Confusion matrix
[[85289     6]
 [   24   124]]
acc: 0.9996, precision: 0.9538, recall: 0.8378, F1: 0.8921, roc_auc: 0.9797


In [65]:
def get_preprocessed_df(df=None):
    import numpy as np

    df_copy = df.copy()
    
    amount_n = np.log1p(df_copy['Amount'].values.reshape(-1, 1))
    df_copy.insert(0, 'Amount_Scaled', amount_n)
    df_copy.drop('Time', axis=1, inplace=True)
    return df_copy
X_train, X_test, Y_train, Y_test = get_train_test_dataset(df)

In [40]:
lr_clf = LogisticRegression(max_iter=1000)
get_model_train_eval(lr_clf, X_train, X_test, Y_train, Y_test)

Confusion matrix
[[85282    13]
 [   51    97]]
acc: 0.9993, precision: 0.8818, recall: 0.6554, F1: 0.7519, roc_auc: 0.9746


In [41]:
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=1, boost_from_average=False)
get_model_train_eval(lgbm_clf, X_train, X_test, Y_train, Y_test)

Confusion matrix
[[85290     5]
 [   25   123]]
acc: 0.9996, precision: 0.9609, recall: 0.8311, F1: 0.8913, roc_auc: 0.9758


In [45]:
corr = df.corr()

In [59]:
def get_outlier(df, column, weight=1.5):
    import numpy as np
    fraud = df[df['Class']==1][column]
    q_25 = np.percentile(fraud.values, 25)
    q_75 = np.percentile(fraud.values, 75)
    print(q_25, q_75)
    iqr = q_75 - q_25
    iqr_weight = iqr * weight
    lowest_val = q_25 - iqr_weight
    highest_val = q_75 + iqr_weight
    outlier_index = fraud[(fraud < lowest_val) | (fraud > highest_val)].index
    return outlier_index

In [60]:
get_outlier(df, 'V14')

-9.692722964972386 -4.282820849486865


Int64Index([8296, 8615, 9035, 9252], dtype='int64')

In [67]:
def get_preprocessed_df(df=None):
    import numpy as np

    df_copy = df.copy()
    
    amount_n = np.log1p(df_copy['Amount'].values.reshape(-1, 1))
    df_copy.insert(0, 'Amount_Scaled', amount_n)
    df_copy.drop('Time', axis=1, inplace=True)
    outlier_index = get_outlier(df, 'V14')
    df_copy.drop(outlier_index, inplace=True)
    return df_copy
X_train, X_test, Y_train, Y_test = get_train_test_dataset(df)

-9.692722964972386 -4.282820849486865


In [68]:
lr_clf = LogisticRegression(max_iter=1000)
get_model_train_eval(lr_clf, X_train, X_test, Y_train, Y_test)

Confusion matrix
[[85287     8]
 [   56    90]]
acc: 0.9993, precision: 0.9184, recall: 0.6164, F1: 0.7377, roc_auc: 0.9681


In [69]:
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=1, boost_from_average=False)
get_model_train_eval(lgbm_clf, X_train, X_test, Y_train, Y_test)

Confusion matrix
[[85293     2]
 [   32   114]]
acc: 0.9996, precision: 0.9828, recall: 0.7808, F1: 0.8702, roc_auc: 0.9816


In [71]:
smote = SMOTE(random_state=0)
X_train_over, Y_train_over = smote.fit_resample(X_train, Y_train)
X_train_over.shape, Y_train_over.shape

((398040, 30), (398040,))

In [72]:
Y_train.value_counts(), Y_train_over.value_counts(), 

(0    199020
 1       342
 Name: Class, dtype: int64,
 0    199020
 1    199020
 Name: Class, dtype: int64)

In [73]:
lr_clf = LogisticRegression(max_iter=1000)
get_model_train_eval(lr_clf, X_train_over, X_test, Y_train_over, Y_test)

Confusion matrix
[[83798  1497]
 [   13   133]]
acc: 0.9823, precision: 0.0816, recall: 0.9110, F1: 0.1498, roc_auc: 0.9760


In [74]:
def precision_recall_curve_plot(y_test , pred_proba_c1):
    import matplotlib.pyplot as plt
    import matplotlib.ticker as tkicker
    from sklearn.metrics import precision_recall_curve

    # threshold ndarray와 이 threshold에 따른 정밀도, 재현율 ndarray 추출. 
    precisions, recalls, thresholds = precision_recall_curve( y_test, pred_proba_c1)
    
    # X축을 threshold값으로, Y축은 정밀도, 재현율 값으로 각각 Plot 수행. 정밀도는 점선으로 표시
    plt.figure(figsize=(8,6))
    threshold_boundary = thresholds.shape[0]
    plt.plot(thresholds, precisions[0:threshold_boundary], linestyle='--', label='precision')
    plt.plot(thresholds, recalls[0:threshold_boundary],label='recall')
    
    # threshold 값 X 축의 Scale을 0.1 단위로 변경
    start, end = plt.xlim()
    plt.xticks(np.round(np.arange(start, end, 0.1),2))
    
    # x축, y축 label과 legend, 그리고 grid 설정
    plt.xlabel('Threshold value'); plt.ylabel('Precision and Recall value')
    plt.legend(); plt.grid()
    plt.show()

precision_recall_curve_plot(Y_test, lr_clf.predict_proba(X_test)[:, 1])

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "d:\Users\Pringles\anaconda3\envs\hrd\lib\site-packages\IPython\core\interactiveshell.py", line 3398, in run_code
  File "C:\Users\Pringles\AppData\Local\Temp\ipykernel_12580\2560497996.py", line 24, in <cell line: 24>
    precision_recall_curve_plot(Y_test, lr_clf.predict_proba(X_test)[:, 1])
  File "C:\Users\Pringles\AppData\Local\Temp\ipykernel_12580\2560497996.py", line 2, in precision_recall_curve_plot
    import matplotlib.pyplot as plt
  File "d:\Users\Pringles\anaconda3\envs\hrd\lib\site-packages\matplotlib\pyplot.py", line 31, in <module>
    import matplotlib.colorbar
  File "d:\Users\Pringles\anaconda3\envs\hrd\lib\site-packages\matplotlib\colorbar.py", line 36, in <module>
    import matplotlib.contour as contour
  File "d:\Users\Pringles\anaconda3\envs\hrd\lib\site-packages\matplotlib\contour.py", line 23, in <module>
    import matplotlib.text as text
  File "d:\Users\Pringles\anaconda3\envs\hrd\lib\site-packages\matplotlib\text.p

In [75]:
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=1, boost_from_average=False)
get_model_train_eval(lgbm_clf, X_train_over, X_test, Y_train_over, Y_test)

Confusion matrix
[[85279    16]
 [   28   118]]
acc: 0.9995, precision: 0.8806, recall: 0.8082, F1: 0.8429, roc_auc: 0.9779
