<a href="https://colab.research.google.com/github/ryanhao1115/ML-for-Fraud-Detection/blob/main/5_4_RandomForest_SMOTE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [294]:
!pip install imbalanced-learn



## Random Forest model with SMOTE
1. import new return label data.
2. SMOTE to address imbalance class issue.
4. Train model
5. Eval model
6. Create Risk-heatmap

In [419]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix,accuracy_score
import random
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline


In [219]:
def reset_seeds():
    '''
    reset random seeds for modeling
    '''
    np.random.seed(1)
    random.seed(2)
    if tf.__version__[0] == '2':
        tf.random.set_seed(3)
    else:
        tf.set_random_seed(3)
    print("RANDOM SEEDS RESET")

In [5]:
## import dataset 
path = '/content/drive/MyDrive/Colab Notebooks/finalproject/'
file = path + 'sales_clean.csv'
df = pd.read_csv(file)

In [6]:
df = df.drop(columns='Unnamed: 0')

In [369]:
df.nunique()

distributor        639
sales              209
branch              28
inv_type             3
invoice_no       30721
product_no        1094
prod_cla             7
qty                202
total_amt        16112
sale_price        8598
ship_qty           202
cust_type           20
return               2
discount_app         2
list_price        6720
inv_ship_days      133
fraud                3
dtype: int64

## import return label data

In [8]:
## import fraud data
file = path + 'frauds.csv'
df_fraud = pd.read_csv(file,header=None)
df_fraud.columns = ['invoice_no']

In [9]:
## import return new labeled data
file = path + 'return.csv'
df_fraud2 = pd.read_csv(file,header=None)
df_fraud2.columns = ['invoice_no']

In [10]:
len(df_fraud2)

24

In [11]:
def label_fraud(df, df_fraud, df_fraud2):
  df['fraud'] = np.zeros(len(df))
  frauds_l = df_fraud['invoice_no'].to_list()
  frauds_l2 = df_fraud2['invoice_no'].to_list()
  df.loc[df['invoice_no'].isin(frauds_l),'fraud'] = 1
  df.loc[df['invoice_no'].isin(frauds_l2),'fraud'] = 2
  return df

In [12]:
df = label_fraud(df, df_fraud,df_fraud2)

In [13]:
df['fraud'].value_counts()

0.0    228014
2.0       433
1.0       154
Name: fraud, dtype: int64

## Encoding

In [317]:
df.nunique()

distributor        639
sales              209
branch              28
inv_type             3
invoice_no       30721
product_no        1094
prod_cla             7
qty                202
total_amt        16112
sale_price        8598
ship_qty           202
cust_type           20
return               2
discount_app         2
list_price        6720
inv_ship_days      133
fraud                3
dtype: int64

In [38]:
cols = df.columns.to_list()
cols = ['distributor', 'sales', 'branch', 'inv_type', 'invoice_no', 'product_no', 'prod_cla', 'cust_type']

In [248]:
## remove some fields with many classes
cols1 = ['distributor', 'sales', 'invoice_no', 'product_no']
df_less = df.drop(columns=cols1)

In [249]:
def field_onehotencoding(df):
  cols = [ 'branch', 'inv_type',  'prod_cla', 'cust_type']
  df[cols] = df[cols].astype('str') 
  df_enc = pd.get_dummies(df)
  return df_enc

In [250]:
df_less = field_onehotencoding(df_less)

In [251]:
df_less.shape

(228601, 67)

## Splite Train Test data


In [302]:
df_fraud = df_less[df_less['fraud'] == 1]
df_non_fraud = df_less[df_less['fraud'] == 0]
df_fraud2 = df_less[df_less['fraud'] == 2]

In [311]:
df_fraud2.shape

(433, 67)

In [303]:
df_non_fraud.shape

(228014, 67)

In [307]:
df_non_fraud.nunique()

qty                 202
total_amt         15896
sale_price         8237
ship_qty            202
return                2
                  ...  
cust_type_KX01        2
cust_type_LX01        2
cust_type_MX01        2
cust_type_NX01        2
cust_type_ZZ01        2
Length: 67, dtype: int64

In [312]:
def sampling_record(df_non_fraud,df_fraud,df_fraud2):
  '''
  Random sampling from unlabled records.
  concat with labeled records
  '''
  df = df_non_fraud.sample(frac=1,replace=False)

  sample_size = len(df_non_fraud) - 8000
 
  ## Build training dataset
  df_train = df.iloc[:sample_size,:]
  df_train = pd.concat([df_train,df_fraud2])
  df_train.loc[df_train['fraud'] == 2, 'fraud'] = 1 
  df_train = df_train.sample(frac=1,replace=False)
  print(len(df_train))

  ## Build eval dataset
  df_test = df.iloc[sample_size:,:]
  df_test = pd.concat([df_test,df_fraud])
  df_test = df_test.sort_index()
  print(len(df_test))

  return df_train, df_test


In [393]:
df_train, df_test = sampling_record(df_non_fraud, df_fraud, df_fraud2)

220447
8154


In [394]:
df_train.shape

(220447, 67)

In [395]:
df_test.shape

(8154, 67)

In [315]:
df_train.nunique()

qty                 202
total_amt         15737
sale_price         8375
ship_qty            202
return                2
                  ...  
cust_type_KX01        2
cust_type_LX01        2
cust_type_MX01        2
cust_type_NX01        2
cust_type_ZZ01        2
Length: 67, dtype: int64

## SMOTE 

In [396]:
y_train = df_train['fraud'].values
X_train = df_train.drop(columns=['fraud'])

In [397]:
# define pipeline
over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# transform the dataset
X_train, y_train = pipeline.fit_resample(X_train, y_train)



In [398]:
X_train.shape

(66003, 66)

In [371]:
y_train.sum()

22001.0

In [378]:
y_train.shape

(66003,)

In [402]:
X_test.shape

(8154, 66)

## Standardscale data

In [400]:
scaler = StandardScaler()

In [401]:
y_test = df_test['fraud'].values
X_test = df_test.drop(columns=['fraud'])

In [403]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [325]:
X_train.shape

(66003, 66)

In [326]:
y_test.shape

(8154, 1)

In [327]:
y_train.sum()

22001.0

## Build model

In [410]:
# define the  model
RF = RandomForestClassifier(random_state = 66)
score = cross_val_score(RF,X_train,y_train,cv=10).mean()
print('交叉验证得分: %.4f'%score)

交叉验证得分: 0.9985


In [404]:
## RandomForest oob_score test

RF1 = RandomForestClassifier(n_estimators=25,oob_score=True)
RF1 =RF1.fit(X_train,y_train)

RF1.oob_score_

0.9983940123933761

## Tuning hyperparameter 

In [409]:
RF = RandomForestClassifier(random_state = 66)
score = cross_val_score(RF,X_train,y_train,cv=10).mean()
print('基尼系数得分: %.4f'%score)
RF = RandomForestClassifier(criterion = 'entropy',random_state = 66)
score = cross_val_score(RF,X_train,y_train,cv=10).mean()
print('熵得分: %.4f'%score)

熵得分: 0.9985


In [411]:
## Tuning n_estimators
for i in range(50,150,10):
    DT = RandomForestClassifier(n_estimators = i,random_state = 66) #,criterion = 'entropy'
    score = cross_val_score(DT,X_train,y_train,cv=10).mean()
    print('n_estimators: ',i,'   score:',score)

n_estimators:  50    score: 0.9984850389977643
n_estimators:  60    score: 0.998515332846722
n_estimators:  70    score: 0.9984547382627976
n_estimators:  80    score: 0.9985001859222432
n_estimators:  90    score: 0.9985153374373947
n_estimators:  100    score: 0.9985304866572099
n_estimators:  110    score: 0.9985153374373947
n_estimators:  120    score: 0.9985001859222432
n_estimators:  130    score: 0.9985001859222432
n_estimators:  140    score: 0.998485036702428


In [414]:
## Tuning max_depth
for i in range(30,50,5):
    DT = RandomForestClassifier(n_estimators = 100,random_state = 66,max_depth =i ) 
    score = cross_val_score(DT,X_train,y_train,cv=10).mean()
    print('max_depth: ',i,'   score:',score)

max_depth:  30    score: 0.9985304866572099
max_depth:  35    score: 0.9985304866572099
max_depth:  40    score: 0.9985304866572099
max_depth:  45    score: 0.9985304866572099


In [416]:
RF = RandomForestClassifier(n_estimators=100, max_depth=25)
RF.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=25, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

## Predict on eval dataset for further labeling.

In [405]:
y_pred = RF1.predict(X_test)

In [417]:
y_pred = RF.predict(X_test)

In [420]:
accuracy_score(y_test,y_pred)

0.9834437086092715

In [406]:
def eval_model(y_test, y_pred):
  print(roc_auc_score(y_test, y_pred))
  print(classification_report(y_test, y_pred))
  print(confusion_matrix(y_test, y_pred))


In [418]:
eval_model(y_test,y_pred)

0.5744253246753247
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      8000
         1.0       0.85      0.15      0.25       154

    accuracy                           0.98      8154
   macro avg       0.92      0.57      0.62      8154
weighted avg       0.98      0.98      0.98      8154

[[7996    4]
 [ 131   23]]


In [407]:
eval_model(y_test, y_pred)

0.5711785714285714
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      8000
         1.0       0.85      0.14      0.24       154

    accuracy                           0.98      8154
   macro avg       0.91      0.57      0.62      8154
weighted avg       0.98      0.98      0.98      8154

[[7996    4]
 [ 132   22]]


In [290]:
eval_model(y_test, y_cls)

0.6254853896103897
              precision    recall  f1-score   support

         0.0       0.99      0.97      0.98      8000
         1.0       0.16      0.28      0.20       154

    accuracy                           0.96      8154
   macro avg       0.57      0.63      0.59      8154
weighted avg       0.97      0.96      0.96      8154

[[7774  226]
 [ 111   43]]


In [368]:
eval_model(y_test, y_cls)

0.5975243506493507
              precision    recall  f1-score   support

         0.0       0.98      0.99      0.99      8000
         1.0       0.38      0.20      0.26       154

    accuracy                           0.98      8154
   macro avg       0.68      0.60      0.63      8154
weighted avg       0.97      0.98      0.98      8154

[[7950   50]
 [ 123   31]]


In [422]:
len(RF.feature_importances_)

66

In [425]:
df_eval = df_test.drop(columns='fraud')

In [430]:
feature_imp = pd.Series(RF.feature_importances_,index=df_eval.columns).sort_values(ascending=False)
feature_imp.head(10)

inv_ship_days       0.199970
branch_720006       0.158623
sale_price          0.098963
qty                 0.072609
list_price          0.049043
cust_type_AB02      0.043817
branch_720019       0.042134
total_amt           0.041865
prod_cla_11020.0    0.028836
branch_720036       0.023003
dtype: float64

In [362]:
def export_result(df_test,y_cls, y_pred):
  '''
  Attach the predicte result (probability) into original records.
  Export to a csv file for further investication
  '''
  df_test['predict'] = y_cls
  df_test['prob']  = y_pred
  path = '/content/drive/MyDrive/Colab Notebooks/finalproject/'
  file_csv = path + 'pred_result.csv'
  df_test.to_csv(file_csv)
  return True


In [364]:
export_result(df_test,y_cls,y_pred)

True