<a href="https://colab.research.google.com/github/ryanhao1115/ML-for-Fraud-Detection/blob/main/4_1_activelearning_ordinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Mount Google Drive (Colab only) so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Active Learning with Ordinal encoding data
1. Ordinal-encode the categorical fields
2. Process the labeled fraud invoices
3. Sample data for active learning
4. Train the model
5. Choose the top 100 most uncertain invoices
6. Get back the labeled data

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense

In [104]:
# Load sales_clean.csv from the project folder on Google Drive.
# `path` is reused by the fraud-import cell further down.
path = '/content/drive/MyDrive/Colab Notebooks/finalproject/'
file = f'{path}sales_clean.csv'
df = pd.read_csv(file)

In [107]:
# Drop the 'Unnamed: 0' column (presumably a row index saved into the CSV).
# errors='ignore' keeps this cell re-runnable: `df` is rebound here, so a second
# run would otherwise raise KeyError because the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [116]:
# Peek at the last rows of the sales table.
# NOTE(review): the saved output already shows a `fraud` column, which is only
# added by label_fraud in a later cell, so this cell was presumably last run
# out of order -- confirm with Restart & Run All.
df.tail()

Unnamed: 0,distributor,sales,branch,inv_type,invoice_no,product_no,prod_cla,qty,total_amt,sale_price,ship_qty,cust_type,return,discount_app,list_price,inv_ship_days,fraud
228596,A15020494,PJ036429,720018,Z038,2111414500,BFD37B45,11020.0,1,83.59,83.59,1,AA02,0,1,94.46,3,0.0
228597,A15020494,PJ036429,720018,Z038,2111414500,BRC1H611,11020.0,1,417.95,417.95,1,AA02,0,1,472.28,3,0.0
228598,A15020494,PJ036429,720018,Z038,2111414500,FPDAP28BAP,11020.0,1,4188.73,4188.73,1,AA02,0,1,4733.26,3,0.0
228599,A15050131,PJ040864,720037,Z038,2111414525,BRC301B611,11050.0,1,398.23,398.23,1,AB02,0,1,450.0,4,0.0
228600,A15050131,PJ040864,720037,Z038,2111414525,VAML350HV1,11050.0,1,7654.87,7654.87,1,AB02,0,1,8650.0,4,0.0


## Import label data and sampling

In [111]:
# Load the fraud invoice list; header=None because the CSV is read without a header row.
file = f'{path}frauds.csv'
df_fraud = pd.read_csv(file, header=None)

In [113]:
# The file was read without a header, so give its single column an explicit name.
df_fraud.columns = ['invoice_no']

In [114]:
def label_fraud(df, df_fraud):
  '''
  Return a copy of `df` with a `fraud` column flagging fraudulent invoices.

  Every row whose `invoice_no` appears in `df_fraud['invoice_no']` gets
  fraud = 1.0, every other row gets 0.0. The flag is set per row, so all
  rows sharing a flagged invoice number are marked.

  Parameters
  ----------
  df : DataFrame with an `invoice_no` column.
  df_fraud : DataFrame whose `invoice_no` column lists the fraudulent invoices.

  Returns
  -------
  A new DataFrame with the float `fraud` column added (or replaced). The
  input `df` is left untouched, so the caller's frame is never modified as
  a side effect.

  Raises
  ------
  KeyError if either frame lacks an `invoice_no` column.
  '''
  frauds_l = df_fraud['invoice_no'].to_list()
  # isin() yields booleans; cast to float to keep the 0.0 / 1.0 labels.
  is_fraud = df['invoice_no'].isin(frauds_l).astype(float)
  # Pass a plain array so the values are attached positionally.
  return df.assign(fraud=is_fraud.to_numpy())

In [115]:
# Attach the fraud flag to every sales row.
df = label_fraud(df, df_fraud)

In [117]:
# Class balance check: number of rows per fraud label.
df['fraud'].value_counts()

0.0    228447
1.0       154
Name: fraud, dtype: int64

## Encoding

In [118]:
# Categorical columns (the same list that field_encoding encodes by default).
cols = ['distributor', 'sales', 'branch', 'inv_type', 'invoice_no', 'product_no', 'prod_cla', 'cust_type']

In [119]:
def field_encoding(df, cols=None):
  '''
  Ordinal encode categorical fields.

  Each selected column is converted to strings and every distinct string is
  replaced by its rank in sorted order (0, 1, 2, ...). Because the values
  are compared as strings, '10' sorts before '2'.

  Parameters
  ----------
  df : DataFrame containing the columns to encode.
  cols : optional list of column names to encode. Defaults to the eight
    categorical fields of the sales data.

  Returns
  -------
  A new DataFrame with the selected columns replaced by int64 codes. The
  input `df` is not modified, so the raw values stay available and calling
  the function again on the same input gives the same result.

  Raises
  ------
  KeyError if any requested column is missing from `df`.
  '''
  if cols is None:
    cols = ['distributor', 'sales', 'branch', 'inv_type', 'invoice_no', 'product_no', 'prod_cla', 'cust_type']
  # Work on a copy: encoding in place would overwrite the caller's raw values.
  encoded = df.copy()
  for f in cols:
    # sort=True numbers the codes by sorted unique value, not by first appearance.
    codes, _ = pd.factorize(encoded[f].astype('str'), sort=True)
    encoded[f] = codes.astype('int64')
  return encoded

In [120]:
# Encode the categorical columns; the following cells work on df_enc.
df_enc = field_encoding(df)

In [121]:
# Number of distinct values per column after encoding.
df_enc.nunique()

distributor        639
sales              209
branch              28
inv_type             3
invoice_no       30721
product_no        1094
prod_cla             7
qty                202
total_amt        16112
sale_price        8598
ship_qty           202
cust_type           20
return               2
discount_app         2
list_price        6720
inv_ship_days      133
fraud                2
dtype: int64

In [122]:
# Preview the first encoded rows.
df_enc.head()

Unnamed: 0,distributor,sales,branch,inv_type,invoice_no,product_no,prod_cla,qty,total_amt,sale_price,ship_qty,cust_type,return,discount_app,list_price,inv_ship_days,fraud
0,637,39,0,0,0,513,6,1,1000.0,1000.0,0,0,0,0,1000.0,239,0.0
1,450,21,10,1,1,984,0,4,130973.44,32743.36,0,8,1,1,37000.0,237,0.0
2,118,110,23,1,2,57,1,1,398.23,398.23,0,0,1,1,450.0,225,0.0
3,118,110,23,1,2,552,0,1,2831.86,2831.86,0,0,1,1,3200.0,225,0.0
4,68,142,23,1,3,57,1,4,1592.92,398.23,0,0,1,1,450.0,225,0.0


## Resampling to address class imbalance
Because fraud labels are assigned at the invoice level, all records of the same invoice need to be kept together.

In [123]:
# Split the encoded rows by fraud label.
# NOTE: this rebinds `df_fraud`, which until now held the fraud invoice list
# loaded from frauds.csv; re-running the label_fraud cell after this one would
# therefore pass it the wrong frame.
df_fraud = df_enc.loc[df_enc['fraud'].eq(1)]
df_non_fraud = df_enc.loc[df_enc['fraud'].eq(0)]

In [126]:
# Last few rows labelled as fraud.
df_fraud.tail()

Unnamed: 0,distributor,sales,branch,inv_type,invoice_no,product_no,prod_cla,qty,total_amt,sale_price,ship_qty,cust_type,return,discount_app,list_price,inv_ship_days,fraud
209638,542,49,8,2,28150,52,1,1,110.94,100.9554,1,0,0,1,125.36,2,1.0
209639,542,49,8,2,28150,62,1,1,416.01,378.5691,1,0,0,1,470.09,2,1.0
209640,542,49,8,2,28150,484,1,1,4650.04,4231.5364,1,0,0,1,5254.55,2,1.0
209641,542,49,8,2,28150,70,4,2,820.84,373.4822,2,0,0,1,463.77,2,1.0
209642,542,49,8,2,28150,1076,4,2,6931.38,3153.7779,2,0,0,1,3916.23,2,1.0


In [137]:
# Distinct invoice numbers among the non-fraud rows, in order of first appearance.
sample_inv = pd.unique(df_non_fraud['invoice_no'])

In [139]:
# Number of distinct non-fraud invoices.
len(sample_inv)

30710