# Notebook Pengembangan Model Fraud Detection

Import Library yang diperlukan

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt 
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

## Data Preparation
Memuat data dari tiga file csv yang tersedia: sampling utama, diagnosa, prosedur

In [None]:
# Load the main sampling data (one row per visit)
# NOTE(review): this file name is spelled 'healtkathon' while the diagnosis and
# procedure files use 'healthkathon' — confirm it matches the file on disk.
sampl = pd.read_csv('sampling_healtkathon2022.csv')
sampl.head()

Unnamed: 0,id,id_peserta,dati2,typefaskes,usia,jenkel,pisat,tgldatang,tglpulang,jenispel,...,biaya,jenispulang,cbg,kelasrawat,kdsa,kdsp,kdsr,kdsi,kdsd,label
0,165666,486,17,KL,48,P,1.0,2018-07-25T17:00:00.000Z,2018-07-25T17:00:00.000Z,2,...,184300.0,1.0,Q-5-42-0,3,,,,,,0
1,1010828,520,17,A,63,L,1.0,2019-05-27T17:00:00.000Z,2019-05-30T17:00:00.000Z,1,...,10628400.0,1.0,D-4-13-III,1,,,,,,0
2,166042,523,17,KL,53,P,1.0,2019-07-16T17:00:00.000Z,2019-07-16T17:00:00.000Z,2,...,187300.0,1.0,Q-5-44-0,3,,,,,,0
3,168937,549,17,KL,54,P,1.0,2019-10-17T17:00:00.000Z,2019-10-17T17:00:00.000Z,2,...,187300.0,1.0,Q-5-44-0,3,,,,,,0
4,1005899,549,17,A,53,P,1.0,2018-04-18T17:00:00.000Z,2018-04-18T17:00:00.000Z,2,...,381600.0,1.0,Q-5-44-0,3,,,,,,0


In [None]:
# Load the diagnosis data (columns shown in the output: id, diag, levelid)
sampl_diag = pd.read_csv('sampling_healthkathon2022_diagnosa.csv')
sampl_diag.head()

Unnamed: 0,id,diag,levelid
0,6,O06.9,1
1,57,J02.9,1
2,91,R10.1,1
3,109,R18,1
4,111,N81.3,1


In [None]:
# Load the procedure data (columns shown in the output: id, proc)
sampl_proc = pd.read_csv('sampling_healthkathon2022_procedure.csv')
sampl_proc.head()

Unnamed: 0,id,proc
0,6,90.59
1,6,69.01
2,6,88.78
3,27,90.59
4,27,88.78


## Data Cleansing

### Data Sampling Utama

Mengecek nilai kosong pada data sampling utama, kemudian mengisinya: nilai yang paling sering muncul (modus) untuk kolom kategorikal, nilai rerata untuk `biaya`, dan string `'None'` untuk kolom kode khusus (`kdsa`, `kdsp`, `kdsr`, `kdsi`, `kdsd`)

In [None]:
sampl.isna().sum()

id                   0
id_peserta           0
dati2                0
typefaskes           0
usia                 0
jenkel              49
pisat              190
tgldatang            0
tglpulang            0
jenispel             0
politujuan     4041455
diagfktp          2530
biaya            57815
jenispulang         39
cbg                  0
kelasrawat           0
kdsa            198670
kdsp            197007
kdsr            198459
kdsi            198463
kdsd            195181
label                0
dtype: int64

In [None]:
# Impute these columns with their most frequent value (mode).
for col_name in ("jenkel", "pisat", "politujuan", "diagfktp", "jenispulang"):
    most_frequent = sampl[col_name].mode()[0]
    sampl[col_name] = sampl[col_name].fillna(most_frequent)

# Impute the cost column with its mean.
sampl['biaya'] = sampl['biaya'].fillna(sampl['biaya'].mean())

# Replace missing special-group codes with the literal string 'None'.
for col_name in ('kdsa', 'kdsp', 'kdsr', 'kdsi', 'kdsd'):
    sampl[col_name] = sampl[col_name].fillna('None')

In [None]:
# Missing-value counts per column after the nulls have been filled
sampl.isna().sum()

### Data Sampling Diagnosa dan Procedure

Menggabungkan data diagnosa dan prosedur ke dalam data utama.

In [None]:
# Keep a single diagnosis row per visit id (drop_duplicates keeps the first
# occurrence by default), so the merge below cannot multiply rows of sampl.
sampl_diag = sampl_diag.drop_duplicates(subset=['id'])
# Left join on id: every row of sampl is kept; visits without a diagnosis row
# get NaN in the diagnosis columns.
df_merge = pd.merge(sampl,sampl_diag,on='id',how='left')

In [None]:
# Keep a single procedure row per visit id (drop_duplicates keeps the first
# occurrence by default), so the merge below cannot multiply rows of df_merge.
sampl_proc = sampl_proc.drop_duplicates(subset=['id'])
# Left join on id: every row of df_merge is kept; visits without a procedure
# row get NaN in the procedure column.
df_merge_final = pd.merge(df_merge,sampl_proc,on='id',how='left')

Mengisi nilai kosong yang terdapat pada data diagnosa dan prosedur dengan data yang paling sering muncul

In [None]:
# Fill the gaps in the merged diagnosis/procedure columns with each column's
# most frequent value (mode).
for col_name in ("diag", "levelid", "proc"):
    most_frequent = df_merge_final[col_name].mode()[0]
    df_merge_final[col_name] = df_merge_final[col_name].fillna(most_frequent)

In [None]:
df_merge_final.isna().sum()

id             0
id_peserta     0
dati2          0
typefaskes     0
usia           0
jenkel         0
pisat          0
tgldatang      0
tglpulang      0
jenispel       0
politujuan     0
diagfktp       0
biaya          0
jenispulang    0
cbg            0
kelasrawat     0
kdsa           0
kdsp           0
kdsr           0
kdsi           0
kdsd           0
label          0
diag           0
levelid        0
proc           0
dtype: int64

In [None]:
# Convert the procedure codes to numbers; errors='coerce' turns any value that
# cannot be parsed into NaN instead of raising.
# NOTE(review): this runs after the mode-fill of `proc`, so a value coerced to
# NaN here would stay missing — the recorded null count shows none in this run.
df_merge_final['proc'] = pd.to_numeric(df_merge_final['proc'], errors='coerce')

In [None]:
df_merge_final.isna().sum()

id             0
id_peserta     0
dati2          0
typefaskes     0
usia           0
jenkel         0
pisat          0
tgldatang      0
tglpulang      0
jenispel       0
politujuan     0
diagfktp       0
biaya          0
jenispulang    0
cbg            0
kelasrawat     0
kdsa           0
kdsp           0
kdsr           0
kdsi           0
kdsd           0
label          0
diag           0
levelid        0
proc           0
durasi         0
dtype: int64

In [None]:
# Encode the gender column as integer class codes
# (in the recorded output 'L' becomes 0 and 'P' becomes 1).
labelencoder = LabelEncoder()
df_merge_final['jenkel'] = labelencoder.fit_transform(df_merge_final['jenkel'])
df_merge_final.head()

Unnamed: 0,id,id_peserta,dati2,typefaskes,usia,jenkel,pisat,tgldatang,tglpulang,jenispel,...,kdsa,kdsp,kdsr,kdsi,kdsd,label,diag,levelid,proc,durasi
0,165666,486,17,KL,48,1,1.0,2018-07-25 17:00:00+00:00,2018-07-25 17:00:00+00:00,2,...,,,,,,0,H60.9,1.0,90.59,0
1,1010828,520,17,A,63,0,1.0,2019-05-27 17:00:00+00:00,2019-05-30 17:00:00+00:00,1,...,,,,,,0,D64.9,1.0,99.04,3
2,166042,523,17,KL,53,1,1.0,2019-07-16 17:00:00+00:00,2019-07-16 17:00:00+00:00,2,...,,,,,,0,K30,2.0,90.59,0
3,168937,549,17,KL,54,1,1.0,2019-10-17 17:00:00+00:00,2019-10-17 17:00:00+00:00,2,...,,,,,,0,H52.0,2.0,90.59,0
4,1005899,549,17,A,53,1,1.0,2018-04-18 17:00:00+00:00,2018-04-18 17:00:00+00:00,2,...,,,,,,0,M54.59,1.0,90.59,0


In [None]:
df_merge_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11401882 entries, 0 to 11401881
Data columns (total 26 columns):
 #   Column       Dtype              
---  ------       -----              
 0   id           int64              
 1   id_peserta   int64              
 2   dati2        int64              
 3   typefaskes   object             
 4   usia         int64              
 5   jenkel       int32              
 6   pisat        float64            
 7   tgldatang    datetime64[ns, UTC]
 8   tglpulang    datetime64[ns, UTC]
 9   jenispel     int64              
 10  politujuan   object             
 11  diagfktp     object             
 12  biaya        float64            
 13  jenispulang  float64            
 14  cbg          object             
 15  kelasrawat   int64              
 16  kdsa         object             
 17  kdsp         object             
 18  kdsr         object             
 19  kdsi         object             
 20  kdsd         object             
 21  label 

## Exploratory Data Analysis

Data yang terdefinisi pada dataframe gabungan di antaranya:
*   `id`: Id kunjungan
*   `id_peserta`: Id Peserta
*   `dati2`: Lokasi Fasilitas Kesehatan (Kab/Kota)
*   `typefaskes`: Tipe Fasilitas Kesehatan
*   `usia`: Usia Peserta
*   `jenkel`: Jenis Kelamin Peserta
*   `pisat`: Hubungan Kepesertaan
*   `tgldatang`: Tanggal Kedatangan
*   `tglpulang`: Tanggal Kepulangan
*   `jenispel`: Jenis Pelayanan
*   `politujuan`: Poli Tujuan
*   `diagfktp`: Diagnosa dari FKTP
*   `biaya`: Biaya
*   `jenispulang`: Kondisi Peserta saat Pulang
*   `cbg`: Kode Case Based Group
*   `kelasrawat`: Kelas Perawatan
*   `kdsa`: Kode Special Sub-Acute Group
*   `kdsp`: Kode Special Procedures
*   `kdsr`: Kode Special Prothesis
*   `kdsi`: Kode Special Investigations
*   `kdsd`: Kode Special Drugs
*   `label`: Label Potensi Efisiensi
*   `diag`: Kode Diagnosa FKRTL
*   `levelid`: Jenis Diagnosa (1: Primer, 2: Sekunder)
*   `proc`: Kode Prosedur


In [None]:
# Distribution of the target label, each bar annotated with its share of all rows
ax = sns.countplot(y='label', data=df_merge_final)

n_rows = df_merge_final['label'].size
for bar in ax.patches:
    # Horizontal bars: the bar width is the class count.
    share = f' {100 * bar.get_width() / n_rows:.1f}%'
    # Dedicated names for the text position: the notebook-level `x` and `y`
    # hold the feature matrix and labels, and must not be overwritten when
    # this cell is re-run after the modeling cells.
    text_x = bar.get_x() + bar.get_width() + 0.02
    text_y = bar.get_y() + bar.get_height() / 2
    ax.annotate(share, (text_x, text_y))

In [None]:
# Correlation heatmap of the numeric features
# Select numeric columns explicitly: excluding only 'object' also keeps the
# datetime columns (tgldatang / tglpulang once converted), which
# DataFrame.corr cannot process in pandas >= 2.0 where numeric_only defaults
# to False.
num_attributes = df_merge_final.select_dtypes(include='number')

corr = num_attributes.corr()

# Mask the upper triangle (diagonal included) so each pair is drawn only once.
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True

fig = plt.figure(figsize=(15,12))
warna = sns.color_palette("Blues")

with sns.axes_style("white"):
    ax = sns.heatmap(corr, annot=True, mask=mask, vmin=-1, center=0, vmax=1, square=True, cmap=warna)

## Feature Engineering

Mengolah beberapa fitur sehingga dihasilkan suatu informasi baru. Di bawah ini dihasilkan kolom `durasi` (durasi kunjungan dalam hari) berdasarkan kolom `tgldatang` (tanggal datang) dan `tglpulang` (tanggal pulang).

In [None]:
# Parse the arrival and discharge timestamps, then derive the visit duration
# in whole days.
for date_col in ('tgldatang', 'tglpulang'):
    df_merge_final[date_col] = pd.to_datetime(df_merge_final[date_col])

stay_length = df_merge_final['tglpulang'] - df_merge_final['tgldatang']
df_merge_final['durasi'] = stay_length.dt.days
df_merge_final.head()

Unnamed: 0,id,id_peserta,dati2,typefaskes,usia,jenkel,pisat,tgldatang,tglpulang,jenispel,...,kdsa,kdsp,kdsr,kdsi,kdsd,label,diag,levelid,proc,durasi
0,165666,486,17,KL,48,P,1.0,2018-07-25 17:00:00+00:00,2018-07-25 17:00:00+00:00,2,...,,,,,,0,H60.9,1.0,90.59,0
1,1010828,520,17,A,63,L,1.0,2019-05-27 17:00:00+00:00,2019-05-30 17:00:00+00:00,1,...,,,,,,0,D64.9,1.0,99.04,3
2,166042,523,17,KL,53,P,1.0,2019-07-16 17:00:00+00:00,2019-07-16 17:00:00+00:00,2,...,,,,,,0,K30,2.0,90.59,0
3,168937,549,17,KL,54,P,1.0,2019-10-17 17:00:00+00:00,2019-10-17 17:00:00+00:00,2,...,,,,,,0,H52.0,2.0,90.59,0
4,1005899,549,17,A,53,P,1.0,2018-04-18 17:00:00+00:00,2018-04-18 17:00:00+00:00,2,...,,,,,,0,M54.59,1.0,90.59,0


Memilih feature yang digunakan, sebab tidak semua feature digunakan sebagai input dalam pengembangan model. Kolom yang tidak digunakan di-drop.

In [None]:
# Drop the columns that are not used for modeling; the remaining columns are
# kept in `sampling`.
sampling = df_merge_final.drop(
    columns=[
        'id_peserta',
        'typefaskes',
        'tgldatang',
        'tglpulang',
        'politujuan',
        'diagfktp',
        'cbg',
        'kdsa',
        'kdsp',
        'kdsr',
        'kdsi',
        'kdsd',
        'diag',
    ]
)
sampling

Unnamed: 0,id,dati2,usia,jenkel,pisat,jenispel,biaya,jenispulang,kelasrawat,label,levelid,proc,durasi
0,165666,17,48,1,1.0,2,184300.0,1.0,3,0,1.0,90.59,0
1,1010828,17,63,0,1.0,1,10628400.0,1.0,1,0,1.0,99.04,3
2,166042,17,53,1,1.0,2,187300.0,1.0,3,0,2.0,90.59,0
3,168937,17,54,1,1.0,2,187300.0,1.0,3,0,2.0,90.59,0
4,1005899,17,53,1,1.0,2,381600.0,1.0,3,0,1.0,90.59,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11401877,9983563,301,0,1,4.0,1,3645100.0,1.0,3,0,2.0,99.18,4
11401878,11053870,233,0,1,4.0,1,12273200.0,1.0,2,0,2.0,99.15,4
11401879,7461049,118,0,0,4.0,1,4099000.0,1.0,3,0,2.0,99.18,7
11401880,1075162,101,0,0,5.0,1,3992400.0,1.0,3,0,1.0,93.90,2


## Modeling

Daftar feature yang digunakan tertera pada informasi dataframe sampling

In [None]:
sampling.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11401882 entries, 0 to 11401881
Data columns (total 13 columns):
 #   Column       Dtype  
---  ------       -----  
 0   id           int64  
 1   dati2        int64  
 2   usia         int64  
 3   jenkel       int32  
 4   pisat        float64
 5   jenispel     int64  
 6   biaya        float64
 7   jenispulang  float64
 8   kelasrawat   int64  
 9   label        int64  
 10  levelid      float64
 11  proc         float64
 12  durasi       int64  
dtypes: float64(5), int32(1), int64(7)
memory usage: 1.1 GB


Data dipisahkan menjadi feature vector dan label, yang nantinya akan digunakan dalam pemodelan supervised learning

In [None]:
# Target vector: the label column.
y = sampling['label']
# Feature matrix: every column except the visit id and the target.
x = sampling.drop(columns=['id', 'label'])

In [None]:
x

Unnamed: 0,dati2,usia,jenkel,pisat,jenispel,biaya,jenispulang,kelasrawat,levelid,proc,durasi
0,17,48,1,1.0,2,184300.0,1.0,3,1.0,90.59,0
1,17,63,0,1.0,1,10628400.0,1.0,1,1.0,99.04,3
2,17,53,1,1.0,2,187300.0,1.0,3,2.0,90.59,0
3,17,54,1,1.0,2,187300.0,1.0,3,2.0,90.59,0
4,17,53,1,1.0,2,381600.0,1.0,3,1.0,90.59,0
...,...,...,...,...,...,...,...,...,...,...,...
11401877,301,0,1,4.0,1,3645100.0,1.0,3,2.0,99.18,4
11401878,233,0,1,4.0,1,12273200.0,1.0,2,2.0,99.15,4
11401879,118,0,0,4.0,1,4099000.0,1.0,3,2.0,99.18,7
11401880,101,0,0,5.0,1,3992400.0,1.0,3,1.0,93.90,2


In [None]:
# Split the data into train and test sets: 20% of the rows go to the test set,
# and random_state fixes the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=100) 

In [None]:
x_train

Unnamed: 0,dati2,usia,jenkel,pisat,jenispel,biaya,jenispulang,kelasrawat,levelid,proc,durasi
960222,45,36,0,1.0,2,183000.0,1.0,3,1.0,90.59,0
5175649,290,77,0,1.0,1,4788700.0,1.0,1,1.0,90.59,3
4330634,117,32,1,3.0,1,4925300.0,3.0,2,2.0,99.04,2
331156,112,73,1,1.0,2,112800.0,1.0,3,1.0,93.34,0
5339543,17,17,0,4.0,2,196300.0,1.0,3,1.0,90.59,0
...,...,...,...,...,...,...,...,...,...,...,...
4407216,55,46,1,3.0,2,213100.0,1.0,3,1.0,16.21,0
11349847,38,24,0,4.0,2,137700.0,1.0,3,1.0,88.23,0
8118119,168,45,0,1.0,1,1448400.0,1.0,3,1.0,90.59,1
4405059,17,63,0,1.0,2,381600.0,1.0,3,1.0,90.59,0


In [None]:
x_test

Unnamed: 0,dati2,usia,jenkel,pisat,jenispel,biaya,jenispulang,kelasrawat,levelid,proc,durasi
8119405,192,7,1,4.0,1,2586500.0,1.0,2,1.0,90.59,2
4064351,82,1,0,4.0,1,3685200.0,1.0,1,1.0,90.59,3
8550761,220,60,1,3.0,2,254200.0,1.0,3,1.0,90.59,0
1849795,134,41,1,3.0,2,182100.0,1.0,3,1.0,90.59,0
6143151,303,49,0,1.0,2,188200.0,1.0,3,1.0,90.59,0
...,...,...,...,...,...,...,...,...,...,...,...
11253785,91,80,0,2.0,2,192100.0,1.0,3,2.0,90.59,0
751949,470,58,0,1.0,2,275100.0,1.0,3,2.0,87.49,0
8252272,294,22,0,4.0,2,188200.0,1.0,3,1.0,90.59,0
11007362,17,31,0,5.0,1,6317400.0,1.0,3,1.0,94.19,47


### Model Random Forest

In [None]:
# Random forest
# Seed the forest with the same value used for the train/test split so that
# re-running the notebook reproduces the same model and metrics.
rf = RandomForestClassifier(random_state=100)
rf = rf.fit(x_train, y_train)  # fit on the training split only

In [None]:
# Save the trained random forest model; the context manager guarantees the
# file is flushed and closed even if pickling fails.
filename = "rf_model.sav"
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)

# Load the model back from disk.
# NOTE: unpickling can execute arbitrary code — only load model files that
# were written by this notebook or come from a trusted source.
with open(filename, 'rb') as model_file:
    rf = pickle.load(model_file)

In [None]:
# Predict a label for every row of the full feature matrix x
# (this covers both the training rows and the test rows).
y_rf = rf.predict(x)

## Evaluasi
Mengevaluasi hasil prediksi terhadap label yang terdapat pada data asli

In [None]:
def print_eval_metrics(y, y_pred):
    """Print evaluation metrics for a set of binary predictions.

    Prints accuracy, precision, recall and specificity, followed by
    scikit-learn's per-class classification report.

    Args:
        y: True labels.
        y_pred: Predicted labels, aligned element-wise with ``y``.
    """
    acc_rf = accuracy_score(y, y_pred)
    pre_rf = precision_score(y, y_pred, average='binary')
    rec_rf = recall_score(y, y_pred, average='binary')
    cm = confusion_matrix(y, y_pred)
    # Row 0 of the confusion matrix holds the samples whose true label sorts
    # first (presumably the negative class 0): [true negatives, false positives].
    specificity = cm[0, 0] / (cm[0, 0] + cm[0, 1])

    print('Accuracy with Rf : ', acc_rf)
    print('Precision with Rf : ', pre_rf)
    print('Recall with RF : ', rec_rf)
    print('Specifity with Rf : ', specificity)

    # classification_report only returns the report text; it has to be
    # printed explicitly, otherwise the result is silently discarded.
    print(classification_report(y, y_pred))

# NOTE(review): y and y_rf cover the whole dataset, including the rows the
# model was trained on, so these numbers are not a hold-out estimate; pass
# y_test and rf.predict(x_test) to measure performance on unseen data.
print_eval_metrics(y, y_rf)

Accuracy with Rf :  0.9959767168262222
Precision with Rf :  0.947075500571852
Recall with RF :  0.7494916788302558
Specifity with Rf :  0.9994156510368659


In [None]:
# x_copy = pd.DataFrame(sampling['id'])
# x_copy['label'] = y_rf
# x_copy.to_csv('answer.csv', index=False, encoding='utf8')

In [None]:
# fix = pd.read_csv('answer.csv')
# fix