In [44]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [45]:
df = pd.read_csv('PS_20174392719_1491204439457_log.csv')
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [46]:
df.shape

(6362620, 11)

In [47]:
df.isna().sum() + df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [48]:
df['type'].unique()

array(['PAYMENT', 'TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN'],
      dtype=object)

In [49]:
df['type'].value_counts()

type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64

In [50]:
df['type'] = df['type'].map({'PAYMENT' :2 , 'TRANSFER' : 4, 'CASH_OUT' : 1, 'DEBIT' : 5, 'CASH_IN' : 3})

In [51]:
df['type'].unique()

array([2, 4, 1, 5, 3])

In [52]:
df['type'].value_counts()

type
1    2237500
2    2151495
3    1399284
4     532909
5      41432
Name: count, dtype: int64

In [53]:
df['isFraud'] = df['isFraud'].map({0 : 'Not Fraud', 1 : 'Fraud'})

In [54]:
df['isFraud'] 

0          Not Fraud
1          Not Fraud
2              Fraud
3              Fraud
4          Not Fraud
             ...    
6362615        Fraud
6362616        Fraud
6362617        Fraud
6362618        Fraud
6362619        Fraud
Name: isFraud, Length: 6362620, dtype: object

In [55]:
X = df[['type','amount', 'oldbalanceOrg', 'newbalanceOrig']]
X

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig
0,2,9839.64,170136.00,160296.36
1,2,1864.28,21249.00,19384.72
2,4,181.00,181.00,0.00
3,1,181.00,181.00,0.00
4,2,11668.14,41554.00,29885.86
...,...,...,...,...
6362615,1,339682.13,339682.13,0.00
6362616,4,6311409.28,6311409.28,0.00
6362617,1,6311409.28,6311409.28,0.00
6362618,4,850002.52,850002.52,0.00


In [56]:
y = df.iloc[:, -2]
y

0          Not Fraud
1          Not Fraud
2              Fraud
3              Fraud
4          Not Fraud
             ...    
6362615        Fraud
6362616        Fraud
6362617        Fraud
6362618        Fraud
6362619        Fraud
Name: isFraud, Length: 6362620, dtype: object

In [57]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [59]:
# # model = DecisionTreeClassifier()
# model = GradientBoostingClassifier()
# X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# model.fit(X_train, y_train)
# model.score(X_test, y_test)

In [60]:
# from sklearn import model_selection
# from sklearn.model_selection import cross_val_score, KFold
# Kfold = KFold(n_splits=10, shuffle=True, random_state=42)
# cvs = cross_val_score(model, X, y, cv=Kfold, scoring='accuracy', n_jobs=-1)


In [62]:
# print(cvs)

In [63]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score



In [64]:
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns



In [65]:
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')




In [66]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [67]:
model = lgb.LGBMClassifier()



In [68]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])



In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [70]:
pipeline.fit(X_train, y_train)


[LightGBM] [Info] Number of positive: 5083503, number of negative: 6593
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025694 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 771
[LightGBM] [Info] Number of data points in the train set: 5090096, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.998705 -> initscore=6.647747
[LightGBM] [Info] Start training from score 6.647747


In [71]:
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)



In [72]:
print(f'Model accuracy: {accuracy:.4f}')

Model accuracy: 0.9978
