In [1]:
import pandas as pd
import numpy as np


In [2]:
df = pd.read_csv("Fraud.csv")


In [3]:
df.shape

(6362620, 11)

In [4]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [5]:
df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [6]:
df['isFraud'].value_counts(normalize=True)


isFraud
0    0.998709
1    0.001291
Name: proportion, dtype: float64

In [7]:
pd.crosstab(df['type'], df['isFraud'], normalize='index')


isFraud,0,1
type,Unnamed: 1_level_1,Unnamed: 2_level_1
CASH_IN,1.0,0.0
CASH_OUT,0.99816,0.00184
DEBIT,1.0,0.0
PAYMENT,1.0,0.0
TRANSFER,0.992312,0.007688


In [8]:
df.groupby('isFraud')[['oldbalanceOrg', 'newbalanceOrig', 'amount']].mean()


Unnamed: 0_level_0,oldbalanceOrg,newbalanceOrig,amount
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,832828.7,855970.228109,178197.0
1,1649668.0,192392.631836,1467967.0


In [9]:
df['org_balance_error'] = df['oldbalanceOrg'] - df['amount'] - df['newbalanceOrig']
df['dest_balance_change'] = df['newbalanceDest'] - df['oldbalanceDest']

df['is_high_risk_type'] = df['type'].isin(['TRANSFER', 'CASH_OUT']).astype(int)


In [10]:
df[['org_balance_error', 'dest_balance_change', 'is_high_risk_type']].head()


Unnamed: 0,org_balance_error,dest_balance_change,is_high_risk_type
0,0.0,0.0,0
1,0.0,0.0,0
2,0.0,0.0,1
3,0.0,-21182.0,1
4,0.0,0.0,0


In [11]:
df_model = df.drop(columns=[
    'nameOrig',
    'nameDest',
    'isFlaggedFraud'
])

X = df_model.drop('isFraud', axis=1)
y = df_model['isFraud']

X.shape, y.shape


((6362620, 10), (6362620,))

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

X_train.shape, X_val.shape


((5090096, 10), (1272524, 10))

In [13]:
y_train.value_counts(normalize=True), y_val.value_counts(normalize=True)


(isFraud
 0    0.998709
 1    0.001291
 Name: proportion, dtype: float64,
 isFraud
 0    0.998709
 1    0.001291
 Name: proportion, dtype: float64)

In [14]:
cat_cols = ['type']
num_cols = [col for col in X.columns if col not in cat_cols]


In [15]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('num', 'passthrough', num_cols)
    ]
)


In [16]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    n_jobs=-1
)


In [17]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', model)
])


In [18]:
pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [19]:
from sklearn.metrics import classification_report, roc_auc_score

y_pred = pipeline.predict(X_val)
y_proba = pipeline.predict_proba(X_val)[:, 1]


In [20]:
print(classification_report(y_val, y_pred))


              precision    recall  f1-score   support

           0       1.00      0.96      0.98   1270881
           1       0.03      0.94      0.05      1643

    accuracy                           0.96   1272524
   macro avg       0.51      0.95      0.52   1272524
weighted avg       1.00      0.96      0.98   1272524



In [21]:
print("ROC-AUC:", roc_auc_score(y_val, y_proba))


ROC-AUC: 0.9892857796386633


In [22]:
import numpy as np

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]

for t in thresholds:
    y_custom = (y_proba >= t).astype(int)
    print(f"\nThreshold: {t}")
    print(classification_report(y_val, y_custom, digits=3))



Threshold: 0.1
              precision    recall  f1-score   support

           0      1.000     0.784     0.879   1270881
           1      0.006     0.996     0.012      1643

    accuracy                          0.784   1272524
   macro avg      0.503     0.890     0.445   1272524
weighted avg      0.999     0.784     0.878   1272524


Threshold: 0.2
              precision    recall  f1-score   support

           0      1.000     0.868     0.930   1270881
           1      0.010     0.987     0.019      1643

    accuracy                          0.869   1272524
   macro avg      0.505     0.928     0.474   1272524
weighted avg      0.999     0.869     0.928   1272524


Threshold: 0.3
              precision    recall  f1-score   support

           0      1.000     0.915     0.955   1270881
           1      0.015     0.973     0.029      1643

    accuracy                          0.915   1272524
   macro avg      0.507     0.944     0.492   1272524
weighted avg      0.999   