In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_curve
from imblearn.over_sampling import SMOTE 
import warnings
import joblib
warnings.filterwarnings('ignore')


In [2]:
df = pd.read_csv('onlinepayment.csv')
print("Initial data shape:", df.shape)

Initial data shape: (6362620, 11)


In [3]:
print("\nFirst few rows:")
print(df.head())


First few rows:
   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  


In [4]:
print("\nDataset Info:")
df.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [5]:
obj = (df.dtypes == 'object')
obj_cols = list(obj[obj].index)
print('\nCategorical values: ', len(obj_cols))

int = (df.dtypes == 'int64')
int_cols = list(int[int].index)
print('Integer values: ', len(int_cols))

float = (df.dtypes == 'float')
float_cols = list(float[float].index)
print('Float values: ', len(float_cols))


Categorical values:  3
Integer values:  3
Float values:  5


In [6]:
print("\nMissing values:")
print(df.isnull().sum())


Missing values:
step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


In [7]:
type = df['type'].value_counts()
transactions = type.index
quantity = type.values

In [8]:
figure = px.pie(df,
               values=quantity,
               names=transactions, 
               hole=0.5,
               title='Distribution of transaction type')
figure.show()

In [9]:
print("\nFraud Distribution:")
print(df['isFraud'].value_counts())


Fraud Distribution:
isFraud
0    6354407
1       8213
Name: count, dtype: int64


In [10]:
print("\nCreating new features...")
df['transaction_diff'] = df['newbalanceOrig'] - df['oldbalanceOrg']
df['dest_diff'] = df['newbalanceDest'] - df['oldbalanceDest']
df['transaction_to_balance_ratio'] = df['amount'] / (df['oldbalanceOrg'] + 1)


Creating new features...


In [11]:
le = LabelEncoder()
df['type'] = le.fit_transform(df['type'])

In [12]:
type_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("\nTransaction type mapping:")
print(type_mapping)


Transaction type mapping:
{'CASH_IN': 0, 'CASH_OUT': 1, 'DEBIT': 2, 'PAYMENT': 3, 'TRANSFER': 4}


In [13]:
df.fillna(0, inplace=True)

In [14]:
X = df[['type', 'amount', 'oldbalanceOrg', 'newbalanceOrig']]
y = df['isFraud']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [16]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [17]:
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

In [18]:
X_train_balanced = X_train_balanced[:100000]  
y_train_balanced = y_train_balanced[:100000]

In [19]:
# Create dictionary to store models and their performances
models = {
	'Random Forest': RandomForestClassifier(random_state=42, n_jobs=-1),
	'Decision Tree': DecisionTreeClassifier(random_state=42),
	'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000, n_jobs=-1)
}

# Create dictionary to store model performances
model_performances = {}

# Evaluate each model and store their performances
for name, model in models.items():
	print(f"\nEvaluating {name}...")
	model.fit(X_train_balanced, y_train_balanced)
	accuracy = model.score(X_test_scaled, y_test)
	
	# Use smaller chunks for cross-validation
	cv_scores = []
	chunk_size = len(X_train_balanced) // 5  # Split into 5 chunks
	
	for i in range(3):  # 3-fold CV
		start_idx = i * chunk_size
		end_idx = start_idx + chunk_size
		X_chunk = X_train_balanced[start_idx:end_idx]
		y_chunk = y_train_balanced[start_idx:end_idx]
		model.fit(X_chunk, y_chunk)
		cv_scores.append(model.score(X_test_scaled, y_test))
	
	cv_scores = np.array(cv_scores)
	model_performances[name] = accuracy
	print(f"{name} Accuracy: {accuracy:.4f}")
	print(f"{name} Cross-validation scores: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

best_model_name = max(model_performances, key=model_performances.get)
best_model = models[best_model_name]
print(f"\nBest performing model: {best_model_name}")


Evaluating Random Forest...
Random Forest Accuracy: 0.9992
Random Forest Cross-validation scores: 0.9991 (+/- 0.0001)

Evaluating Decision Tree...
Decision Tree Accuracy: 0.9990
Decision Tree Cross-validation scores: 0.9985 (+/- 0.0003)

Evaluating Logistic Regression...
Logistic Regression Accuracy: 0.9989
Logistic Regression Cross-validation scores: 0.9988 (+/- 0.0001)

Best performing model: Random Forest


In [20]:
if best_model_name == 'Decision Tree':
    param_grid = {
        'max_depth': [3, 5, 7, 10],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
elif best_model_name == 'Random Forest':
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [5, 10],
        'min_samples_split': [2, 5]
    }
else:  # Logistic Regression
    param_grid = {
        'C': [0.1, 1, 10],
        'penalty': ['l2'],
        'solver': ['lbfgs']
    }

In [21]:
print("\nTraining the model...")
# Train the model directly
best_model.fit(X_train_balanced, y_train_balanced)

# Evaluate the model
y_pred_final = best_model.predict(X_test_scaled)
print("\nFinal Model Performance:")
print(classification_report(y_test, y_pred_final))


Training the model...

Final Model Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270904
           1       0.79      0.46      0.58      1620

    accuracy                           1.00   1272524
   macro avg       0.89      0.73      0.79   1272524
weighted avg       1.00      1.00      1.00   1272524



In [22]:
df.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud', 'transaction_diff', 'dest_diff',
       'transaction_to_balance_ratio'],
      dtype='object')

In [23]:
joblib.dump(best_model_name, "models/online_payment_model.pkl")

['models/online_payment_model.pkl']

In [24]:
model = joblib.load("models/online_payment_model.pkl")

In [25]:
# Create empty DataFrame with required columns
data = pd.DataFrame(columns=['type', 'amount', 'oldbalanceOrg', 'newbalanceOrig'])
transaction_type = ['CASH_OUT', 'TRANSFER', 'PAYMENT', 'DEBIT']
data['type'] = le.transform(transaction_type)
features = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'type']

In [26]:
def predict_fraud(features):
    """
    Make fraud predictions on new data.
    features should be a numpy array with the same order as training features.
    """
    # Scale the features
    scaled_features = scaler.transform(features.reshape(1, -1))
    # Make prediction
    prediction = best_model.predict(scaled_features)
    probability = best_model.predict_proba(scaled_features)[0][1]
    return prediction[0], probability

In [27]:
# First encode the transaction type
cash_out = le.transform(['CASH_OUT'])[0] 
transfer = le.transform(['TRANSFER'])[0]
payment = le.transform(['PAYMENT'])[0]
debit = le.transform(['DEBIT'])[0]

In [28]:
test_features = np.array([[9000.60, 9000.60, 0.0, cash_out]])
prediction, probability = predict_fraud(test_features)
print("\nTest prediction:")
print(f"Prediction: {'SUSPICIOUS' if prediction == 1 else 'SAFE'}")
print(f"Probability of fraud: {probability:.4f}")


Test prediction:
Prediction: SAFE
Probability of fraud: 0.0200
