# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns 

# Loading Dataset

In [3]:
data = pd.read_csv(r'C:\Users\hp\Documents\online payment fraud data.csv')

data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


# Data Preprocessing

In [5]:
print(data.isnull().sum())

data['type'] = data['type'].astype('category').cat.codes

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data[['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']] = scaler.fit_transform(
    data[['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']])

print(data.head())


step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64
   step  type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1     3 -0.281560  C1231006815      -0.229810       -0.237622   
1     1     3 -0.294767  C1666544295      -0.281359       -0.285812   
2     1     4 -0.297555  C1305486145      -0.288654       -0.292442   
3     1     1 -0.297555   C840083671      -0.288654       -0.292442   
4     1     3 -0.278532  C2048537720      -0.274329       -0.282221   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155       -0.323814       -0.333411        0               0  
1  M2044282225       -0.323814       -0.333411        0               0  
2   C553264065       -0.323814       -0.333411        1               0  
3    C38997010       -0.317582       -0.3334

# Feature Engineering

In [7]:
data['errorOrig'] = data['newbalanceOrig'] + data['amount'] - data['oldbalanceOrg']
data['errorDest'] = data['newbalanceDest'] + data['amount'] - data['oldbalanceDest']

print(data.head())

   step  type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1     3 -0.281560  C1231006815      -0.229810       -0.237622   
1     1     3 -0.294767  C1666544295      -0.281359       -0.285812   
2     1     4 -0.297555  C1305486145      -0.288654       -0.292442   
3     1     1 -0.297555   C840083671      -0.288654       -0.292442   
4     1     3 -0.278532  C2048537720      -0.274329       -0.282221   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  \
0  M1979787155       -0.323814       -0.333411        0               0   
1  M2044282225       -0.323814       -0.333411        0               0   
2   C553264065       -0.323814       -0.333411        1               0   
3    C38997010       -0.317582       -0.333411        1               0   
4  M1230701703       -0.323814       -0.333411        0               0   

   errorOrig  errorDest  
0  -0.289372  -0.291157  
1  -0.299220  -0.304365  
2  -0.301343  -0.307152  
3  -0.301343  -0.3

# Model Training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

X = data.drop(columns=['isFraud', 'isFlaggedFraud'])
y = data['isFraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

# Model Evaluation

In [None]:
models = {'Random Forest': model, 'Logistic Regression': lr_model, 'SVM': svm_model, 'Gradient Boosting': gb_model}

for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"--- {name} ---")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Accuracy Score:")
    print(accuracy_score(y_test, y_pred))
    print("\n")

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [10, 20, 30, None],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

# Model Validation

In [None]:
from sklearn.model_selection import cross_val_score

best_model = grid_search.best_estimator_
cv_scores = cross_val_score(best_model, X, y, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", np.mean(cv_scores))


# Deploy Model

In [None]:
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json(force=True)
    prediction = best_model.predict([list(data.values())])
    return jsonify({'prediction': int(prediction[0])})

if __name__ == '__main__':
    app.run(debug=True)
