In [2]:
# random forest
import pandas as pd
from sklearn.ensemble import RandomForestClassifier


# Load the datasets
train_data = pd.read_csv('uncleaned_balanced_train_dataset.csv')
test_data = pd.read_csv('uncleaned_test_dataset.csv')

# Display the first few rows of the datasets
print("Training Data:")
print(train_data.head())
print("Test Data:")
print(test_data.head())

# Define feature columns and target variable
feature_cols = ['step', 'amount', 'type_CASH_IN', 'type_CASH_OUT', 'type_DEBIT', 'type_PAYMENT', 'type_TRANSFER']
target_col = 'isFraud'

# Split the training and test data into features (X) and target (y)
X_train = train_data[feature_cols]
y_train = train_data[target_col]
X_test = test_data[feature_cols]
y_test = test_data[target_col]

# Display the first few rows of the feature sets
print("X_train:")
print(X_train.head())
print("y_train:")
print(y_train.head())

# Initialize the Random Forest classifier
rf = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features='sqrt')

# Train the model
rf.fit(X_train, y_train)

from joblib import dump

# 保存模型
dump(rf, 'rf.joblib')
    


Training Data:
   step     amount  type_CASH_IN  type_CASH_OUT  type_DEBIT  type_PAYMENT  \
0   377     417.94             0              0           0             1   
1   349  266283.81             0              1           0             0   
2   329   14318.39             0              0           0             1   
3   329  575203.60             0              1           0             0   
4    46    5366.47             0              0           0             1   

   type_TRANSFER  isFraud  
0              0        0  
1              0        0  
2              0        0  
3              0        0  
4              0        0  
Test Data:
   step     amount  type_CASH_IN  type_CASH_OUT  type_DEBIT  type_PAYMENT  \
0   300  890577.21             0              0           0             0   
1   399   97734.24             1              0           0             0   
2   718    5907.41             0              0           0             1   
3   186  187696.30             0   

['rf.joblib']

In [4]:
# logistic regression
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Load the datasets
train_data = pd.read_csv('uncleaned_balanced_train_dataset.csv')
test_data = pd.read_csv('uncleaned_test_dataset.csv')

# Define feature columns and target variable
feature_cols = ['step', 'amount', 'type_CASH_IN', 'type_CASH_OUT', 'type_DEBIT', 'type_PAYMENT', 'type_TRANSFER']
target_col = 'isFraud'

# Split the training and test data into features (X) and target (y)
X_train = train_data[feature_cols]
y_train = train_data[target_col]
X_test = test_data[feature_cols]
y_test = test_data[target_col]

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display the first few rows of the scaled feature sets
print("X_train_scaled:")
print(X_train_scaled[:5])
print("X_test_scaled:")
print(X_test_scaled[:5])

# Initialize the logistic regression model with predefined parameters
logreg = LogisticRegression(C=1, penalty='l2', solver='liblinear')

# Train the model
logreg.fit(X_train_scaled, y_train)

from joblib import dump

# 保存模型
dump(logreg, 'lr.joblib')

X_train_scaled:
[[ 0.59843379 -0.34568391 -0.44866192 -0.80042742 -0.06996882  1.71953892
  -0.47538737]
 [ 0.43542625 -0.16381817 -0.44866192  1.24933251 -0.06996882 -0.58155124
  -0.47538737]
 [ 0.31899229 -0.3361753  -0.44866192 -0.80042742 -0.06996882  1.71953892
  -0.47538737]
 [ 0.31899229  0.04749864 -0.44866192  1.24933251 -0.06996882 -0.58155124
  -0.47538737]
 [-1.32854824 -0.34229887 -0.44866192 -0.80042742 -0.06996882  1.71953892
  -0.47538737]]
X_test_scaled:
[[ 0.15016305  0.26323019 -0.44866192 -0.80042742 -0.06996882 -0.58155124
   2.1035477 ]
 [ 0.72651114 -0.27911462  2.22884971 -0.80042742 -0.06996882 -0.58155124
  -0.47538737]
 [ 2.58363279 -0.34192883 -0.44866192 -0.80042742 -0.06996882  1.71953892
  -0.47538737]
 [-0.51351052 -0.21757601 -0.44866192  1.24933251 -0.06996882 -0.58155124
  -0.47538737]
 [ 0.33063568 -0.28943539 -0.44866192  1.24933251 -0.06996882 -0.58155124
  -0.47538737]]


['lr.joblib']

In [5]:
# svm
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Load the datasets
train_data = pd.read_csv('uncleaned_balanced_train_dataset.csv')
test_data = pd.read_csv('uncleaned_test_dataset.csv')

# Define feature columns and target variable
feature_cols = ['step', 'amount', 'type_CASH_IN', 'type_CASH_OUT', 'type_DEBIT', 'type_PAYMENT', 'type_TRANSFER']
target_col = 'isFraud'

# Split the training and test data into features (X) and target (y)
X_train = train_data[feature_cols]
y_train = train_data[target_col]
X_test = test_data[feature_cols]
y_test = test_data[target_col]

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display the first few rows of the scaled feature sets
print("X_train_scaled:")
print(X_train_scaled[:5])
print("X_test_scaled:")
print(X_test_scaled[:5])

# Initialize the SVM classifier with a radial basis function (RBF) kernel
svm_model = SVC(kernel='rbf', probability=True, random_state=42)

# Train the model using the scaled training data
svm_model.fit(X_train_scaled, y_train)

from joblib import dump

# 保存模型
dump(svm_model, 'svm.joblib')

X_train_scaled:
[[ 0.59843379 -0.34568391 -0.44866192 -0.80042742 -0.06996882  1.71953892
  -0.47538737]
 [ 0.43542625 -0.16381817 -0.44866192  1.24933251 -0.06996882 -0.58155124
  -0.47538737]
 [ 0.31899229 -0.3361753  -0.44866192 -0.80042742 -0.06996882  1.71953892
  -0.47538737]
 [ 0.31899229  0.04749864 -0.44866192  1.24933251 -0.06996882 -0.58155124
  -0.47538737]
 [-1.32854824 -0.34229887 -0.44866192 -0.80042742 -0.06996882  1.71953892
  -0.47538737]]
X_test_scaled:
[[ 0.15016305  0.26323019 -0.44866192 -0.80042742 -0.06996882 -0.58155124
   2.1035477 ]
 [ 0.72651114 -0.27911462  2.22884971 -0.80042742 -0.06996882 -0.58155124
  -0.47538737]
 [ 2.58363279 -0.34192883 -0.44866192 -0.80042742 -0.06996882  1.71953892
  -0.47538737]
 [-0.51351052 -0.21757601 -0.44866192  1.24933251 -0.06996882 -0.58155124
  -0.47538737]
 [ 0.33063568 -0.28943539 -0.44866192  1.24933251 -0.06996882 -0.58155124
  -0.47538737]]


['svm.joblib']

In [6]:
# gbm
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Load the datasets
train_data = pd.read_csv('uncleaned_balanced_train_dataset.csv')
test_data = pd.read_csv('uncleaned_test_dataset.csv')

# Define feature columns and target variable
feature_cols = ['step', 'amount', 'type_CASH_IN', 'type_CASH_OUT', 'type_DEBIT', 'type_PAYMENT', 'type_TRANSFER']
target_col = 'isFraud'

# Split the training and test data into features (X) and target (y)
X_train = train_data[feature_cols]
y_train = train_data[target_col]
X_test = test_data[feature_cols]
y_test = test_data[target_col]

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display the first few rows of the scaled feature sets
print("X_train_scaled:")
print(X_train_scaled[:5])
print("X_test_scaled:")
print(X_test_scaled[:5])

# Initialize the Gradient Boosting classifier
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Train the model using the scaled training data
gbm.fit(X_train_scaled, y_train)

from joblib import dump

# 保存模型
dump(gbm, 'gbm.joblib')

X_train_scaled:
[[ 0.59843379 -0.34568391 -0.44866192 -0.80042742 -0.06996882  1.71953892
  -0.47538737]
 [ 0.43542625 -0.16381817 -0.44866192  1.24933251 -0.06996882 -0.58155124
  -0.47538737]
 [ 0.31899229 -0.3361753  -0.44866192 -0.80042742 -0.06996882  1.71953892
  -0.47538737]
 [ 0.31899229  0.04749864 -0.44866192  1.24933251 -0.06996882 -0.58155124
  -0.47538737]
 [-1.32854824 -0.34229887 -0.44866192 -0.80042742 -0.06996882  1.71953892
  -0.47538737]]
X_test_scaled:
[[ 0.15016305  0.26323019 -0.44866192 -0.80042742 -0.06996882 -0.58155124
   2.1035477 ]
 [ 0.72651114 -0.27911462  2.22884971 -0.80042742 -0.06996882 -0.58155124
  -0.47538737]
 [ 2.58363279 -0.34192883 -0.44866192 -0.80042742 -0.06996882  1.71953892
  -0.47538737]
 [-0.51351052 -0.21757601 -0.44866192  1.24933251 -0.06996882 -0.58155124
  -0.47538737]
 [ 0.33063568 -0.28943539 -0.44866192  1.24933251 -0.06996882 -0.58155124
  -0.47538737]]


['gbm.joblib']

In [7]:
# nn
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt

# Load the datasets
train_data = pd.read_csv('uncleaned_balanced_train_dataset.csv')
test_data = pd.read_csv('uncleaned_test_dataset.csv')

# Define feature columns and target variable
feature_cols = ['step', 'amount', 'type_CASH_IN', 'type_CASH_OUT', 'type_DEBIT', 'type_PAYMENT', 'type_TRANSFER']

target_col = 'isFraud'

# Split the training and test data into features (X) and target (y)
X_train = train_data[feature_cols]
y_train = train_data[target_col]
X_test = test_data[feature_cols]
y_test = test_data[target_col]

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display the first few rows of the scaled feature sets
print("X_train_scaled:")
print(X_train_scaled[:5])
print("X_test_scaled:")
print(X_test_scaled[:5])

# Initialize the ANN model
model = Sequential()

# Add input layer and first hidden layer
model.add(Dense(units=16, activation='relu', input_dim=X_train_scaled.shape[1]))

# Add second hidden layer
model.add(Dense(units=8, activation='relu'))

# Add output layer
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model using the training data
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2)

from joblib import dump

# 保存模型
dump(model, 'ann.joblib')

X_train_scaled:
[[ 0.59843379 -0.34568391 -0.44866192 -0.80042742 -0.06996882  1.71953892
  -0.47538737]
 [ 0.43542625 -0.16381817 -0.44866192  1.24933251 -0.06996882 -0.58155124
  -0.47538737]
 [ 0.31899229 -0.3361753  -0.44866192 -0.80042742 -0.06996882  1.71953892
  -0.47538737]
 [ 0.31899229  0.04749864 -0.44866192  1.24933251 -0.06996882 -0.58155124
  -0.47538737]
 [-1.32854824 -0.34229887 -0.44866192 -0.80042742 -0.06996882  1.71953892
  -0.47538737]]
X_test_scaled:
[[ 0.15016305  0.26323019 -0.44866192 -0.80042742 -0.06996882 -0.58155124
   2.1035477 ]
 [ 0.72651114 -0.27911462  2.22884971 -0.80042742 -0.06996882 -0.58155124
  -0.47538737]
 [ 2.58363279 -0.34192883 -0.44866192 -0.80042742 -0.06996882  1.71953892
  -0.47538737]
 [-0.51351052 -0.21757601 -0.44866192  1.24933251 -0.06996882 -0.58155124
  -0.47538737]
 [ 0.33063568 -0.28943539 -0.44866192  1.24933251 -0.06996882 -0.58155124
  -0.47538737]]
Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m662/662[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9154 - loss: 0.3222 - val_accuracy: 0.2730 - val_loss: 1.5630
Epoch 2/50
[1m662/662[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9553 - loss: 0.1286 - val_accuracy: 0.3488 - val_loss: 1.4226
Epoch 3/50
[1m662/662[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 868us/step - accuracy: 0.9587 - loss: 0.1209 - val_accuracy: 0.3646 - val_loss: 1.3793
Epoch 4/50
[1m662/662[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9601 - loss: 0.1185 - val_accuracy: 0.4067 - val_loss: 1.3151
Epoch 5/50
[1m662/662[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 884us/step - accuracy: 0.9619 - loss: 0.1096 - val_accuracy: 0.3469 - val_loss: 1.4073
Epoch 6/50
[1m662/662[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 814us/step - accuracy: 0.9598 - loss: 0.1127 - val_accuracy: 0.3863 - val_loss: 1.3864
Epoch 7/50
[1m662/662[0m [32m━

['ann.joblib']