In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from scipy import sparse
from geopy.distance import geodesic
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import f1_score, precision_score, recall_score ,accuracy_score
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, LSTM

In [None]:
train_data = pd.read_csv("fraudTrain.csv")
test_data = pd.read_csv("fraudTest.csv")

# Handle missing values in training data (if any)
train_data = train_data.dropna()

# Handle missing values in test data (if any)
test_data = test_data.dropna()

cat_features = ['merchant', 'category', 'gender', 'city', 'state', 'job']

# One-hot encode categorical features for training data
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_train_data = encoder.fit_transform(train_data[cat_features])

# One-hot encode categorical features for test data
encoded_test_data = encoder.transform(test_data[cat_features])

num_features = ['amt', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long']



In [None]:
# Combine encoded categorical features with numerical features for training data
num_features = ['amt', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long']
transaction_frequency = train_data.groupby('cc_num').size()
train_data['transaction_frequency'] = train_data['cc_num'].map(transaction_frequency)
train_data['trans_date_trans_time'] = pd.to_datetime(train_data['trans_date_trans_time'])
train_data = train_data.sort_values(by=['cc_num', 'trans_date_trans_time'])
train_data['time_since_last_transaction'] = train_data.groupby('cc_num')['trans_date_trans_time'].diff().dt.total_seconds().fillna(0)
train_data['distance_customer_merchant'] = train_data.apply(lambda row: geodesic((row['lat'], row['long']), (row['merch_lat'], row['merch_long'])).miles, axis=1)
combined_train_data = sparse.hstack([encoded_train_data, train_data[num_features]])

In [None]:
transaction_frequency_test = test_data.groupby('cc_num').size()
test_data['transaction_frequency'] = test_data['cc_num'].map(transaction_frequency_test)
test_data['trans_date_trans_time'] = pd.to_datetime(test_data['trans_date_trans_time'])
test_data = test_data.sort_values(by=['cc_num', 'trans_date_trans_time'])
test_data['time_since_last_transaction'] = test_data.groupby('cc_num')['trans_date_trans_time'].diff().dt.total_seconds().fillna(0)
test_data['distance_customer_merchant'] = test_data.apply(lambda row: geodesic((row['lat'], row['long']), (row['merch_lat'], row['merch_long'])).miles, axis=1)
combined_test_data = sparse.hstack([encoded_test_data, test_data[num_features]])

In [None]:

# Resample the training data using SMOTE
smote = SMOTE(random_state=42)
X_resampled_train, y_resampled_train = smote.fit_resample(combined_train_data, train_data['is_fraud'])

# Resample the test data using SMOTE
X_resampled_test, y_resampled_test = smote.fit_resample(combined_test_data, test_data['is_fraud'])

# Standardize numerical features for training data
scaler_train = StandardScaler(with_mean=False)
scaled_train_data = scaler_train.fit_transform(X_resampled_train)

# Standardize numerical features for test data
scaler_test = StandardScaler(with_mean=False)
scaled_test_data = scaler_test.fit_transform(X_resampled_test)

In [None]:
ensemble_models = {
     "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "SupportVectorMachine": SVC(random_state=42)
}

In [None]:
for name, model in ensemble_models.items():
    # Fit the model
    model.fit(X_resampled_train, y_resampled_train)

In [None]:
# Dictionary to store evaluation metrics for each model
evaluation_metrics = {}

# Evaluate each model
for name, model in ensemble_models.items():
    # Make predictions on the test data
    y_pred = model.predict(X_resampled_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_resampled_test, y_pred)
    precision = precision_score(y_resampled_test, y_pred)
    recall = recall_score(y_resampled_test, y_pred)
    f1 = f1_score(y_resampled_test, y_pred)

    # Store evaluation metrics in the dictionary
    evaluation_metrics[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

# Print evaluation metrics for each model
for name, metrics in evaluation_metrics.items():
    print(f"Model: {name}")
    print(f"Accuracy: {metrics['Accuracy']}")
    print(f"Precision: {metrics['Precision']}")
    print(f"Recall: {metrics['Recall']}")
    print(f"F1 Score: {metrics['F1 Score']}")
    print()

Model: LogisticRegression
Accuracy: 0.5
Precision: 0.5
Recall: 1.0
F1 Score: 0.6666666666666666

Model: DecisionTree
Accuracy: 0.7518668726664092
Precision: 0.9860852279786309
Recall: 0.5109437363203296
F1 Score: 0.6731119874485859

Model: RandomForest
Accuracy: 0.6371185786017768
Precision: 1.0
Recall: 0.2742371572035535
F1 Score: 0.4304334646862686

Model: GradientBoosting
Accuracy: 0.9369125788592765
Precision: 0.9785643773797772
Recall: 0.8933951332560834
F1 Score: 0.9340422667923005

Model: AdaBoost
Accuracy: 0.8623986095017381
Precision: 0.945054945054945
Recall: 0.7695377880777649
F1 Score: 0.8483128126885001

Model: SupportVectorMachine
Accuracy: 0.5
Precision: 0.5
Recall: 1.0
F1 Score: 0.6666666666666666



In [None]:
# Example of CNN
model_cnn = Sequential()
model_cnn.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_resampled_train.shape[1], 1)))
model_cnn.add(MaxPooling1D(pool_size=2))
model_cnn.add(Flatten())
model_cnn.add(Dense(50, activation='relu'))
model_cnn.add(Dense(1, activation='sigmoid'))

# Example of RNN (LSTM)
model_rnn = Sequential()
model_rnn.add(LSTM(50, input_shape=(X_resampled_train.shape[1], 1)))
model_rnn.add(Dense(1, activation='sigmoid'))

# Compile and fit the CNN model
model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Reshape the data for CNN
if sparse.issparse(X_resampled_train):
    X_train_reshaped_cnn = X_resampled_train.toarray().reshape((X_resampled_train.shape[0], X_resampled_train.shape[1], 1))
else:
    X_train_reshaped_cnn = X_resampled_train.reshape((X_resampled_train.shape[0], X_resampled_train.shape[1], 1))

model_cnn.fit(X_train_reshaped_cnn, y_resampled_train, epochs=10, batch_size=32)

# Compile and fit the RNN model
model_rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Reshape the data for RNN
if sparse.issparse(X_resampled_train):
    X_train_reshaped_rnn = X_resampled_train.toarray().reshape((X_resampled_train.shape[0], X_resampled_train.shape[1], 1))
else:
    X_train_reshaped_rnn = X_resampled_train.reshape((X_resampled_train.shape[0], X_resampled_train.shape[1], 1))

model_rnn.fit(X_train_reshaped_rnn, y_resampled_train, epochs=10, batch_size=32)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7ae12821ad40>

In [37]:
# Convert sparse matrix to dense array
X_resampled_test_dense = X_resampled_test.toarray()

# Reshape the test data for CNN & RNN
X_resampled_test_reshaped = X_resampled_test_dense.reshape((X_resampled_test_dense.shape[0], X_resampled_test_dense.shape[1], 1))

# Predict using the CNN model
y_pred_cnn = model_cnn.predict(X_resampled_test_reshaped)

# Convert probabilities to binary predictions (0 or 1)
y_pred_cnn_binary = (y_pred_cnn > 0.5).astype(int)

# Calculate metrics for CNN model
accuracy_cnn = accuracy_score(y_resampled_test, y_pred_cnn_binary)
precision_cnn = precision_score(y_resampled_test, y_pred_cnn_binary)
recall_cnn = recall_score(y_resampled_test, y_pred_cnn_binary)
f1_cnn = f1_score(y_resampled_test, y_pred_cnn_binary)

# Predict using the RNN model
y_pred_rnn = model_rnn.predict(X_resampled_test_reshaped)

# Convert probabilities to binary predictions (0 or 1)
y_pred_rnn_binary = (y_pred_rnn > 0.5).astype(int)

# Calculate metrics for RNN model
accuracy_rnn = accuracy_score(y_resampled_test, y_pred_rnn_binary)
precision_rnn = precision_score(y_resampled_test, y_pred_rnn_binary)
recall_rnn = recall_score(y_resampled_test, y_pred_rnn_binary)
f1_rnn = f1_score(y_resampled_test, y_pred_rnn_binary)

# Print metrics for CNN model
print("CNN Model Metrics:")
print("Accuracy:", accuracy_cnn)

# Print metrics for RNN model
print("\nRNN Model Metrics:")
print("Accuracy:", accuracy_rnn)




  _warn_prf(average, modifier, msg_start, len(result))


CNN Model Metrics:
Accuracy: 0.5

RNN Model Metrics:
Accuracy: 0.8759173426033218
