**Data Collection and Preprocessing**

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy import sparse
from geopy.distance import geodesic
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [22]:
train_data = pd.read_csv("fraudTrain.csv")
test_data = pd.read_csv("fraudTest.csv")


train_data = train_data.dropna()

test_data = test_data.dropna()

cat_features = ['merchant', 'category', 'gender', 'city', 'state', 'job']

encoder = OneHotEncoder(handle_unknown='ignore')
encoded_train_data = encoder.fit_transform(train_data[cat_features])

encoded_test_data = encoder.transform(test_data[cat_features])

num_features = ['amt', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long']



**Feature Engineering**

In [23]:
# Feature Engineering to add new columns for train data
num_features = ['amt', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long']
transaction_frequency = train_data.groupby('cc_num').size()
train_data['transaction_frequency'] = train_data['cc_num'].map(transaction_frequency)
train_data['trans_date_trans_time'] = pd.to_datetime(train_data['trans_date_trans_time'])
train_data = train_data.sort_values(by=['cc_num', 'trans_date_trans_time'])
train_data['time_since_last_transaction'] = train_data.groupby('cc_num')['trans_date_trans_time'].diff().dt.total_seconds().fillna(0)
train_data['distance_customer_merchant'] = train_data.apply(lambda row: geodesic((row['lat'], row['long']), (row['merch_lat'], row['merch_long'])).miles, axis=1)
combined_train_data = sparse.hstack([encoded_train_data, train_data[num_features]])

In [24]:
# Feature Engineering to add new columns for test data
transaction_frequency_test = test_data.groupby('cc_num').size()
test_data['transaction_frequency'] = test_data['cc_num'].map(transaction_frequency_test)
test_data['trans_date_trans_time'] = pd.to_datetime(test_data['trans_date_trans_time'])
test_data = test_data.sort_values(by=['cc_num', 'trans_date_trans_time'])
test_data['time_since_last_transaction'] = test_data.groupby('cc_num')['trans_date_trans_time'].diff().dt.total_seconds().fillna(0)
test_data['distance_customer_merchant'] = test_data.apply(lambda row: geodesic((row['lat'], row['long']), (row['merch_lat'], row['merch_long'])).miles, axis=1)
combined_test_data = sparse.hstack([encoded_test_data, test_data[num_features]])

**Synthetic Minority Over-sampling Technique (SMOTE) For resampling the data to balance the classes**

In [25]:
from imblearn.over_sampling import SMOTE

In [26]:

# Resampling of train Data
smote = SMOTE(random_state=42)
X_resampled_train, y_resampled_train = smote.fit_resample(combined_train_data, train_data['is_fraud'])

# Resampling of test Data
X_resampled_test, y_resampled_test = smote.fit_resample(combined_test_data, test_data['is_fraud'])


scaler_train = StandardScaler(with_mean=False)
scaled_train_data = scaler_train.fit_transform(X_resampled_train)


scaler_test = StandardScaler(with_mean=False)
scaled_test_data = scaler_test.fit_transform(X_resampled_test)

**Machine Learning Models**

In [27]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

**Ensemble Learning**

In [30]:
ensemble_models = {
     "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
}

In [33]:
for name, model in ensemble_models.items():
    model.fit(X_resampled_train, y_resampled_train)

In [34]:
from sklearn.metrics import f1_score, precision_score, recall_score ,accuracy_score

In [35]:
evaluation_metrics = {}

for name, model in ensemble_models.items():

    y_pred = model.predict(X_resampled_test)
    accuracy = accuracy_score(y_resampled_test, y_pred)
    precision = precision_score(y_resampled_test, y_pred)
    recall = recall_score(y_resampled_test, y_pred)
    f1 = f1_score(y_resampled_test, y_pred)

    evaluation_metrics[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

for name, metrics in evaluation_metrics.items():
    print(f"Model: {name}")
    print(f"Accuracy: {metrics['Accuracy']}")
    print(f"Precision: {metrics['Precision']}")
    print(f"Recall: {metrics['Recall']}")
    print(f"F1 Score: {metrics['F1 Score']}")
    print()

  _warn_prf(average, modifier, msg_start, len(result))


Model: LogisticRegression
Accuracy: 0.5
Precision: 0.0
Recall: 0.0
F1 Score: 0.0

Model: RandomForest
Accuracy: 0.9663882741053159
Precision: 1.0
Recall: 0.9327765482106318
F1 Score: 0.9652192324810627

Model: GradientBoosting
Accuracy: 0.9711988975654571
Precision: 0.9781271186440678
Recall: 0.9639537311563035
F1 Score: 0.9709887059120449



**Save the models**


In [None]:
import pickle
for name, model in ensemble_models.items():
    with open(f'{name}_model.pkl', 'wb') as file:
        pickle.dump(model, file)

**Deep learning models**

In [13]:
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, LSTM

**Convolutional Neural Networks (CNNs) or Recurrent Neural Networks (RNNs)**

In [14]:
# CNN
model_cnn = Sequential()
model_cnn.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_resampled_train.shape[1], 1)))
model_cnn.add(MaxPooling1D(pool_size=2))
model_cnn.add(Flatten())
model_cnn.add(Dense(50, activation='relu'))
model_cnn.add(Dense(1, activation='sigmoid'))

# Compile the CNN model
model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Reshape the data for CNN
if sparse.issparse(X_resampled_train):
    X_train_reshaped_cnn = X_resampled_train.toarray().reshape((X_resampled_train.shape[0], X_resampled_train.shape[1], 1))
else:
    X_train_reshaped_cnn = X_resampled_train.reshape((X_resampled_train.shape[0], X_resampled_train.shape[1], 1))

model_cnn.fit(X_train_reshaped_cnn, y_resampled_train, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7b7f21113b50>

In [15]:
# RNN
model_rnn = Sequential()
model_rnn.add(LSTM(50, input_shape=(X_resampled_train.shape[1], 1)))
model_rnn.add(Dense(1, activation='sigmoid'))

# Compile the RNN model
model_rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Reshape the data for RNN
if sparse.issparse(X_resampled_train):
    X_train_reshaped_rnn = X_resampled_train.toarray().reshape((X_resampled_train.shape[0], X_resampled_train.shape[1], 1))
else:
    X_train_reshaped_rnn = X_resampled_train.reshape((X_resampled_train.shape[0], X_resampled_train.shape[1], 1))

model_rnn.fit(X_train_reshaped_rnn, y_resampled_train, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7b7ebc148b80>

In [16]:

X_resampled_test_dense = X_resampled_test.toarray()

# Reshape the test data for CNN & RNN
X_resampled_test_reshaped = X_resampled_test_dense.reshape((X_resampled_test_dense.shape[0], X_resampled_test_dense.shape[1], 1))

y_pred_cnn = model_cnn.predict(X_resampled_test_reshaped)

# Convert probabilities to binary predictions (0 or 1)
y_pred_cnn_binary = (y_pred_cnn > 0.5).astype(int)

accuracy_cnn = accuracy_score(y_resampled_test, y_pred_cnn_binary)
precision_cnn = precision_score(y_resampled_test, y_pred_cnn_binary)
recall_cnn = recall_score(y_resampled_test, y_pred_cnn_binary)
f1_cnn = f1_score(y_resampled_test, y_pred_cnn_binary)

print("CNN Model Metrics:")
print("Accuracy:", accuracy_cnn)




CNN Model Metrics:
Accuracy: 0.5


In [17]:
y_pred_rnn = model_rnn.predict(X_resampled_test_reshaped)

# Convert probabilities to binary predictions (0 or 1)
y_pred_rnn_binary = (y_pred_rnn > 0.5).astype(int)

accuracy_rnn = accuracy_score(y_resampled_test, y_pred_rnn_binary)
precision_rnn = precision_score(y_resampled_test, y_pred_rnn_binary)
recall_rnn = recall_score(y_resampled_test, y_pred_rnn_binary)
f1_rnn = f1_score(y_resampled_test, y_pred_rnn_binary)

print("\nRNN Model Metrics:")
print("Accuracy:", accuracy_rnn)


RNN Model Metrics:
Accuracy: 0.8351332819062313


**Save the models**


In [18]:
import pickle

In [19]:
with open('cnn_model.pkl', 'wb') as file:
    pickle.dump(model_cnn, file)

In [20]:
with open('rnn_model.pkl', 'wb') as file:
    pickle.dump(model_rnn, file)