In [28]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
import joblib
import random
from sklearn.exceptions import NotFittedError


In [29]:
# Load your dataset
data = pd.read_csv('synthetic.csv', low_memory=False)

# Handling missing values (if any)
data = data.dropna()



In [30]:
data

Unnamed: 0,transaction_amount,transaction_frequency,distance,account_age_days,transaction_recency,unusual_activity_flag,num_unique_devices,num_unique_locations,blacklist_whitelist_status,transaction_amount_deviation,credit_score,account_status,is_fraud,fraud_score
0,639.192309,28,3.615184,3200,94,1,4,6,blacklist,26.533051,383,inactive,0,0.799771
1,576.710111,38,277.190442,1271,113,1,6,7,blacklist,224.145688,625,inactive,0,0.085106
2,801.978725,21,142.865170,540,273,0,2,7,blacklist,335.395544,432,active,1,0.971147
3,38.896766,47,484.538909,407,71,1,7,7,whitelist,387.634134,665,active,1,0.216646
4,441.023938,26,535.372215,3526,53,0,9,9,blacklist,162.939910,540,active,1,0.440403
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,847.855925,31,917.351723,1382,169,1,7,7,blacklist,60.612207,763,inactive,0,0.488629
4996,975.109237,13,214.668626,1512,110,0,2,1,blacklist,65.383360,816,inactive,1,0.854843
4997,148.035754,15,775.262085,2492,155,0,7,2,blacklist,450.907835,801,inactive,0,0.465546
4998,641.644769,49,827.132371,1897,135,1,2,9,whitelist,249.279502,500,active,1,0.264145


In [31]:


# List of essential features to retain
essential_features = [
    'transaction_amount',
    'transaction_frequency',
    'distance',
    'account_age_days',
    'transaction_recency',
    'unusual_activity_flag',
    'num_unique_devices',
    'num_unique_locations',
    'blacklist_whitelist_status',
    'transaction_amount_deviation',
    'credit_score',
    'account_status'
]


In [32]:
# Combine essential features and additional columns
columns_to_keep = essential_features + ['is_fraud']

# Ensure all specified columns are in the dataset
for column in columns_to_keep:
    if column not in data.columns:
        raise ValueError(f"Column '{column}' is missing from the dataset.")

# Drop all columns except the essential features
data_filtered = data[columns_to_keep]

# Define feature columns and target column
feature_columns = essential_features
target_column = 'is_fraud'

# Split the data into features (X) and target (y)
X = data_filtered[feature_columns]
y = data_filtered[target_column]


In [33]:
X

Unnamed: 0,transaction_amount,transaction_frequency,distance,account_age_days,transaction_recency,unusual_activity_flag,num_unique_devices,num_unique_locations,blacklist_whitelist_status,transaction_amount_deviation,credit_score,account_status
0,639.192309,28,3.615184,3200,94,1,4,6,blacklist,26.533051,383,inactive
1,576.710111,38,277.190442,1271,113,1,6,7,blacklist,224.145688,625,inactive
2,801.978725,21,142.865170,540,273,0,2,7,blacklist,335.395544,432,active
3,38.896766,47,484.538909,407,71,1,7,7,whitelist,387.634134,665,active
4,441.023938,26,535.372215,3526,53,0,9,9,blacklist,162.939910,540,active
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,847.855925,31,917.351723,1382,169,1,7,7,blacklist,60.612207,763,inactive
4996,975.109237,13,214.668626,1512,110,0,2,1,blacklist,65.383360,816,inactive
4997,148.035754,15,775.262085,2492,155,0,7,2,blacklist,450.907835,801,inactive
4998,641.644769,49,827.132371,1897,135,1,2,9,whitelist,249.279502,500,active


In [34]:
label_encoder = LabelEncoder()
X['blacklist_whitelist_status'] = label_encoder.fit_transform(X['blacklist_whitelist_status'])
X['account_status'] = label_encoder.fit_transform(X['account_status'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['blacklist_whitelist_status'] = label_encoder.fit_transform(X['blacklist_whitelist_status'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['account_status'] = label_encoder.fit_transform(X['account_status'])


In [35]:
X

Unnamed: 0,transaction_amount,transaction_frequency,distance,account_age_days,transaction_recency,unusual_activity_flag,num_unique_devices,num_unique_locations,blacklist_whitelist_status,transaction_amount_deviation,credit_score,account_status
0,639.192309,28,3.615184,3200,94,1,4,6,0,26.533051,383,1
1,576.710111,38,277.190442,1271,113,1,6,7,0,224.145688,625,1
2,801.978725,21,142.865170,540,273,0,2,7,0,335.395544,432,0
3,38.896766,47,484.538909,407,71,1,7,7,1,387.634134,665,0
4,441.023938,26,535.372215,3526,53,0,9,9,0,162.939910,540,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,847.855925,31,917.351723,1382,169,1,7,7,0,60.612207,763,1
4996,975.109237,13,214.668626,1512,110,0,2,1,0,65.383360,816,1
4997,148.035754,15,775.262085,2492,155,0,7,2,0,450.907835,801,1
4998,641.644769,49,827.132371,1897,135,1,2,9,1,249.279502,500,0


In [36]:
# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Display the resampled data
print(pd.DataFrame(X_resampled, columns=feature_columns).head())
print(pd.DataFrame(y_resampled, columns=[target_column]).head())

# Display the resampled class distribution
print(pd.Series(y_resampled).value_counts())


   transaction_amount  transaction_frequency    distance  account_age_days  \
0          639.192309                     28    3.615184              3200   
1          576.710111                     38  277.190442              1271   
2          801.978725                     21  142.865170               540   
3           38.896766                     47  484.538909               407   
4          441.023938                     26  535.372215              3526   

   transaction_recency  unusual_activity_flag  num_unique_devices  \
0                   94                      1                   4   
1                  113                      1                   6   
2                  273                      0                   2   
3                   71                      1                   7   
4                   53                      0                   9   

   num_unique_locations  blacklist_whitelist_status  \
0                     6                           0   
1     

In [37]:

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=3)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [25]:
# print the values of the target column which have one of the values of the target column

In [38]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   transaction_amount            5000 non-null   float64
 1   transaction_frequency         5000 non-null   int64  
 2   distance                      5000 non-null   float64
 3   account_age_days              5000 non-null   int64  
 4   transaction_recency           5000 non-null   int64  
 5   unusual_activity_flag         5000 non-null   int64  
 6   num_unique_devices            5000 non-null   int64  
 7   num_unique_locations          5000 non-null   int64  
 8   blacklist_whitelist_status    5000 non-null   int64  
 9   transaction_amount_deviation  5000 non-null   float64
 10  credit_score                  5000 non-null   int64  
 11  account_status                5000 non-null   int64  
dtypes: float64(3), int64(9)
memory usage: 468.9 KB


In [41]:
from sklearn.exceptions import NotFittedError
# Fit the scaler
scaler = StandardScaler()
scaler.fit(X)

def get_user_input():
    return {
        'amount': float(input("Enter transaction amount: ")),
        'balance': float(input("Enter current balance: ")),
        'credit_score': float(input("Enter credit score (300-850): "))
    }

def enrich_transaction_data(user_input):
    # Simulate additional data
    transaction = user_input.copy()
    transaction['oldbalanceOrg'] = transaction['balance']
    transaction['newbalanceOrig'] = transaction['balance'] - transaction['amount']
    transaction['oldbalanceDest'] = random.uniform(0, 10000)  # Simulated receiver's old balance
    transaction['newbalanceDest'] = transaction['oldbalanceDest'] + transaction['amount']
    transaction['transaction_frequency'] = random.randint(1, 50)  # Simulated frequency
    transaction['transaction_recency'] = random.randint(1, 365)  # Days since last transaction
    transaction['distance'] = random.uniform(0, 1000)  # Simulated distance
    transaction['unusual_activity_flag'] = 1 if random.random() < 0.1 else 0  # 10% chance of unusual activity
    transaction['num_unique_devices'] = random.randint(1, 10)  # Simulated number of unique devices
    transaction['num_unique_locations'] = random.randint(1, 10)  # Simulated number of unique locations
    transaction['blacklist_whitelist_status'] = random.choice([0, 1])  # 0: blacklist, 1: whitelist
    transaction['transaction_amount_deviation'] = random.uniform(0, 500)  # Simulated amount deviation
    transaction['account_status'] = random.choice([0, 1])  # 0: inactive, 1: active
    transaction['is_fraud'] = random.randint(0, 1)  # Simulated fraud status

    return transaction
def preprocess_transaction(transaction, scaler):
    features = [  'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest',
                'transaction_frequency', 'transaction_recency', 'distance', 'unusual_activity_flag',
                'num_unique_devices', 'num_unique_locations', 'blacklist_whitelist_status',
                'transaction_amount_deviation',  'account_status']

    # Ensure transaction dictionary has all required features
    assert all(feature in transaction for feature in features), "Missing features in the transaction data"

    transaction_values = np.array([transaction[feature] for feature in features]).reshape(1, -1)
    try:
        transaction_scaled = scaler.transform(transaction_values)
    except NotFittedError as e:
        print(f"Error: {e}")
        return None
    return transaction_scaled


# Example usage
if __name__ == "__main__":
    user_input = get_user_input()
    enriched_transaction = enrich_transaction_data(user_input)
    preprocessed_transaction = preprocess_transaction(enriched_transaction, scaler)
    if preprocessed_transaction is not None:
        print(preprocessed_transaction)

[[ 1.30219199e+02  7.13468364e+01  8.74757354e+00 -1.73895034e+00
  -8.77992559e-01  6.10423068e+02 -1.96676658e+00  7.84346595e-01
   1.09964713e+01 -1.76912411e+00 -5.83995561e-01 -9.66558833e-01]]




In [42]:
print(data['is_fraud'].value_counts())

is_fraud
0    2547
1    2453
Name: count, dtype: int64


In [44]:
# Save the scaler
joblib.dump(scaler, 'scaler1.pkl')


['scaler1.pkl']

In [45]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Input

def create_model(input_shape):
    model = Sequential([
        Input(shape=(input_shape,)),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(32, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(16, activation='relu'),
        BatchNormalization(),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model

In [46]:
# Calculate class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(enumerate(class_weights))

model = create_model(X_train_scaled.shape[1])
model.summary()



In [47]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

def train_model(X, y, model, n_splits=5, epochs=50, batch_size=32, class_weight=None):
    skf = StratifiedKFold(n_splits=n_splits)
    
    X = np.array(X)
    y = np.array(y)
    
    histories = []
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f'Training on fold {fold + 1}/{n_splits}')
        
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        callbacks = [
            EarlyStopping(patience=10, restore_best_weights=True),
            ReduceLROnPlateau(factor=0.5, patience=5, min_lr=0.00001)
        ]
        
        history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=batch_size,
            class_weight=class_weight,
            callbacks=callbacks,
            verbose=0
        )
        
        histories.append(history)
    
    return histories


In [48]:
histories = train_model(X_train_scaled, y_train, model, class_weight=class_weights)


Training on fold 1/5
Training on fold 2/5
Training on fold 3/5
Training on fold 4/5
Training on fold 5/5


In [49]:

model.save('fraud_detection_model1.keras')


In [50]:
# Load the model
model = load_model('fraud_detection_model1.keras')
scaler = joblib.load('scaler1.pkl')


In [53]:
# Make prediction
prediction = model.predict(preprocessed_transaction)
fraud_score = prediction[0][0]
print(fraud_score)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
1.0


In [54]:
def generate_transaction_score(transaction_scaled):
    # Ensure transaction_scaled is in the correct shape for prediction
    transaction_scaled = np.array(transaction_scaled).reshape(1, -1)  # Ensure it’s a 2D array
    transaction_score = model.predict(transaction_scaled)[0][0]
    return transaction_score


def detect_fraud(transaction_score, threshold=0.3):
    return transaction_score >= threshold


In [58]:
def main():
    user_input = get_user_input()
    enriched_transaction = enrich_transaction_data(user_input)
    preprocessed_transaction = preprocess_transaction(enriched_transaction, scaler)

    if preprocessed_transaction is not None:
        transaction_score = generate_transaction_score(preprocessed_transaction)
        is_fraud = detect_fraud(transaction_score)

        print("Transaction Score:", transaction_score)
        print("Fraud Detected:", is_fraud)
    else:
        print("Error: Preprocessed transaction is None. Check feature consistency.")

if __name__ == "__main__":
    main()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
Transaction Score: 1.0
Fraud Detected: True


