## AUTOENCODER-BASED ANOMALY DETECTION

### 1. Import Libraries

In [40]:
import numpy as np
np.float_ = np.float64


In [42]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns




In [46]:
! pip3 install tensorflow
import tensorflow as tf




AttributeError: `np.string_` was removed in the NumPy 2.0 release. Use `np.bytes_` instead.

### 2.Load Dataset

In [20]:
# Load the CSV file
df = pd.read_csv("SAML-D.csv")
df.head()

Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type
0,10:35:19,07/10/2022,8724731955,2769355426,1459.15,UK pounds,UK pounds,UK,UK,Cash Deposit,0,Normal_Cash_Deposits
1,10:35:20,07/10/2022,1491989064,8401255335,6019.64,UK pounds,Dirham,UK,UAE,Cross-border,0,Normal_Fan_Out
2,10:35:20,07/10/2022,287305149,4404767002,14328.44,UK pounds,UK pounds,UK,UK,Cheque,0,Normal_Small_Fan_Out
3,10:35:21,07/10/2022,5376652437,9600420220,11895.0,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Fan_In
4,10:35:21,07/10/2022,9614186178,3803336972,115.25,UK pounds,UK pounds,UK,UK,Cash Deposit,0,Normal_Cash_Deposits


### 3.Preprocessing the Data

In [21]:
# Drop missing values
df.dropna(inplace=True)

# Encode 'Payment_currency' if it exists
if 'Payment_currency' in df.columns:
    df['Payment_currency'] = LabelEncoder().fit_transform(df['Payment_currency'])

# Select numeric columns only
numeric_cols = df.select_dtypes(include=[np.number]).columns
df_numeric = df[numeric_cols]

# Scale numeric values
scaler = StandardScaler()
df_scaled_values = scaler.fit_transform(df_numeric)

# Create scaled DataFrame
df_scaled = pd.DataFrame(df_scaled_values, columns=numeric_cols)
df_scaled.head()




Unnamed: 0,Sender_account,Receiver_account,Amount,Payment_currency,Is_laundering
0,1.286181,-0.785216,-0.296513,0.163899,-0.030208
1,-1.216551,1.168581,-0.109958,0.163899,-0.030208
2,-1.633406,-0.217866,0.229929,0.163899,-0.030208
3,0.127652,1.584591,0.130385,0.163899,-0.030208
4,1.593958,-0.426512,-0.351488,0.163899,-0.030208


### 4. Build Autoencoder Model

In [22]:
# Define model architecture
input_dim = df_scaled.shape[1]
input_layer = Input(shape=(input_dim,))

# Encoder
encoded = Dense(16, activation='relu')(input_layer)
encoded = Dense(8, activation='relu')(encoded)

# Decoder
decoded = Dense(16, activation='relu')(encoded)
decoded = Dense(input_dim, activation='linear')(decoded)

# Compile model
autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mse')



NameError: name 'Input' is not defined

### 5.Train Autoencoder

In [None]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

# Check GPU availability
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))

# Set early stopping to prevent overfitting and reduce training time
early_stopping = EarlyStopping(
    monitor='val_loss',     # Watch validation loss
    patience=5,             # Stop after 5 epochs with no improvement
    restore_best_weights=True
)

# Train autoencoder with early stopping and visible training progress
history = autoencoder.fit(
    df_scaled, df_scaled,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    shuffle=True,
    verbose=1,              # Shows progress bar during training
    callbacks=[early_stopping]
)


### 6. Plot Training and Validation Loss

In [None]:
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.title('Autoencoder Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

### 7. Compute Reconstruction Error

In [None]:
# Predict and calculate reconstruction error
reconstructions = autoencoder.predict(df_scaled)
mse = np.mean(np.square(df_scaled - reconstructions), axis=1)

# Add error to dataframe
df_scaled['reconstruction_error'] = mse



### 8. Detect Anomalies Based on Threshold

In [None]:
# Set threshold and detect anomalies
threshold = np.percentile(mse, 95)
df_scaled['anomaly'] = (df_scaled['reconstruction_error'] > threshold).astype(int)

print("Anomalies Detected:", df_scaled['anomaly'].sum())
df_scaled[['reconstruction_error', 'anomaly']].head()


## LIGHTGBM-BASED CLASSIFICATION MODEL

### 1. Import librarise

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import lightgbm as lgb

# Split features and labels
X = df_scaled.drop(columns=['reconstruction_error', 'anomaly'])
y = df_scaled['anomaly']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



### 2. Build & Train LightGBM Model

In [None]:
# Create LightGBM datasets
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

# Define parameters
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbosity': -1
}

# Train model
model = lgb.train(params,
                  lgb_train,
                  valid_sets=[lgb_train, lgb_eval],
                  num_boost_round=100,
                  early_stopping_rounds=10,
                  verbose_eval=False)


### 3. Evaluate Model

In [None]:
# Predict and convert to binary
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)

# Print classification report
print(classification_report(y_test, y_pred_binary))


### 4. Confusion Matrix

In [None]:
sns.heatmap(confusion_matrix(y_test, y_pred_binary), annot=True, fmt='d', cmap='Blues')
plt.title('LightGBM Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()



### 5.  Feature Importance

In [None]:
lgb.plot_importance(model, max_num_features=10)
plt.title("LightGBM Feature Importance")
plt.show()
