In [None]:
# Read the service-principal secret through the Databricks secret scope
# (scope = scope name in Databricks, key = secret name in Key Vault).
service_credential = dbutils.secrets.get(scope="dev_env", key="adls-secret")

# Azure AD application (client) ID and directory (tenant) ID.
application_id = 'app_id'
directory_id = 'directory_id'

# OAuth2 token endpoint for the directory above.
oauth_token_endpoint = f"https://login.microsoftonline.com/{directory_id}/oauth2/token"

# Hadoop ABFS settings for OAuth client-credentials authentication;
# consumed as `extra_configs` when mounting the storage container.
configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": application_id,
    "fs.azure.account.oauth2.client.secret": str(service_credential),
    "fs.azure.account.oauth2.client.endpoint": oauth_token_endpoint,
}

In [None]:
# Mount the gold-layer container, replacing any mount already present at the
# same mount point.
mount_source = "abfss://gold-layer@saibdpadls.dfs.core.windows.net/"
mount_point = "/mnt/saibdpadls/gold-layer"

# Inspect the mount table explicitly instead of wrapping the mount call in a
# bare `except:`. That way authentication, network or configuration failures
# are raised to the reader rather than being reported as "already exists".
already_mounted = any(
    mount_info.mountPoint == mount_point for mount_info in dbutils.fs.mounts()
)

if already_mounted:
    dbutils.fs.unmount(mount_point)
    print("mount point already exists")

dbutils.fs.mount(
    source=mount_source,
    mount_point=mount_point,
    extra_configs=configs,
)

if already_mounted:
    print("mount point re-mounted")

/mnt/saibdpadls/gold-layer has been unmounted.
mount point already exists
mount point re-mounted


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
import pickle

# Load dataset
df = pd.read_csv("/dbfs/mnt/saibdpadls/gold-layer/synthetic_transactions.csv")

# Parse the timestamp, derive hour/day/month features from it, then drop the
# raw timestamp and the transaction ID (new columns are appended in this order).
df["Timestamp"] = pd.to_datetime(df["Timestamp"])
df = df.assign(
    Hour=df["Timestamp"].dt.hour,
    Day=df["Timestamp"].dt.day,
    Month=df["Timestamp"].dt.month,
).drop(columns=["Timestamp", "TransactionID"])


def _duration_to_seconds(duration):
    """Convert a colon-separated duration string to a total number of seconds.

    The fields are weighted 3600, 60 and 1 in order of appearance; any field
    after the third is ignored because `zip` stops at the shorter sequence.
    """
    weights = (3600, 60, 1)
    fields = duration.split(":")
    return sum(int(field) * weight for field, weight in zip(fields, weights))


# Convert Duration to seconds
df["Duration"] = df["Duration"].apply(_duration_to_seconds)

# One-hot encode the categorical features, dropping the first level of each
categorical_cols = ["Location", "CardType", "TransactionType", "ProductCategory"]
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Encode target variable (Benign = 0, Suspicious = 1)
label_encoder = LabelEncoder().fit(df_encoded["Label"])
df_encoded["Label"] = label_encoder.transform(df_encoded["Label"])

# Separate features from the target, then make a stratified 80/20 split
X = df_encoded.drop(columns=["Label"])
y = df_encoded["Label"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler so the same scaling can be reapplied later
scaler_file = "/dbfs/mnt/saibdpadls/ml-model/scaler.pkl"
with open(scaler_file, mode="wb") as scaler_out:
    pickle.dump(scaler, scaler_out)

print(f"Scaler has been saved to {scaler_file}")

# Seed for the estimators that use randomness, so that re-running the notebook
# trains (and therefore selects and saves) the same model every time.
MODEL_RANDOM_STATE = 42

# Candidate models, keyed by a human-readable name
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=MODEL_RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(random_state=MODEL_RANDOM_STATE),
    "K-Nearest Neighbors": KNeighborsClassifier(),
}

# Train every candidate and keep the one with the fewest false negatives.
# NOTE(review): the selection looks only at false negatives measured on the
# test split, so a model that flags every row as positive would win, and the
# test split no longer gives an unbiased estimate — consider selecting on a
# separate validation split and taking false positives into account.
best_model = None
best_model_name = None
best_false_negatives = float("inf")

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # Binary confusion matrix flattened in the order tn, fp, fn, tp
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    # Strict comparison: on a tie the earlier model in `models` is kept
    if fn < best_false_negatives:
        best_false_negatives = fn
        best_model = model
        best_model_name = name

print(f"Selected model: {best_model_name} (false negatives on test split: {best_false_negatives})")

def _write_pickle(payload, destination):
    """Serialize `payload` with pickle into the file at `destination`."""
    with open(destination, "wb") as handle:
        pickle.dump(payload, handle)


# Persist the selected model
best_model_file = "/dbfs/mnt/saibdpadls/ml-model/best_fraud_model.pkl"
_write_pickle(best_model, best_model_file)

print(f"The best model has been saved to {best_model_file}")

# Persist the feature column names, in the order used for training
training_columns_file = "/dbfs/mnt/saibdpadls/ml-model/training_columns.pkl"
_write_pickle(X.columns.tolist(), training_columns_file)

print(f"Training columns have been saved to {training_columns_file}")


Scaler has been saved to /dbfs/mnt/saibdpadls/ml-model/scaler.pkl


Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f84454580e0>
Traceback (most recent call last):
  File "/databricks/python/lib/python3.11/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/databricks/python/lib/python3.11/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/databricks/python/lib/python3.11/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
                   ^^^^^^^^^^^^^^^^^^
  File "/databricks/python/lib/python3.11/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
             ^^^^^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'split'


The best model has been saved to /dbfs/mnt/saibdpadls/ml-model/best_fraud_model.pkl
Training columns have been saved to /dbfs/mnt/saibdpadls/ml-model/training_columns.pkl
