In [15]:
!pip install protobuf==3.20.3



In [16]:
# generate_data.py
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import pickle
import os
from uuid import uuid4

DATA_DIR = "data"
os.makedirs(DATA_DIR, exist_ok=True)

NUM_RECORDS = 20000
N_FEATURES = 25
FRAUD_RATE = 0.03   # 3% fraud

np.random.seed(42)

# ---------------------------------------------
# 1. REALISTIC FEATURE DEFINITIONS
# ---------------------------------------------

def generate_normal_behavior(n):
    return {
        "time_since_last_login": np.random.gamma(3, 20, n),
        "failed_attempts": np.random.poisson(0.2, n),
        "session_length_sec": np.random.normal(300, 50, n),
        "ip_risk_score": np.random.normal(0.1, 0.05, n),
        "device_change_flag": np.random.binomial(1, 0.05, n),
        "browser_change_flag": np.random.binomial(1, 0.03, n),
        "click_rate": np.random.normal(2.0, 0.5, n),
        "scroll_depth": np.random.normal(0.6, 0.1, n),
        "typing_speed_wpm": np.random.normal(40, 5, n),
        "distance_from_last_location_km": np.random.exponential(2, n),
        "country_risk_score": np.random.normal(0.05, 0.02, n),
        "pressure_variation": np.random.normal(0.2, 0.05, n),
        "touch_speed": np.random.normal(0.3, 0.07, n),
        "mouse_travel_distance": np.random.normal(200, 50, n),
        "hour_of_day": np.random.randint(0, 24, n),
        "day_of_week": np.random.randint(0, 7, n),
        "cpu_usage_pct": np.random.normal(20, 5, n),
        "ram_usage_pct": np.random.normal(30, 5, n),
        "network_latency_ms": np.random.normal(50, 10, n),
        "num_active_sessions": np.random.poisson(1, n),
        "past_fraud_attempts": np.random.poisson(0.1, n),
        "velocity_login_per_hr": np.random.normal(3, 1, n),
        "avg_transaction_value": np.random.normal(1000, 200, n),
        "risk_from_history": np.random.normal(0.1, 0.03, n),
    }

def generate_fraud_behavior(n):
    return {
        "time_since_last_login": np.random.exponential(1, n),
        "failed_attempts": np.random.poisson(3, n),
        "session_length_sec": np.random.normal(30, 10, n),
        "ip_risk_score": np.random.normal(0.9, 0.1, n),
        "device_change_flag": np.random.binomial(1, 0.8, n),
        "browser_change_flag": np.random.binomial(1, 0.7, n),
        "click_rate": np.random.normal(10, 2, n),
        "scroll_depth": np.random.normal(0.2, 0.05, n),
        "typing_speed_wpm": np.random.normal(90, 20, n),
        "distance_from_last_location_km": np.random.exponential(200, n),
        "country_risk_score": np.random.normal(0.9, 0.05, n),
        "pressure_variation": np.random.normal(0.05, 0.02, n),
        "touch_speed": np.random.normal(0.8, 0.1, n),
        "mouse_travel_distance": np.random.normal(20, 10, n),
        "hour_of_day": np.random.randint(0, 24, n),
        "day_of_week": np.random.randint(0, 7, n),
        "cpu_usage_pct": np.random.normal(70, 15, n),
        "ram_usage_pct": np.random.normal(80, 10, n),
        "network_latency_ms": np.random.normal(300, 40, n),
        "num_active_sessions": np.random.poisson(5, n),
        "past_fraud_attempts": np.random.poisson(2, n),
        "velocity_login_per_hr": np.random.normal(20, 5, n),
        "avg_transaction_value": np.random.normal(5000, 1000, n),
        "risk_from_history": np.random.normal(0.8, 0.1, n),
    }

# ---------------------------------------------
# 2. CREATE DATASET
# ---------------------------------------------
num_fraud = int(NUM_RECORDS * FRAUD_RATE)
num_normal = NUM_RECORDS - num_fraud

normal_data = generate_normal_behavior(num_normal)
fraud_data = generate_fraud_behavior(num_fraud)

df_normal = pd.DataFrame(normal_data)
df_normal["is_fraud"] = 0

df_fraud = pd.DataFrame(fraud_data)
df_fraud["is_fraud"] = 1

df = pd.concat([df_normal, df_fraud], ignore_index=True)
df = df.sample(frac=1).reset_index(drop=True)

# ---------------------------------------------
# 3. ADD CUSTOMER_ID (CN101 onward) + SESSION_ID
# ---------------------------------------------
unique_users = 2000

# Create IDs CN101 → CN(100 + unique_users)
customer_ids = [f"CN{100 + i}" for i in range(1, unique_users + 1)]

df["user_id"] = np.random.choice(customer_ids, size=len(df))

# Fraud clustering: fraud users chosen from this ID list
fraud_users = np.random.choice(customer_ids, size=30)
df.loc[df["is_fraud"] == 1, "user_id"] = np.random.choice(fraud_users, size=num_fraud)

# Add session_id (UUID)
df["session_id"] = [str(uuid4()) for _ in range(len(df))]

# ---------------------------------------------
# 4. SCALING (EXCLUDE IDs + label)
# ---------------------------------------------
feature_cols = [c for c in df.columns if c not in ["is_fraud", "user_id", "session_id"]]

scaler = MinMaxScaler()
df[feature_cols] = scaler.fit_transform(df[feature_cols])

# ---------------------------------------------
# 5. SAVE OUTPUTS
# ---------------------------------------------
df.to_csv(os.path.join(DATA_DIR, "full_dataset.csv"), index=False)

with open(os.path.join(DATA_DIR, "scaler_params.pkl"), "wb") as f:
    pickle.dump({
        "scaler": scaler,
        "feature_order": feature_cols
    }, f)

print("Generated behavioural dataset (CN101 user IDs + sessions)!")


Generated behavioural dataset (CN101 user IDs + sessions)!


In [18]:
# train_and_deploy_8step.py
import os
import shutil
import json
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, Sequential

# ---------------- CONFIG ----------------
DATA_DIR = "data"
os.makedirs(DATA_DIR, exist_ok=True)

SEQUENCE_LENGTH = 8   # <-- reduced from 64
TIME_STEPS = SEQUENCE_LENGTH
SCALER_PARAMS_FILE = os.path.join(DATA_DIR, "scaler_params.pkl")
MODEL_PATH_DIR = os.path.join(DATA_DIR, "lstm_autoencoder_savedmodel")
THRESHOLD_PATH = os.path.join(DATA_DIR, "threshold.json")
FULL_DATA_CSV = os.path.join(DATA_DIR, "full_dataset.csv")

# ---------------- LOAD DATA & ARTIFACTS ----------------
print("--- Loading data ---")
if not os.path.exists(FULL_DATA_CSV):
    raise FileNotFoundError(f"{FULL_DATA_CSV} not found. Run generate_data.py first.")

df = pd.read_csv(FULL_DATA_CSV)
non_feature_cols = {"user_id", "session_id", "is_fraud", "event_time", "timestamp"}
feature_cols = [c for c in df.columns if c not in non_feature_cols]

if "is_fraud" not in df.columns:
    raise ValueError("full_dataset.csv must contain an 'is_fraud' column")

N_FEATURES = len(feature_cols)
print(f"Detected {N_FEATURES} feature columns (using these):\n{feature_cols}")

X_full = df[feature_cols].values
y_full = df["is_fraud"].astype(int).values

# ---------------- CREATE SEQUENCES ----------------
def create_sequences(X, y, seq_length=TIME_STEPS):
    seqs = []
    labels = []
    n = len(X)
    if n < seq_length:
        raise ValueError(f"Not enough rows to create a single sequence: {n} rows < seq_length {seq_length}")
    for i in range(n - seq_length + 1):
        seqs.append(X[i:i + seq_length])
        labels.append(y[i + seq_length - 1])
    return np.array(seqs), np.array(labels)

X_seq, y_seq = create_sequences(X_full, y_full, SEQUENCE_LENGTH)
X_train = X_seq[y_seq == 0]  # train only on normal sequences

print(f"Sequences: X_seq.shape={X_seq.shape}, X_train.shape={X_train.shape}, y_seq distribution: {np.bincount(y_seq)}")

# ---------------- BUILD MODEL ----------------
print("\n--- Building LSTM autoencoder ---")
model = Sequential([
    tf.keras.Input(shape=(TIME_STEPS, N_FEATURES)),
    layers.LSTM(128, activation="tanh", return_sequences=False),
    layers.RepeatVector(TIME_STEPS),
    layers.LSTM(128, activation="tanh", return_sequences=True),
    layers.TimeDistributed(layers.Dense(N_FEATURES))  # linear output for scaled inputs
])

model.compile(optimizer="adam", loss="mse")
model.summary()

# ---------------- TRAIN ----------------
EPOCHS = 20
BATCH_SIZE = 128
print(f"\n--- Training ({EPOCHS} epochs, batch_size={BATCH_SIZE}) ---")

history = model.fit(
    X_train, X_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
    shuffle=True,
    verbose=1
)

# ---------------- THRESHOLD ----------------
print("\n--- Calculating anomaly threshold ---")
recons = model.predict(X_seq)
mse_array = ((X_seq - recons) ** 2).mean(axis=(1, 2))
threshold = float(np.percentile(mse_array[y_seq == 0], 99.5))
print(f"Calculated threshold (99.5 percentile normal): {threshold:.8f}")




--- Loading data ---
Detected 24 feature columns (using these):
['time_since_last_login', 'failed_attempts', 'session_length_sec', 'ip_risk_score', 'device_change_flag', 'browser_change_flag', 'click_rate', 'scroll_depth', 'typing_speed_wpm', 'distance_from_last_location_km', 'country_risk_score', 'pressure_variation', 'touch_speed', 'mouse_travel_distance', 'hour_of_day', 'day_of_week', 'cpu_usage_pct', 'ram_usage_pct', 'network_latency_ms', 'num_active_sessions', 'past_fraud_attempts', 'velocity_login_per_hr', 'avg_transaction_value', 'risk_from_history']
Sequences: X_seq.shape=(19993, 8, 24), X_train.shape=(19394, 8, 24), y_seq distribution: [19394   599]

--- Building LSTM autoencoder ---



--- Training (20 epochs, batch_size=128) ---
Epoch 1/20
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 46ms/step - loss: 0.0347 - val_loss: 0.0202
Epoch 2/20
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 39ms/step - loss: 0.0193 - val_loss: 0.0190
Epoch 3/20
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 39ms/step - loss: 0.0181 - val_loss: 0.0171
Epoch 4/20
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 43ms/step - loss: 0.0163 - val_loss: 0.0161
Epoch 5/20
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 39ms/step - loss: 0.0148 - val_loss: 0.0139
Epoch 6/20
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 39ms/step - loss: 0.0136 - val_loss: 0.0134
Epoch 7/20
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 39ms/step - loss: 0.0130 - val_loss: 0.0133
Epoch 8/20
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 39ms/step - loss: 0.0126 - va

In [19]:
# ---------------- EXPORT MODEL & SAVE ARTIFACTS ----------------
print("\n--- Saving artifacts ---")
# remove old export if exists
if os.path.exists(MODEL_PATH_DIR):
    shutil.rmtree(MODEL_PATH_DIR)

# Keras 3 SavedModel export
model.export(MODEL_PATH_DIR)

# Save threshold
with open(THRESHOLD_PATH, "w") as f:
    json.dump({"threshold": threshold}, f)

# If scaler exists in file (from generate_data.py), preserve it; else make simple Min/Max from data
if os.path.exists(SCALER_PARAMS_FILE):
    with open(SCALER_PARAMS_FILE, "rb") as f:
        scaler_blob = pickle.load(f)
    # ensure feature_order saved matches detected feature_cols
    scaler_blob["feature_order"] = feature_cols
    with open(SCALER_PARAMS_FILE, "wb") as f:
        pickle.dump(scaler_blob, f)
    print(f"Updated existing scaler artifact at {SCALER_PARAMS_FILE}")
else:
    # compute min/max from training data (useful fallback for inference)
    data_min = np.min(X_full, axis=0).tolist()
    data_max = np.max(X_full, axis=0).tolist()
    scaler_blob = {"feature_order": feature_cols, "min": data_min, "max": data_max}
    with open(SCALER_PARAMS_FILE, "wb") as f:
        pickle.dump(scaler_blob, f)
    print(f"Saved fallback scaler params at {SCALER_PARAMS_FILE}")

print("\n✅ Export complete.")
print(f"- Model saved to: {MODEL_PATH_DIR}")
print(f"- Threshold saved to: {THRESHOLD_PATH}")
print(f"- Scaler params saved to: {SCALER_PARAMS_FILE}")



--- Saving artifacts ---
Saved artifact at 'data/lstm_autoencoder_savedmodel'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 8, 24), dtype=tf.float32, name='keras_tensor_10')
Output Type:
  TensorSpec(shape=(None, 8, 24), dtype=tf.float32, name=None)
Captures:
  135969493803728: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135969493803152: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135969493804496: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135969493805456: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135969493806608: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135969493804112: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135969493803920: TensorSpec(shape=(), dtype=tf.resource, name=None)
  135969493806032: TensorSpec(shape=(), dtype=tf.resource, name=None)
Updated existing scaler artifact at data/scaler_params.pkl

✅ Export complete.
- Model saved to: data/lstm_autoen

In [20]:
!gsutil -m cp -r data/lstm_autoencoder_savedmodel gs://account_takeover_model/


Copying file://data/lstm_autoencoder_savedmodel/fingerprint.pb [Content-Type=application/octet-stream]...
/ [0/4 files][    0.0 B/  1.8 MiB]   0% Done                                    Copying file://data/lstm_autoencoder_savedmodel/saved_model.pb [Content-Type=application/octet-stream]...
/ [0/4 files][    0.0 B/  1.8 MiB]   0% Done                                    Copying file://data/lstm_autoencoder_savedmodel/variables/variables.data-00000-of-00001 [Content-Type=application/octet-stream]...
Copying file://data/lstm_autoencoder_savedmodel/variables/variables.index [Content-Type=application/octet-stream]...
/ [4/4 files][  1.8 MiB/  1.8 MiB] 100% Done                                    
Operation completed over 4 objects/1.8 MiB.                                      


In [21]:
from google.cloud import aiplatform

PROJECT_ID = "liquid-anchor-478906-e3"  # replace with your project
REGION = "us-central1"

aiplatform.init(project=PROJECT_ID, location=REGION)


In [22]:
gcs_model_path = "gs://account_takeover_model/lstm_autoencoder_savedmodel/"

model = aiplatform.Model.upload(
    display_name="hackai_ato_defence_model_v3",
    artifact_uri=gcs_model_path,  # must point to the SavedModel folder
    serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-11:latest"
)


In [23]:
endpoint = model.deploy(
    machine_type="n1-standard-4",
    min_replica_count=1,
    max_replica_count=1
)

In [None]:
from google.cloud import bigquery
import numpy as np
import pandas as pd
from uuid import uuid4

# ---------------- CONFIG ----------------
PROJECT = "liquid-anchor-478906-e3"
DATASET = "ato_historical_Data"
TABLE = "customer_footprint"
USER_ID = "CN1001"
HIST_LIMIT = 64
MIN_HISTORY = 20  # Minimum records for personal history

client = bigquery.Client(project=PROJECT)

# ---------------- 1. Pull last HIST_LIMIT behavioural records for the user ----------------
query_user = f"""
SELECT *
FROM `{PROJECT}.{DATASET}.{TABLE}`
WHERE user_id = '{USER_ID}'
ORDER BY day_of_week DESC, hour_of_day DESC
LIMIT {HIST_LIMIT}
"""
df_hist = client.query(query_user).to_dataframe()

# ---------------- 2. Identify feature columns ----------------
non_features = {"user_id", "session_id", "is_fraud"}
feature_cols = [c for c in df_hist.columns if c not in non_features]

# ---------------- 3. Decide whether to use user history or global stats ----------------
if df_hist.empty or len(df_hist) < MIN_HISTORY:
    print(f"Not enough history for {USER_ID} ({len(df_hist)} records). Using global stats.")

    query_global = f"""
    SELECT *
    FROM `{PROJECT}.{DATASET}.{TABLE}`
    LIMIT 5000
    """
    df_global = client.query(query_global).to_dataframe()
    X = df_global[feature_cols].astype(float).values
    user_mean = X.mean(axis=0)
    user_std = X.std(axis=0) + 1e-6
else:
    X = df_hist[feature_cols].astype(float).values
    user_mean = X.mean(axis=0)
    user_std = X.std(axis=0) + 1e-6

# ---------------- 4. Create anomalous (fraudulent) behaviour vector ----------------
deviation_factor = np.random.uniform(3, 8)  # strong fraud deviation
fraud_vector = user_mean + deviation_factor * user_std

# Clip values to reasonable ranges
fraud_vector = np.clip(fraud_vector, -5, 10)

# ---------------- 5. Construct final fraud row ----------------
fraud_row = pd.DataFrame([{
    **{col: fraud_vector[i] for i, col in enumerate(feature_cols)},
    "user_id": USER_ID,
    "session_id": str(uuid4()),
    "is_fraud": 1
}])

print("\nGenerated Fraudulent Record:")
print(fraud_row)

# ---------------- 6. Optional: Insert into BigQuery ----------------
# client.insert_rows_json(f"{PROJECT}.{DATASET}.{TABLE}", fraud_row.to_dict(orient="records"))
# print("Fraud record inserted into BigQuery")


Not enough history for CN1001 (15 records). Using global stats.

Generated Fraudulent Record:
   time_since_last_login  failed_attempts  session_length_sec  ip_risk_score  \
0               0.603166         0.201341            0.998341       0.314821   

   device_change_flag  browser_change_flag  click_rate  scroll_depth  \
0            0.862705              0.71925     0.25356       0.96305   

   typing_speed_wpm  distance_from_last_location_km  ...  ram_usage_pct  \
0          0.296075                        0.006139  ...       0.403951   

   network_latency_ms  num_active_sessions  past_fraud_attempts  \
0            0.185664             0.000004             0.176167   

   velocity_login_per_hr  avg_transaction_value  risk_from_history  user_id  \
0               0.214789               0.197129           0.216808   CN1001   

                             session_id  is_fraud  
0  3d7ada03-66cf-455e-b425-99398576effb         1  

[1 rows x 27 columns]
