In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf

# Locations of the four bank-statement CSV exports
csv_paths = [
    '/content/bank_data_1.csv',
    '/content/bank_data_2.csv',
    '/content/bank_data_3.csv',
    '/content/bank_data_4.csv',
]

# Read each file into its own DataFrame, in the same order as csv_paths
dfs = list(map(pd.read_csv, csv_paths))

# Define a function for preprocessing
def preprocess_data(df):
    """Turn one raw bank-statement DataFrame into an all-numeric, scaled frame.

    Steps, in order:
      1. Drop the cheque-number column and the unnamed '.' column.
      2. Label-encode 'Account No' and 'TRANSACTION DETAILS'.
      3. Replace 'DATE' and 'VALUE DATE' with the number of days since the
         earliest date found in that same column.
      4. Fill every remaining missing value with 0.
      5. Cast the date and amount columns to float32 and standardize them.

    The encoders and the scaler are fitted on ``df`` itself, so the integer
    codes and the scaling are specific to the frame passed in.

    Args:
        df: Raw transactions containing the columns 'Account No', 'DATE',
            'TRANSACTION DETAILS', 'VALUE DATE', 'WITHDRAWAL AMT',
            'DEPOSIT AMT' and 'BALANCE AMT'.

    Returns:
        A new DataFrame with the same rows, the two irrelevant columns
        removed, and every listed column converted to numbers.
    """
    # Dropping irrelevant columns (like CHQ.NO. and the unnamed one); a file
    # that lacks one of them simply has nothing to drop.
    df = df.drop(columns=['CHQ.NO.', '.'], errors='ignore')

    # Convert categorical columns to numeric codes. Missing values are handled
    # per column type *before* encoding: zero-filling a text column would mix
    # an int 0 with strings, which LabelEncoder refuses to sort.
    for column in ('Account No', 'TRANSACTION DETAILS'):
        values = df[column]
        if pd.api.types.is_numeric_dtype(values):
            values = values.fillna(0)
        else:
            # Missing text becomes its own '' category.
            values = values.fillna('').astype(str)
        df[column] = LabelEncoder().fit_transform(values)

    # Convert dates to numeric (days since the earliest date in the column).
    # Parsing happens before any zero-fill so a missing date stays NaT here
    # instead of being turned into 0 and parsed as a bogus timestamp, which
    # would drag the column minimum (and every offset) with it.
    for column in ('DATE', 'VALUE DATE'):
        timestamps = pd.to_datetime(df[column])
        df[column] = (timestamps - timestamps.min()).dt.days

    # Handle remaining missing values: amounts with no transaction, date
    # offsets of rows whose date was missing, and any other column.
    df = df.fillna(0)

    # Ensure all columns are float32
    float_columns = ['DATE', 'VALUE DATE', 'WITHDRAWAL AMT', 'DEPOSIT AMT', 'BALANCE AMT']
    df[float_columns] = df[float_columns].astype('float32')

    # Normalize numerical columns (the label-encoded columns are left as-is)
    scaler = StandardScaler()
    df[float_columns] = scaler.fit_transform(df[float_columns])

    return df

# Run the preprocessing pipeline over every loaded dataset, keeping their order
processed_dfs = list(map(preprocess_data, dfs))

# Define a simple feedforward neural network model
def create_model(input_shape):
    """Build a small feed-forward regression network.

    Architecture: Dense(64, relu) -> Dense(32, relu) -> Dense(1, linear).

    Args:
        input_shape: Number of input features per sample (a single int; it
            is wrapped into a one-element shape tuple here).

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    return tf.keras.Sequential([
        # Declare the input with an explicit Input object rather than passing
        # input_shape to the first Dense layer; Keras warns about the latter.
        tf.keras.Input(shape=(input_shape,)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1, activation='linear'),
    ])

# Prepare data for training (features and target)
def prepare_features_and_target(df):
    """Split a preprocessed frame into a feature matrix and a target vector.

    Args:
        df: DataFrame that contains a 'BALANCE AMT' column plus the feature
            columns. It is not modified.

    Returns:
        A ``(X, y)`` tuple of float32 NumPy arrays: ``X`` holds every column
        except 'BALANCE AMT', ``y`` holds the 'BALANCE AMT' values.
    """
    feature_frame = df.drop(columns=['BALANCE AMT'])
    target_series = df['BALANCE AMT']

    X = feature_frame.values.astype('float32')
    y = target_series.values.astype('float32')
    return X, y

# Build the network; its input width is the number of non-target columns
# in the first preprocessed dataset
first_dataset_features = processed_dfs[0].drop(columns=['BALANCE AMT'])
input_shape = first_dataset_features.shape[1]
model = create_model(input_shape)

# Regression objective: mean squared error between target and prediction
loss_fn = tf.keras.losses.MeanSquaredError()

# Adam optimizer, used for every update applied to the model
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

# Training step function
@tf.function
def train_step(features, labels):
    """Compute the loss and its gradients for one batch.

    Runs the module-level ``model`` forward in training mode, scores the
    output with the module-level ``loss_fn``, and differentiates the loss
    with respect to the model's trainable variables. No weights are updated
    here; applying the gradients is left to the caller.

    Args:
        features: Batch of input features.
        labels: Batch of target values.

    Returns:
        A ``(loss, gradients)`` tuple, where ``gradients`` has one entry per
        entry of ``model.trainable_variables``.
    """
    with tf.GradientTape() as recorder:
        batch_loss = loss_fn(labels, model(features, training=True))
    return batch_loss, recorder.gradient(batch_loss, model.trainable_variables)

# Federated Learning simulation
num_rounds = 5  # Number of federated learning rounds
epochs_per_round = 5  # Epochs each dataset is trained for within one round
batch_size = 32
all_gradients = []  # Indexed as [round][dataset][epoch] -> list of gradients

# NOTE(review): every dataset updates the same shared model and optimizer in
# sequence; there are no per-client model copies and no aggregation step.

# The split uses a fixed random_state, so it is identical in every round.
# Build each dataset's training batches and validation arrays once up front
# instead of recomputing them inside the round loop.
client_data = []
for df in processed_dfs:
    # Split the data into train and validation sets
    train_data, val_data = train_test_split(df, test_size=0.2, random_state=42)

    # Prepare features and target
    X_train, y_train = prepare_features_and_target(train_data)
    X_val, y_val = prepare_features_and_target(val_data)

    # Convert to a batched TensorFlow dataset
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size)
    client_data.append((train_dataset, X_val, y_val))

# `round_idx` rather than `round`, so the builtin round() is not shadowed
for round_idx in range(num_rounds):
    print(f"Round {round_idx + 1}/{num_rounds}")
    round_gradients = []

    # Iterate through each dataset (simulating different clients)
    for i, (train_dataset, X_val, y_val) in enumerate(client_data):
        print(f"Training on dataset {i + 1}")

        # Train the model and collect gradients
        epoch_gradients = []
        for epoch in range(epochs_per_round):
            for step, (x_batch, y_batch) in enumerate(train_dataset):
                loss, gradients = train_step(x_batch, y_batch)
                optimizer.apply_gradients(zip(gradients, model.trainable_variables))
                if step == 0:  # Only store gradients from the first batch for simplicity
                    epoch_gradients.append(gradients)

            # Validate
            val_loss = loss_fn(y_val, model(X_val))
            print(f"Epoch {epoch + 1}, Validation Loss: {val_loss:.4f}")

        round_gradients.append(epoch_gradients)

    all_gradients.append(round_gradients)
    print("----------------------")

# Turn every stored gradient tensor into a NumPy array while keeping the
# round -> dataset -> epoch -> variable nesting of all_gradients intact
numpy_gradients = [
    [
        [
            [variable_grad.numpy() for variable_grad in epoch_grad]
            for epoch_grad in client_grad
        ]
        for client_grad in round_grad
    ]
    for round_grad in all_gradients
]

# Example: shape of the first stored gradient (first round, first dataset,
# first epoch, first trainable variable)
first_gradient = numpy_gradients[0][0][0][0]
print("Shape of gradients for first layer:", first_gradient.shape)

# Final evaluation: loss of the trained model over each full dataset
for i, df in enumerate(processed_dfs):
    X, y = prepare_features_and_target(df)
    predictions = model(X)
    loss = loss_fn(y, predictions)
    print(f"Final loss on dataset {i + 1}: {loss:.4f}")

# Print the layer-by-layer model summary
model.summary()

# Leave the gradients as the cell's value so they can be used in the GAN
numpy_gradients

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Round 1/5
Training on dataset 1
Epoch 1, Validation Loss: 95.5821
Epoch 2, Validation Loss: 38.8670
Epoch 3, Validation Loss: 55.6756
Epoch 4, Validation Loss: 19.4040
Epoch 5, Validation Loss: 8.4297
Training on dataset 2
Epoch 1, Validation Loss: 10.6102
Epoch 2, Validation Loss: 6.4722
Epoch 3, Validation Loss: 1.7795
Epoch 4, Validation Loss: 2.0666
Epoch 5, Validation Loss: 1.5314
Training on dataset 3
Epoch 1, Validation Loss: 1.3185
Epoch 2, Validation Loss: 1.5813
Epoch 3, Validation Loss: 1.2607
Epoch 4, Validation Loss: 1.1431
Epoch 5, Validation Loss: 1.1101
Training on dataset 4
Epoch 1, Validation Loss: 1.0724
Epoch 2, Validation Loss: 1.0824
Epoch 3, Validation Loss: 1.0449
Epoch 4, Validation Loss: 1.0436
Epoch 5, Validation Loss: 1.0545
----------------------
Round 2/5
Training on dataset 1
Epoch 1, Validation Loss: 1.0774
Epoch 2, Validation Loss: 1.0607
Epoch 3, Validation Loss: 1.0446
Epoch 4, Validation Loss: 1.0488
Epoch 5, Validation Loss: 1.0542
Training on datas

[[[[array([[ 7.40692258e-01, -3.83418007e+01, -1.98233490e+01,
            -4.13811646e+01,  0.00000000e+00,  0.00000000e+00,
             4.96697937e+02,  0.00000000e+00,  1.01314691e+03,
            -5.41331196e+00, -1.46757294e+02,  9.55443497e+01,
             0.00000000e+00,  5.30924011e+02, -2.27218048e+02,
             1.16018311e+02,  3.19708771e+02,  6.63595533e+00,
            -3.71888641e+02,  2.65335480e+02,  4.65958496e+02,
             2.23411026e+01,  6.97988770e+02,  2.14809685e+01,
             0.00000000e+00, -3.02155933e+01,  4.50097542e+01,
            -4.46951675e+01,  1.92726379e+02, -3.63305481e+02,
             6.36986511e+02,  5.70401688e+01, -4.52034729e+02,
             0.00000000e+00, -2.41321144e+01,  1.80843948e+02,
            -2.18262463e+01,  4.41258240e+02,  7.02065430e+01,
             1.42697363e+01,  7.44228027e+02,  3.48668762e+02,
             1.46995468e+02, -6.84745073e-01,  7.88304090e+00,
             0.00000000e+00,  1.99041538e+01,  0.000000