In [None]:
pip install torch torch-geometric scikit-learn pandas networkx

Collecting torch-scatter
  Using cached torch_scatter-2.1.2.tar.gz (108 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch-sparse
  Using cached torch_sparse-0.6.18.tar.gz (209 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: torch-scatter, torch-sparse
[31mERROR: Operation cancelled by user[0m[31m
[0m  Building wheel for torch-scatter (setup.py) ... [?25l[?25h

In [None]:
pip install torch-geometric torch-scatter torch-sparse

Collecting torch-scatter
  Using cached torch_scatter-2.1.2.tar.gz (108 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch-sparse
  Using cached torch_sparse-0.6.18.tar.gz (209 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: torch-scatter, torch-sparse
  Building wheel for torch-scatter (setup.py) ... [?25l[?25hdone
  Created wheel for torch-scatter: filename=torch_scatter-2.1.2-cp311-cp311-linux_x86_64.whl size=547368 sha256=7ca841951858fb1df5e592c05ccd68fb39896bafc0e2ce341888fd4723d5daa9
  Stored in directory: /root/.cache/pip/wheels/b8/d4/0e/a80af2465354ea7355a2c153b11af2da739cfcf08b6c0b28e2
  Building wheel for torch-sparse (setup.py) ... [?25l[?25hdone
  Created wheel for torch-sparse: filename=torch_sparse-0.6.18-cp311-cp311-linux_x86_64.whl size=1127937 sha256=598ce6300e795e9e617bc01b7193e098e36a926d1c2ab559e78a68749e7d368f
  Stored in directory: /root/.cache/pip/wheels/75/e2/1e/299c596063839303657c211f5

In [None]:
import pandas as pd
import torch
from torch_geometric.data import Data
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

from google.colab import drive
drive.mount('/content/drive')

# Load data
df = pd.read_excel("/content/drive/MyDrive/Malware/Output1.xlsx")

# Extract features and labels
features = df.iloc[:, 1:].values
filenames = df['Filename']

# Encode labels (e.g., 'Spyware-TIBS' -> 'Spyware')
labels = filenames.str.extract(r'(\w+)-')[0]
le = LabelEncoder()
y = le.fit_transform(labels)

# Scale features
scaler = StandardScaler()
features = scaler.fit_transform(features)

# Create edges based on similarity
sim_matrix = cosine_similarity(features)
k = 5  # k nearest neighbors

# Creating time windows of 10 rows each (85 windows for 851 rows)
window_size = 10
num_windows = len(df) // window_size  # Total windows = 851 / 10 = 85

# Create a writer object to write data to a single Excel file
with pd.ExcelWriter('/content/drive/MyDrive/Malware/all_windows_data.xlsx') as writer:
    # Store data for each time window in a list
    for window in range(num_windows):
        start_idx = window * window_size
        end_idx = (window + 1) * window_size

        # Extract features for the current window
        window_features = features[start_idx:end_idx]
        window_labels = y[start_idx:end_idx]

        # Create a similarity matrix for this window
        sim_matrix_window = cosine_similarity(window_features)

        # Generate edges for this window based on similarity
        edge_index_window = []
        for i in range(len(sim_matrix_window)):
            top_k = np.argsort(sim_matrix_window[i])[-(k+1):-1]  # Skip self-loop
            for j in top_k:
                edge_index_window.append([i, j])

        # Convert to tensor
        edge_index_window = torch.tensor(edge_index_window, dtype=torch.long).t().contiguous()
        x_window = torch.tensor(window_features, dtype=torch.float)
        y_window = torch.tensor(window_labels, dtype=torch.long)

        # Create PyG Data object for this window
        data_window = Data(x=x_window, edge_index=edge_index_window, y=y_window)

        # Convert x (features) to DataFrame and save to Excel (one sheet per window)
        df_x = pd.DataFrame(x_window.numpy())  # Convert tensor to numpy and then to DataFrame
        df_x.to_excel(writer, sheet_name=f'Window_{window}_Features', index=False)

        # Convert edge_index (edges) to DataFrame (source, target) and save to Excel
        df_edges = pd.DataFrame(edge_index_window.numpy().T, columns=["Source", "Target"])
        df_edges.to_excel(writer, sheet_name=f'Window_{window}_Edges', index=False)

        # Convert y (labels) to DataFrame and save to Excel
        df_y = pd.DataFrame(y_window.numpy(), columns=["Labels"])
        df_y.to_excel(writer, sheet_name=f'Window_{window}_Labels', index=False)

# The Excel file will now contain multiple sheets: one for each window's features, edges, and labels


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


 Step 1: Dynamic Graph Learning with GraphSAGE

In [11]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from torch_geometric.loader import DataLoader
from torch_geometric.data import Data

# Sample GraphSAGE Model
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return x


 Train over multiple windows

In [14]:
from sklearn.model_selection import train_test_split
import torch
from torch_geometric.nn import GraphSAGE  # Make sure GraphSAGE is imported
import torch.nn.functional as F

# Set device to CPU
device = torch.device("cpu")

# Set model parameters
in_channels = data_list[0].num_node_features
hidden_channels = 32
out_channels = len(set(y))  # number of unique labels

# Define the model
model = GraphSAGE(in_channels, hidden_channels, out_channels).to(device)

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()

# Train model on each window
model.train()
for epoch in range(20):  # number of training epochs
    total_loss = 0
    for data in data_list:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)
        loss = loss_fn(out, data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 215.1897
Epoch 2, Loss: 397.1544
Epoch 3, Loss: 290.2010
Epoch 4, Loss: 195.0410
Epoch 5, Loss: 124.2890
Epoch 6, Loss: 75.4146
Epoch 7, Loss: 47.2189
Epoch 8, Loss: 34.1043
Epoch 9, Loss: 15.3015
Epoch 10, Loss: 3.6751
Epoch 11, Loss: 1.8512
Epoch 12, Loss: 1.5489
Epoch 13, Loss: 1.3789
Epoch 14, Loss: 1.2416
Epoch 15, Loss: 1.1266
Epoch 16, Loss: 1.0279
Epoch 17, Loss: 0.9422
Epoch 18, Loss: 0.8668
Epoch 19, Loss: 0.8000
Epoch 20, Loss: 0.7404


GraphSAGE is imported:

In [15]:
from torch_geometric.nn import GraphSAGE

Basic Accuracy

In [16]:
# Evaluate on all windows using CPU
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for data in data_list:
        data = data.to("cpu")  # Set device to CPU
        out = model(data.x, data.edge_index)
        pred = out.argmax(dim=1)
        correct += (pred == data.y).sum().item()
        total += data.num_nodes

print(f"Accuracy: {correct / total * 100:.2f}%")


Accuracy: 100.00%


Causal Sampling (Temporal Neighbor Selection)

Updated Training Loop with Causal Sampling:

In [17]:
from torch_geometric.nn import GraphSAGE
import torch
import torch.nn.functional as F

# Re-initialize the model for causal training
in_channels = data_list[0].num_node_features
hidden_channels = 32
out_channels = len(set(y))
model = GraphSAGE(in_channels, hidden_channels, out_channels).to("cpu")

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()

# Causal training: Only train on current and previous windows
model.train()
for epoch in range(20):  # Number of epochs
    total_loss = 0
    for t in range(1, len(data_list)):  # Start from t=1 to allow causal (t-1)
        optimizer.zero_grad()

        # Simulate causal memory: train on window t-1
        past_data = data_list[t - 1].to("cpu")
        current_data = data_list[t].to("cpu")

        # Forward on past data
        out_past = model(past_data.x, past_data.edge_index)
        loss_past = loss_fn(out_past, past_data.y)

        # Forward on current data
        out_current = model(current_data.x, current_data.edge_index)
        loss_current = loss_fn(out_current, current_data.y)

        # Combined loss
        loss = loss_past + loss_current
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Causal Loss: {total_loss:.4f}")


Epoch 1, Causal Loss: 412.9249
Epoch 2, Causal Loss: 815.9091
Epoch 3, Causal Loss: 598.9709
Epoch 4, Causal Loss: 407.2735
Epoch 5, Causal Loss: 260.7712
Epoch 6, Causal Loss: 163.1069
Epoch 7, Causal Loss: 97.0076
Epoch 8, Causal Loss: 66.9272
Epoch 9, Causal Loss: 31.4280
Epoch 10, Causal Loss: 7.5416
Epoch 11, Causal Loss: 3.5096
Epoch 12, Causal Loss: 2.8721
Epoch 13, Causal Loss: 2.5626
Epoch 14, Causal Loss: 2.3151
Epoch 15, Causal Loss: 2.1079
Epoch 16, Causal Loss: 1.9299
Epoch 17, Causal Loss: 1.7746
Epoch 18, Causal Loss: 1.6376
Epoch 19, Causal Loss: 1.5156
Epoch 20, Causal Loss: 1.4063


Evaluation After Causal Training:

In [20]:
# Evaluate only on the last window (simulate unseen future)
model.eval()
correct = 0
total = 0

with torch.no_grad():
    eval_data = data_list[-1].to("cpu")  # Last window
    out = model(eval_data.x, eval_data.edge_index)
    pred = out.argmax(dim=1)
    correct += (pred == eval_data.y).sum().item()
    total += eval_data.num_nodes

print(f"Causal Evaluation Accuracy (on last window): {correct / total * 100:.2f}%")


Causal Evaluation Accuracy (on last window): 100.00%


Inject 10–20% Noise for Robustness Testing

We’ll inject:

Label noise: randomly flip labels for 10–20% of nodes.

Feature noise: add small Gaussian noise to features for 10–20% of nodes.

In [26]:
import copy
import random
import torch

def inject_noise(data_list, label_noise_ratio=0.1, feature_noise_ratio=0.1):
    noisy_data_list = []

    for data in data_list:
        data_noisy = copy.deepcopy(data)

        # Inject label noise
        num_nodes = data_noisy.y.shape[0]
        num_label_noise = int(label_noise_ratio * num_nodes)
        noisy_label_indices = random.sample(range(num_nodes), num_label_noise)

        for idx in noisy_label_indices:
            original_label = data_noisy.y[idx].item()
            possible_labels = list(set(data_noisy.y.tolist()))
            if len(possible_labels) <= 1:
                continue
            possible_labels.remove(original_label)
            if not possible_labels:
                continue
            new_label = random.choice(possible_labels)
            data_noisy.y[idx] = new_label

        # Inject feature noise
        num_feature_noise = int(feature_noise_ratio * num_nodes)
        noisy_feature_indices = random.sample(range(num_nodes), num_feature_noise)

        for idx in noisy_feature_indices:
            noise = torch.randn_like(data_noisy.x[idx]) * 0.1  # Adjust noise level if needed
            data_noisy.x[idx] += noise

        noisy_data_list.append(data_noisy)

    return noisy_data_list

# ✅ Apply noise
noisy_data_list = inject_noise(data_list, label_noise_ratio=0.1, feature_noise_ratio=0.1)
# Add time_window attribute to each data object
for t, data in enumerate(noisy_data_list):
    data.time_window = torch.full((data.num_nodes,), t, dtype=torch.long)


Modify GraphSAGE to Handle Causal Sampling:

In [27]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

class CausalGraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(CausalGraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index, time_window):
        # Custom causal neighbor sampling by excluding edges based on time window
        row, col = edge_index

        # Filter edges based on the time window (ensure causal sampling)
        mask = time_window[row] <= time_window[col]
        edge_index = edge_index[:, mask]

        # Perform GraphSAGE forward pass using filtered edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x


Training Loop with Causal Training:

In [28]:
# Re-initialize the model
model = CausalGraphSAGE(in_channels, hidden_channels, out_channels).to("cpu")
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()

# Causal training on noisy data
model.train()
for epoch in range(20):
    total_loss = 0
    for t in range(1, len(noisy_data_list)):
        optimizer.zero_grad()
        past_data = noisy_data_list[t - 1].to("cpu")
        current_data = noisy_data_list[t].to("cpu")

        # Pass time_window as an argument to the model
        out_past = model(past_data.x, past_data.edge_index, past_data.time_window)
        loss_past = loss_fn(out_past, past_data.y)

        out_current = model(current_data.x, current_data.edge_index, current_data.time_window)
        loss_current = loss_fn(out_current, current_data.y)

        loss = loss_past + loss_current
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"[Noise] Epoch {epoch+1}, Causal Loss: {total_loss:.4f}")


[Noise] Epoch 1, Causal Loss: 0.0000
[Noise] Epoch 2, Causal Loss: 0.0000
[Noise] Epoch 3, Causal Loss: 0.0000
[Noise] Epoch 4, Causal Loss: 0.0000
[Noise] Epoch 5, Causal Loss: 0.0000
[Noise] Epoch 6, Causal Loss: 0.0000
[Noise] Epoch 7, Causal Loss: 0.0000
[Noise] Epoch 8, Causal Loss: 0.0000
[Noise] Epoch 9, Causal Loss: 0.0000
[Noise] Epoch 10, Causal Loss: 0.0000
[Noise] Epoch 11, Causal Loss: 0.0000
[Noise] Epoch 12, Causal Loss: 0.0000
[Noise] Epoch 13, Causal Loss: 0.0000
[Noise] Epoch 14, Causal Loss: 0.0000
[Noise] Epoch 15, Causal Loss: 0.0000
[Noise] Epoch 16, Causal Loss: 0.0000
[Noise] Epoch 17, Causal Loss: 0.0000
[Noise] Epoch 18, Causal Loss: 0.0000
[Noise] Epoch 19, Causal Loss: 0.0000
[Noise] Epoch 20, Causal Loss: 0.0000


How to Verify:
After you inject noise and assign the time_window to each data object:

In [29]:
# Add time_window attribute to each data object
for t, data in enumerate(noisy_data_list):
    data.time_window = torch.full((data.num_nodes,), t, dtype=torch.long)

# Verify
for data in noisy_data_list:
    print(data.time_window)  # Should print a tensor with the same length as the number of nodes in each graph


tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3])
tensor([4, 4, 4, 4, 4, 4, 4, 4, 4, 4])
tensor([5, 5, 5, 5, 5, 5, 5, 5, 5, 5])
tensor([6, 6, 6, 6, 6, 6, 6, 6, 6, 6])
tensor([7, 7, 7, 7, 7, 7, 7, 7, 7, 7])
tensor([8, 8, 8, 8, 8, 8, 8, 8, 8, 8])
tensor([9, 9, 9, 9, 9, 9, 9, 9, 9, 9])
tensor([10, 10, 10, 10, 10, 10, 10, 10, 10, 10])
tensor([11, 11, 11, 11, 11, 11, 11, 11, 11, 11])
tensor([12, 12, 12, 12, 12, 12, 12, 12, 12, 12])
tensor([13, 13, 13, 13, 13, 13, 13, 13, 13, 13])
tensor([14, 14, 14, 14, 14, 14, 14, 14, 14, 14])
tensor([15, 15, 15, 15, 15, 15, 15, 15, 15, 15])
tensor([16, 16, 16, 16, 16, 16, 16, 16, 16, 16])
tensor([17, 17, 17, 17, 17, 17, 17, 17, 17, 17])
tensor([18, 18, 18, 18, 18, 18, 18, 18, 18, 18])
tensor([19, 19, 19, 19, 19, 19, 19, 19, 19, 19])
tensor([20, 20, 20, 20, 20, 20, 20, 20, 20, 20])
tensor([21, 21, 21, 21, 21, 21, 21, 21, 21, 21])
tensor([22, 22, 22, 22

Experiment with Strict vs. Non-Strict Causality:
Non-Strict Causality (<=): Allows nodes to consider neighbors from the current window (i.e., nodes can influence each other within the same time window).

Strict Causality (<): Prevents nodes from considering neighbors from the same window (i.e., nodes can only influence past time windows).

Full Code for Training Loop with Causal Sampling:

Summary:
The time_window should be an attribute of each Data object.

When performing causal sampling during the training or forward pass, reference the time_window correctly using the data.time_window[row] format.

Correctly apply the mask based on your causality preference (<= for non-strict, < for strict).

In [36]:
# Assuming this is inside your model's forward pass or during training
for t, data in enumerate(noisy_data_list):
    data.time_window = torch.full((data.num_nodes,), t, dtype=torch.long)  # Set time_window if not already done

    # Your GraphSAGE processing here (with causal sampling logic)
    for row, col in zip(data.edge_index[0], data.edge_index[1]):
        # Apply causal mask depending on the choice of causality (strict vs non-strict)
        mask = data.time_window[row] <= data.time_window[col]  # Non-strict
        # mask = data.time_window[row] < data.time_window[col]  # Strict

        # Check if the mask for this edge is True
        if mask.item():  # Use .item() to get the boolean value from the tensor
            # Proceed with aggregation for valid neighbors
            # For example, aggregate features here for GraphSAGE
            pass


1. Aggregation of Valid Neighbors:
We will aggregate features from valid neighbors based on the causal mask. For simplicity, let's assume you're summing the features of valid neighbors. If you want to experiment with other aggregation methods like average or max, you can easily modify this logic.

2. Update Node Representations:
After aggregating the valid neighbors, we combine the node's features with the aggregated features (as typically done in GraphSAGE).

3. Complete Forward Pass:
The forward pass will involve processing each graph in noisy_data_list, applying the aggregation logic, and passing the updated node representations through the model.

4. Training Loop:
We’ll set up the training loop using your loss function (CrossEntropyLoss) and optimizer (Adam). We'll update the model's weights and compute the loss after each batch.

5. Evaluation:
After training, we’ll evaluate the model on a test set, and you can experiment with different metrics based on your task.

In [45]:
print(f"Shape of features in the first window: {noisy_data_list[0].x.shape}")

Shape of features in the first window: torch.Size([10, 55])


In [53]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split

class GraphSAGE(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GraphSAGE, self).__init__()
        self.conv1 = torch.nn.Conv1d(2 * in_channels, hidden_channels, kernel_size=1)
        self.conv2 = torch.nn.Conv1d(hidden_channels, out_channels, kernel_size=1)
        self.in_channels = in_channels

    def forward(self, x, edge_index, time_window):
        x = x.unsqueeze(0).transpose(1, 2)
        aggregated_features = torch.zeros_like(x)
        for row, col in zip(edge_index[0], edge_index[1]):
            mask = time_window[row] <= time_window[col]
            if mask.item():
                aggregated_features[0, :, row] += x[0, :, col]
        aggregated_features = aggregated_features / (aggregated_features.sum(dim=2, keepdim=True) + 1e-8)
        combined_features = torch.cat([x, aggregated_features], dim=1)
        out = F.relu(self.conv1(combined_features))
        out = self.conv2(out)
        out = out.squeeze(0).transpose(0, 1)
        return out

def train(model, optimizer, loss_fn, train_loader, num_epochs=20):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for data in train_loader:
            optimizer.zero_grad()
            out = model(data.x, data.edge_index, data.time_window)
            loss = loss_fn(out, data.y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

def evaluate(model, data_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in data_loader:
            out = model(data.x, data.edge_index, data.time_window)
            _, predicted = torch.max(out, dim=1)
            total += data.y.size(0)
            correct += (predicted == data.y).sum().item()
    accuracy = correct / total
    print(f"Accuracy: {accuracy:.4f}")
    return accuracy

# Define hyperparameters
in_channels = 55
hidden_channels = 32
out_channels = 2
learning_rate = 0.01
batch_size = 1 # Process each window individually for simplicity

# Assuming you have your noisy_data_list
all_data = noisy_data_list

# Split the data into training and validation sets (80% train, 20% validation)
train_data_list, val_data_list = train_test_split(all_data, test_size=0.2, random_state=42, stratify=[data.y.cpu().numpy() for data in all_data])

print(f"Number of training samples: {len(train_data_list)}")
print(f"Number of validation samples: {len(val_data_list)}")

# Create data loaders (even if batch size is 1 for now)
train_loader = train_data_list
val_loader = val_data_list

# Initialize the model, optimizer, and loss function
model = GraphSAGE(in_channels, hidden_channels, out_channels)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# Train the model on the training set
train(model, optimizer, loss_fn, train_loader, num_epochs=20)
torch.save(model.state_dict(), '/content/drive/MyDrive/Malware/malware_model.pth')
print("Trained model saved to 'malware_model.pth'")

# Evaluate the model on the validation set
evaluate(model, val_loader)

Number of training samples: 68
Number of validation samples: 17
Epoch 1, Loss: 2.0867
Epoch 2, Loss: 0.0034
Epoch 3, Loss: 0.0013
Epoch 4, Loss: 0.0010
Epoch 5, Loss: 0.0008
Epoch 6, Loss: 0.0007
Epoch 7, Loss: 0.0006
Epoch 8, Loss: 0.0005
Epoch 9, Loss: 0.0004
Epoch 10, Loss: 0.0004
Epoch 11, Loss: 0.0003
Epoch 12, Loss: 0.0003
Epoch 13, Loss: 0.0002
Epoch 14, Loss: 0.0002
Epoch 15, Loss: 0.0002
Epoch 16, Loss: 0.0002
Epoch 17, Loss: 0.0002
Epoch 18, Loss: 0.0001
Epoch 19, Loss: 0.0001
Epoch 20, Loss: 0.0001
Trained model saved to 'malware_model.pth'
Accuracy: 1.0000


1.0

Implementing Out-of-Distribution (OOD) Testing

---The approach here will be to treat the 80% training data as your "in-distribution" data and the 20% validation data as a limited proxy for "out-of-distribution" data. However, it's crucial to remember the limitations: this is not true OOD as it's still from the same original dataset.

