In [6]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np


In [7]:
def create_graph_data(features, labels):
    """
    Convert the dataset into a graph format suitable for GNN processing.

    Every sample (row of ``features``) becomes one node, and every ordered
    pair of nodes -- including each node paired with itself -- becomes an
    edge, so the resulting graph is fully connected.

    - features: Feature matrix (samples x features); ``features.values`` is
      used as the node feature matrix and ``features.shape[0]`` as the node
      count.
    - labels: Target labels for classification; ``labels.values`` is used
      as the per-node label vector.
    Returns:
        A PyTorch Geometric Data object with ``x`` (float), ``edge_index``
        (int64, shape ``(2, num_nodes ** 2)``) and ``y`` (int64).
    """
    # Create node features (X)
    x = torch.tensor(features.values, dtype=torch.float)

    # Create edge index (fully connected graph for simplicity).
    # Built with tensor ops rather than a Python list of num_nodes ** 2
    # pairs: the edges and their order are the same, but there is no
    # Python-level quadratic loop, and an empty input still yields a
    # well-formed (2, 0) int64 tensor instead of a 1-D float tensor.
    num_nodes = features.shape[0]
    node_ids = torch.arange(num_nodes, dtype=torch.long)
    edge_index = torch.stack([
        node_ids.repeat_interleave(num_nodes),  # first row:  0,0,...,1,1,...
        node_ids.repeat(num_nodes),             # second row: 0,1,...,0,1,...
    ]).contiguous()

    # Create labels
    y = torch.tensor(labels.values, dtype=torch.long)

    return Data(x=x, edge_index=edge_index, y=y)

class GCN(torch.nn.Module):
    """Two-layer graph convolutional network that outputs per-node log-probabilities."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        """
        Set up the two graph-convolution layers.
        - input_dim: Number of input features per node.
        - hidden_dim: Width of the representation between the two layers.
        - output_dim: Number of values produced per node by the second layer.
        """
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """
        Apply conv1 -> ReLU -> conv2 to the graph and return the
        log-softmax over dim 1 of the result.
        - data: Object exposing ``x`` (node features) and ``edge_index``.
        """
        hidden = F.relu(self.conv1(data.x, data.edge_index))
        logits = self.conv2(hidden, data.edge_index)
        return F.log_softmax(logits, dim=1)

def train_and_evaluate_gnn(data, train_mask, test_mask, epochs=100, learning_rate=0.01, hidden_dim=16):
    """
    Train and evaluate the GNN model.

    A fresh GCN is trained with Adam for ``epochs`` full-graph steps using
    the negative log-likelihood loss on the nodes selected by
    ``train_mask``; the loss is printed every 10 epochs. Accuracy is then
    measured on the nodes selected by ``test_mask`` and printed.

    - data: PyTorch Geometric Data object.
    - train_mask: Mask for training nodes.
    - test_mask: Mask for testing nodes.
    - epochs: Number of training epochs.
    - learning_rate: Learning rate for the optimizer.
    - hidden_dim: Width of the GCN hidden layer (defaults to the previously
      hard-coded value of 16).
    Returns:
        The test accuracy as a float, or NaN when ``test_mask`` selects no
        nodes.
    """
    # One output unit per distinct label value present in data.y.
    model = GCN(input_dim=data.num_features, hidden_dim=hidden_dim, output_dim=len(data.y.unique()))
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        out = model(data)
        # Only the training nodes contribute to the loss; the forward pass
        # still runs over the whole graph.
        loss = F.nll_loss(out[train_mask], data.y[train_mask])
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {loss.item()}")

    # Evaluate the model
    model.eval()
    with torch.no_grad():
        pred = model(data).argmax(dim=1)
        num_test = int(test_mask.sum())
        if num_test == 0:
            # Avoid a ZeroDivisionError after a full training run when the
            # caller supplies an empty test split.
            print("Test Accuracy: n/a (test_mask selects no nodes)")
            return float("nan")
        correct = pred[test_mask] == data.y[test_mask]
        accuracy = int(correct.sum()) / num_test
        print(f"Test Accuracy: {accuracy:.4f}")

    return accuracy

def process_dataset(file_path, target_column):
    """
    Preprocess the dataset and create a graph representation.

    Steps: read the CSV, drop the first column (sample IDs) and the target
    column from the features, coerce non-numeric feature values to numbers,
    mean-impute missing values, z-score every feature column, integer-encode
    the target, and make a stratified 70/30 train/test split expressed as
    boolean node masks.

    - file_path: Path to the dataset.
    - target_column: Name of the target column.
    Returns:
        data: PyTorch Geometric Data object.
        train_mask: Mask for training nodes.
        test_mask: Mask for testing nodes.
    """
    # Load dataset
    df = pd.read_csv(file_path)

    # Separate features and target
    X = df.drop(columns=[df.columns[0], target_column])  # Exclude sample IDs and target column
    y = df[target_column]

    # Handle non-numeric columns by encoding them
    # (values that cannot be parsed as numbers become NaN).
    if X.select_dtypes(include=["object"]).shape[1] > 0:
        X = X.apply(pd.to_numeric, errors='coerce')

    # Handle missing values by filling them with the mean of the column
    X = X.fillna(X.mean())

    # Normalize features.
    # A constant column has zero standard deviation; dividing by it would
    # turn the whole column into NaN (0 / 0) and poison the model's loss.
    # Use 1 as the divisor there so the column simply becomes all zeros.
    col_std = X.std().replace(0, 1)
    X = (X - X.mean()) / col_std

    # Anything still NaN here (e.g. a column that was entirely non-numeric,
    # whose mean is itself NaN) carries no information; zero it out so the
    # feature tensor stays finite.
    X = X.fillna(0)

    # Convert target labels to numeric.
    # NOTE: pd.factorize encodes missing target values as -1.
    y = pd.factorize(y)[0]

    # Split data into training and testing sets
    train_indices, test_indices = train_test_split(np.arange(len(y)), test_size=0.3, stratify=y, random_state=42)
    train_mask = torch.zeros(len(y), dtype=torch.bool)
    test_mask = torch.zeros(len(y), dtype=torch.bool)
    train_mask[train_indices] = True
    test_mask[test_indices] = True

    # Create graph data
    data = create_graph_data(X, pd.Series(y))

    return data, train_mask, test_mask

In [11]:
def gnn_pipeline_for_multiple_datasets_and_methods(datasets, target_column, feature_methods, epochs=100, learning_rate=0.01):
    """
    Run the GNN workflow once for every dataset / feature-selection pair.

    Each file path is formed by concatenating a dataset entry with a
    feature-method entry; that file is preprocessed into a graph and a GNN
    is trained and evaluated on it. Progress is reported via print.

    - datasets: List of dataset file paths (used as path prefixes).
    - target_column: Name of the target column.
    - feature_methods: List of feature selection methods to process
      (used as file-name suffixes).
    - epochs: Number of training epochs.
    - learning_rate: Learning rate for the optimizer.
    """
    for dataset_prefix in datasets:
        print(f"\nProcessing dataset: {dataset_prefix}")

        for method_suffix in feature_methods:
            # Label shown to the user: the suffix minus its fixed tail, upper-cased.
            method_label = method_suffix.replace('_selected.csv', '').upper()
            print(f"\n  Feature selection method: {method_label}")

            csv_path = f"{dataset_prefix}{method_suffix}"
            graph, train_nodes, test_nodes = process_dataset(csv_path, target_column)

            print(f"Training and evaluating GNN on {csv_path}...")
            train_and_evaluate_gnn(
                graph,
                train_nodes,
                test_nodes,
                epochs=epochs,
                learning_rate=learning_rate,
            )


In [12]:

# Example usage
if __name__ == "__main__":
    # Label column shared by every input CSV.
    target_column = "Target"  # Replace with the actual target column name

    # File-name suffixes, one per feature selection method; each is appended
    # to every dataset prefix below to form the CSV path.
    feature_methods = [
        "_std_mean_selected.csv",
        "_anova_selected.csv",
        "_chi2_selected.csv",
    ]

    # Path prefixes of the preprocessed datasets.
    datasets = [
        "preprocessed/filtered_preprocessed_GSE4290",
        "preprocessed/filtered_preprocessed_GSE19804",
        "preprocessed/filtered_preprocessed_GSE27562",
        "preprocessed/filtered_preprocessed_GSE33315",
        "preprocessed/filtered_preprocessed_GSE59856",
    ]

    print("Running GNN classification on multiple datasets and feature selection methods...")
    gnn_pipeline_for_multiple_datasets_and_methods(
        datasets=datasets,
        target_column=target_column,
        feature_methods=feature_methods,
    )

Running GNN classification on multiple datasets and feature selection methods...

Processing dataset: preprocessed/filtered_preprocessed_GSE4290

  Feature selection method: _STD_MEAN
Training and evaluating GNN on preprocessed/filtered_preprocessed_GSE4290_std_mean_selected.csv...
Epoch 0, Loss: 1.3862944841384888
Epoch 10, Loss: 1.3248602151870728
Epoch 20, Loss: 1.2742712497711182
Epoch 30, Loss: 1.2678343057632446
Epoch 40, Loss: 1.267386794090271
Epoch 50, Loss: 1.265714406967163
Epoch 60, Loss: 1.2658941745758057
Epoch 70, Loss: 1.2655956745147705
Epoch 80, Loss: 1.2656352519989014
Epoch 90, Loss: 1.2655963897705078
Test Accuracy: 0.4340

  Feature selection method: _ANOVA
Training and evaluating GNN on preprocessed/filtered_preprocessed_GSE4290_anova_selected.csv...
Epoch 0, Loss: 1.3862944841384888
Epoch 10, Loss: 1.316877841949463
Epoch 20, Loss: 1.2736164331436157
Epoch 30, Loss: 1.2671407461166382
Epoch 40, Loss: 1.2674778699874878
Epoch 50, Loss: 1.265669822692871
Epoch 60,