In [216]:
import os
import torch
os.environ['TORCH'] = torch.__version__
from torch_geometric.data import Data
from torch_geometric.nn import GATv2Conv

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

Dataframe_Labels = pd.read_csv("../BLCA_DATA/Workspace/labels_str.csv")
Dataframe_link = pd.read_csv("../BLCA_DATA/Workspace/patient_norm.csv")
Dataframe_node = pd.read_csv("../BLCA_DATA/Workspace/node_embedding.csv")

Dataframe_Labels['class_int'], uniques = pd.factorize(Dataframe_Labels['class'])

Dataframe_Labels = Dataframe_Labels[~Dataframe_Labels['class_int'].isin([4, 5])]

patients_to_keep = Dataframe_Labels['Patient'].unique()

Dataframe_link = Dataframe_link[Dataframe_link['Patient'].isin(patients_to_keep)]
Dataframe_node = Dataframe_node[Dataframe_node['Patient'].isin(patients_to_keep)]

Dataframe_Labels = Dataframe_Labels.reset_index(drop=True)
Dataframe_link = Dataframe_link.reset_index(drop=True)
Dataframe_node = Dataframe_node.reset_index(drop=True)

In [217]:
classes_dict = {0: 'LumP', 
                1: 'Ba/Sq', 
                2: 'LumU', 
                3: 'Stroma-rich'
}

def count_classes_weights(tensor):
    array = tensor.numpy()
    classes_tab = { 0: 0, 
                    1: 0,
                    2: 0, 
                    3: 0
    }
    for i in array:
        classes_tab[i]+=1

    mean_nb_classes = 0
    for i in classes_tab:
        mean_nb_classes += i
    mean_nb_classes *= 1/len(classes_tab)
    
    # normalize the weights
    weight_sum = 0
    for i in range(len(classes_tab)):
        if classes_tab[i] != 0:
            weight_sum += mean_nb_classes / classes_tab[i]
    alpha = 1 / weight_sum

    weight_dict = {}
    used_classes = []
    for i in range(len(classes_tab)):
        if classes_tab[i] != 0:
            weight_dict[i] = alpha * (mean_nb_classes / classes_tab[i]) *50
            used_classes.append(classes_dict[i])

    return used_classes, weight_dict

In [218]:
node_features = Dataframe_node.drop(columns=['Patient']).values
node_features = torch.tensor(node_features, dtype=torch.float)

def get_data(indices, similarity_threshold, Dataframe_link, Dataframe_Labels, num_classes=4):
    x_data = node_features[indices]

    patient_similarity = cosine_similarity(Dataframe_link.iloc[:, 1:])
    similarity_threshold = 0.5  # Exemple de seuil de similarité    

    # Calculate the edges and attention ridges for training
    edge_index = []
    edge_attr = []
    re_indexed_i = 0
    re_indexed_j = 0
    for i in indices:
        for j in indices:
            if i >= j :
                break
            if patient_similarity[i, j] > similarity_threshold:
                edge_index.append([re_indexed_i, re_indexed_j])
                edge_attr.append((patient_similarity[i, j] - similarity_threshold)/(1 - similarity_threshold))
            re_indexed_j +=1
        re_indexed_i +=1
        re_indexed_j = 0

    node_labels = Dataframe_Labels["class_int"].values
    labels = torch.tensor(node_labels[indices], dtype=torch.long)

    node_labels = torch.tensor(node_labels, dtype=torch.long)

    used_classes, weight_dict = count_classes_weights(node_labels)
    Dataframe_Labels['weight'] = [weight_dict[x] for x in Dataframe_Labels['class_int']]
    node_weights = torch.tensor(Dataframe_Labels['weight'], dtype=torch.float)

    edge_features = torch.tensor(x_data, dtype=torch.float)
    edge_index = torch.tensor(edge_index, dtype=torch.int64).t().contiguous()
    edge_attr = torch.tensor(edge_attr, dtype=torch.float)
    node_weights = node_weights[indices]

    data = Data(
    x=edge_features, 
    edge_index=edge_index, 
    edge_attr=edge_attr, 
    y=labels, 
    weights=node_weights, 
    num_classes=num_classes,
    num_nodes = len(edge_features),
    num_features = edge_features.shape[1],
    )

    return data


In [219]:
from sklearn.model_selection import train_test_split
import random
import numpy as np

similarity_threshold = 0.5
test_size = 0.2
n_graphs = 5  # Nombre de divisions souhaitées pour train_val_indices

# Étape 1: Diviser les indices en ensembles train/validation et test
train_val_indices, test_indices = train_test_split(
    range(len(node_features)), 
    test_size=test_size, 
    random_state=random.randint(0, 1000)
)

# Étape 2: Diviser train_val_indices en n listes d'indices
def split_indices(indices, n_graphs):
    np.random.shuffle(indices)  # Mélanger les indices pour une division plus aléatoire
    tabs =  np.array_split(indices, n_graphs)
    for i in tabs:
        print(i)
    return tabs

# Appliquer la fonction de division
split_train_val_indices = split_indices(train_val_indices, n_graphs)

# Affichage des résultats
for i, indices in enumerate(split_train_val_indices):
    print(f"Subset {i+1}: {indices}, {len(indices)}")


[170  55 224 171 270  70 180 215 337 353 285 304 198 330 227 143 243  56
 133 305  95 105 163  18  51 272 200  35 126 366  31 362 275 257  96  71
 232 342 176 332  34 131 355 315 233  14  28 209 311 132 129 139 174 158
 328  57 298 231 324 207   6]
[ 88  81 203 277  60 334 264 356  12 291  69 250 115 101 325 216 236 169
 318 225 333 189   1 107 211 259 300 187 290 214  63 217 103 363 357 242
 279 276 175  92  91 278 222 299 283 296 316 140 119 226 218 254  58 266
  29  85 141 156  30  66 248]
[220 369 313 349 195 208 149 237  61  45 309 128 306 219 108 252 370 102
 114 323 302  50 173 190 210 199 314  86 183 186 281 161 365  36 188   5
 162 335  32 240 229 235 168  33 262 241  72  59   7 135 282 127 256  87
 289  62  38 371 308 194]
[ 16  76 267 113 204 360 155 255 284  25 117 123  99 271   8  75 116 246
  27  84 159 336 125 205  54 307 228 273 303 165 124 153 375  65 345  83
 213 253 344  42  41 196 206 144 172  15 193 260 341 339  89 166 244 182
 373  37  46 191 326 359]
[374 331 100

In [220]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATv2Conv
import optuna
import numpy as np
from sklearn.model_selection import StratifiedKFold

class GATv2(torch.nn.Module):
    def __init__(self, hidden_channels, heads, data):
        super(GATv2, self).__init__()
        torch.manual_seed(1234)
        self.convs = torch.nn.ModuleList()
        self.convs.append(GATv2Conv(data.num_features, hidden_channels, heads=heads, edge_dim=1))
        self.convs.append(GATv2Conv(hidden_channels * heads, data.num_classes, edge_dim=1))
        
    def forward(self, x, edge_index, edge_attr):
        for conv in self.convs[:-1]:
            x = F.dropout(x, p=0.6, training=self.training)
            x = conv(x, edge_index, edge_attr)
            x = F.elu(x)
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.convs[-1](x, edge_index, edge_attr)
        return x

def weighted_cross_entropy_loss(output, target, weights):
    loss = F.cross_entropy(output, target, reduction='none')
    weighted_loss = loss * weights[target]
    return weighted_loss.mean()

def train(model, data, optimizer):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index, data.edge_attr)
    loss = weighted_cross_entropy_loss(out[data.train_mask], data.y[data.train_mask], data.weights[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss, model

def test(model, data, mask):
    model.eval()
    out = model(data.x, data.edge_index, data.edge_attr)
    pred = out.argmax(dim=1)
    correct = pred[mask] == data.y[mask]
    acc = int(correct.sum()) / int(mask.sum())
    return acc, pred[mask]

In [221]:
similarity_threshold = 0

train_val_indices, test_indices = train_test_split(
    range(len(node_features)), 
    test_size=0.2, 
    random_state=42
)

train_val_data = get_data(train_val_indices, similarity_threshold, Dataframe_link, Dataframe_Labels, num_classes=4)
test_data = get_data(test_indices, similarity_threshold, Dataframe_link, Dataframe_Labels, num_classes=4)

print(train_val_data)
print(test_data)

def objective(trial):
    # Hyperparameters to be optimized
    hidden_channels = trial.suggest_int('hidden_channels', 15, 45)
    heads = trial.suggest_int('heads', 1, 25)
    n_graphs = trial.suggest_int('n_graphs', 2, 10)

    #define the attributes
    data = train_val_data
    data_test = test_data
    num_epochs = 10
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)
    all_test_acc = []

    for fold, (_, _) in enumerate(skf.split(data.x, data.y)):

        split_train_val_indices = split_indices(data.y, n_graphs)
        graph = 1
        data_graph_init = get_data(split_train_val_indices[0], similarity_threshold, Dataframe_link, Dataframe_Labels, num_classes=4)

        model = GATv2(hidden_channels=hidden_channels, heads=heads, data=data_graph_init)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

        # Appliquer la fonction de division
        for node_in_each_graph in split_train_val_indices:
            print(node_in_each_graph)
            
            data_graph = get_data(node_in_each_graph, similarity_threshold, Dataframe_link, Dataframe_Labels, num_classes=4)
            print(data_graph.edge_index)
            # Étape 1: Diviser les indices en ensembles train/validation
            train_index, val_index = train_test_split(
                range(len(node_in_each_graph)), 
                test_size=0.2, 
                random_state=random.randint(0,1000)
            )
            print("o")
            # Define masks
            data_graph.train_mask = torch.zeros(data_graph.num_nodes, dtype=torch.bool)
            data_graph.train_mask[train_index] = True
            
            data_graph.val_mask = torch.zeros(data_graph.num_nodes, dtype=torch.bool)
            data_graph.val_mask[val_index] = True

            # Training loop
            for epoch in range(1, num_epochs):
                loss, model = train(model, data_graph, optimizer)
                train_acc, _ = test (model, data_graph, data_graph.train_mask)
                val_acc, _ = test(model, data_graph, data_graph.val_mask)
                if epoch % 50 == 0:
                    print(f'Graph: {graph}, Fold: {fold + 1}, Epoch: {epoch:03d}, Loss: {loss:.4f}, Train_acc {train_acc:.4f}, Val_acc {val_acc:.4f}')
            graph += 1
        
        test_mask = torch.ones(data_test.num_nodes, dtype=torch.bool)
        test_acc, _ = test(model, data_test, test_mask)

        # Store results
        all_test_acc.append(test_acc)

    # Calculate mean test accuracy for all folds
    mean_test_acc = np.mean(all_test_acc)
    return mean_test_acc


Data(x=[302, 825], edge_index=[2, 661], edge_attr=[661], y=[302], weights=[302], num_classes=4, num_nodes=302, num_features=825)
Data(x=[76, 825], edge_index=[2, 242], edge_attr=[242], y=[76], weights=[76], num_classes=4, num_nodes=76, num_features=825)


  edge_features = torch.tensor(x_data, dtype=torch.float)
  edge_features = torch.tensor(x_data, dtype=torch.float)


In [222]:
# Run the optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=25)

# Print the best parameters
print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-07-16 17:10:21,326] A new study created in memory with name: no-name-7f477090-6641-494a-b60a-5da87bfdfa1b
  np.random.shuffle(indices)  # Mélanger les indices pour une division plus aléatoire
  edge_features = torch.tensor(x_data, dtype=torch.float)
  edge_features = torch.tensor(x_data, dtype=torch.float)
[W 2024-07-16 17:10:21,353] Trial 0 failed with parameters: {'hidden_channels': 26, 'heads': 19, 'n_graphs': 10} because of the following error: IndexError('index 0 is out of bounds for dimension 0 with size 0').
Traceback (most recent call last):
  File "/home/remik/Documents/Cassiopée/cassiopee-projet/Cass/lib/python3.12/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_56226/2131357062.py", line 60, in objective
    loss, model = train(model, data_graph, optimizer)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_56226/1216009520.py", line 

tensor([0, 0, 0, 3, 1, 1, 1, 3, 0, 1, 0, 2, 0, 0, 1, 1, 3, 3, 0, 0, 1, 0, 1, 2,
        1, 0, 2, 2, 0, 2, 1])
tensor([3, 3, 0, 3, 0, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 0, 3, 2, 1,
        3, 1, 0, 1, 2, 0, 0])
tensor([1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 2, 1, 0, 3, 2, 1, 2, 2, 1, 1, 1, 1, 0, 3,
        2, 1, 2, 2, 1, 1])
tensor([1, 2, 1, 3, 3, 1, 0, 0, 0, 0, 2, 1, 1, 1, 0, 2, 1, 3, 2, 1, 0, 1, 2, 2,
        0, 1, 1, 2, 1, 1])
tensor([1, 1, 1, 2, 1, 3, 3, 1, 0, 2, 3, 1, 3, 1, 2, 2, 1, 0, 2, 0, 3, 0, 1, 1,
        0, 2, 1, 2, 0, 1])
tensor([2, 1, 0, 1, 0, 1, 0, 3, 1, 1, 0, 1, 0, 0, 2, 3, 2, 1, 1, 2, 3, 3, 0, 1,
        1, 1, 0, 1, 1, 0])
tensor([0, 0, 1, 2, 2, 1, 1, 1, 0, 1, 2, 1, 2, 0, 1, 1, 1, 3, 1, 0, 2, 0, 0, 1,
        3, 1, 1, 1, 3, 0])
tensor([1, 1, 1, 3, 1, 2, 1, 1, 0, 2, 3, 0, 0, 1, 0, 3, 1, 1, 1, 3, 3, 1, 0, 0,
        2, 0, 0, 2, 1, 2])
tensor([1, 2, 0, 1, 2, 1, 1, 2, 1, 1, 3, 1, 1, 3, 1, 3, 2, 2, 1, 2, 1, 2, 2, 3,
        0, 1, 1, 0, 0, 3])
tensor([1, 1, 1, 1, 2, 0, 0, 2,

IndexError: index 0 is out of bounds for dimension 0 with size 0