In [1]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

import ray
from ray import tune
from ray.tune.tune_config import TuneConfig
from ray.air.config import RunConfig
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

from ray.air import session
from ray.air.checkpoint import Checkpoint

In [3]:
# Experiment name and the directory handed to Ray Tune for storing results;
# later cells combine the two to locate this experiment's results.
exp_name = "tune_analyzing_results_001"
storage_path = "./ray_results"

# Bare last expression: the cell displays whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [4]:
class GCN(torch.nn.Module):
    """Stack of GCNConv layers that outputs per-node log-probabilities.

    The stack is: one input layer (num_features -> num_neurons),
    ``network_size - 1`` hidden layers (num_neurons -> num_neurons) and one
    output layer (num_neurons -> num_classes), i.e. ``network_size + 1``
    convolutions for ``network_size >= 1``.
    """

    def __init__(self, num_features, num_classes, num_neurons, network_size):
        super().__init__()

        # Consecutive entries are the (in, out) widths of one convolution.
        # A non-positive repeat count yields an empty list, so the input and
        # output layers are always present.
        widths = (
            [num_features, num_neurons]
            + [num_neurons] * (network_size - 1)
            + [num_classes]
        )
        self.conv_layers = torch.nn.ModuleList(
            [GCNConv(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])]
        )

    def forward(self, data):
        """Run the stack on ``data`` and return log-softmax over dim 1.

        Reads ``data.x``, ``data.edge_index`` and ``data.edge_attr``; the
        latter is passed to every layer as the edge weight.
        """
        x = data.x
        edge_index = data.edge_index
        edge_weight = data.edge_attr

        # ReLU follows every layer except the last one.
        *hidden_layers, output_layer = self.conv_layers
        for conv in hidden_layers:
            x = F.relu(conv(x, edge_index, edge_weight))
        logits = output_layer(x, edge_index, edge_weight)

        return F.log_softmax(logits, dim=1)


In [5]:
def train_gcn(config, data):
    num_neurons = int(config["num_neurons"].sample())
    network_size = int(config["network_size"].sample())
    lr = float(config["lr"].sample())
    weight_decay = float(config["weight_decay"].sample())
    # epoch_num = int(config["epoch_num"].sample())

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = GCN(data.num_node_features, int(data.y.max() + 2), num_neurons, network_size).to(device)
    data = data.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    epochs = 500
    start = 0
    loaded_checkpoint = session.get_checkpoint()
    if loaded_checkpoint:
        last_step = loaded_checkpoint.to_dict()["step"]
        start = last_step + 1

    checkpoint_freq = 50
    model.train()
    for epoch in range(start, epochs):
        optimizer.zero_grad()
        out = model(data)
        loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()

        model.eval()
        pred = model(data).argmax(dim=1)
        correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
        acc = int(correct) / int(data.test_mask.sum())

        metrics = {"accuracy": acc, "loss": loss.item()}
        if epoch % checkpoint_freq == 0:
            checkpoint = Checkpoint.from_dict({"epoch": epoch})
            session.report(metrics, checkpoint=checkpoint)
        else:
            session.report(metrics)


In [6]:
# Load dataset
data = torch.load("datasets/data.pt")

# Define the search space for hyperparameters
config = {
    "num_neurons": tune.choice([32, 64, 128]),
    "network_size": tune.choice([2, 3, 4]),
    "lr": tune.loguniform(1e-4, 1e-1),
    "weight_decay": tune.loguniform(1e-6, 1e-3),
    # "epoch_num": tune.choice([100, 250, 500])
}

# Define the trainable function for Ray Tune
trainable_with_resources = tune.with_resources(lambda trainable: train_gcn(config, data), {"cpu": 8, "gpu": 1})

In [7]:
# Restart Ray so this cell can be re-run in the same kernel
ray.shutdown()
ray.init()

# Objective used by the ASHA scheduler, the progress table and the
# best-trial lookup below
metric = "accuracy"
mode = "max"

# Perform hyperparameter tuning using Ray Tune
reporter = CLIReporter(metric_columns=[metric])
# max_t matches the 500 training epochs run by train_gcn
scheduler = ASHAScheduler(metric=metric, mode=mode, max_t=500, grace_period=20)

# store results using tune.Tuner
tuner = tune.Tuner(
    trainable_with_resources,
    # The search space is nested under "params", so each trial config has
    # the form {"params": {...}}
    param_space={
        "params": config
    },
    tune_config=TuneConfig(
        num_samples=50,
        # time_budget_s=600.0,
        scheduler=scheduler,
    ),
    run_config=RunConfig(
        name=exp_name,
        storage_path=storage_path,  # Specify a directory to store results
        progress_reporter=reporter,
    ),
)
result = tuner.fit()
best_trial = result.get_best_result(metric, mode=mode, scope="last")
best_hyperparameters = best_trial.config
# best_trial.metrics is the full dict of last reported values; extract the
# accuracy so the message below prints a number rather than the whole dict.
best_metrics = best_trial.metrics
best_accuracy = best_metrics[metric]

print("Best hyperparameters found:")
print(best_hyperparameters)
print("Best accuracy found:", best_accuracy)

ray.shutdown()

2023-07-07 15:05:15,891	INFO worker.py:1636 -- Started a local Ray instance.


== Status ==
Current time: 2023-07-07 15:05:19 (running for 00:00:00.95)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 320.000: None | Iter 80.000: None | Iter 20.000: None
Logical resource usage: 8.0/8 CPUs, 1.0/1 GPUs
Result logdir: C:\Users\Shamit\ray_results\tune_analyzing_results_001
Number of trials: 16/50 (16 PENDING)
+--------------------+----------+-------+-------------+-----------------------+----------------------+-----------------------+
| Trial name         | status   | loc   |   params/lr |   params/network_size |   params/num_neurons |   params/weight_decay |
|--------------------+----------+-------+-------------+-----------------------+----------------------+-----------------------|
| lambda_98275_00000 | PENDING  |       | 0.0207552   |                     2 |                   32 |           6.1951e-06  |
| lambda_98275_00001 | PENDING  |       | 0.0158372   |                     4 |                   64 |           5.25682e-05 |
| lambda_98275_00002 | PENDING  | 

Trial name,accuracy,date,done,hostname,iterations_since_restore,loss,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,training_iteration,trial_id
lambda_98275_00000,0.568557,2023-07-07_15-05-26,False,BasementPC,1,3.52746,127.0.0.1,29908,True,2.16504,2.16504,2.16504,1688760326,1,98275_00000


== Status ==
Current time: 2023-07-07 15:05:29 (running for 00:00:10.97)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 320.000: None | Iter 80.000: 0.6254122820794723 | Iter 20.000: 0.5762525522223967
Logical resource usage: 8.0/8 CPUs, 1.0/1 GPUs
Result logdir: C:\Users\Shamit\ray_results\tune_analyzing_results_001
Number of trials: 17/50 (16 PENDING, 1 RUNNING)
+--------------------+----------+-----------------+-------------+-----------------------+----------------------+-----------------------+------------+
| Trial name         | status   | loc             |   params/lr |   params/network_size |   params/num_neurons |   params/weight_decay |   accuracy |
|--------------------+----------+-----------------+-------------+-----------------------+----------------------+-----------------------+------------|
| lambda_98275_00000 | RUNNING  | 127.0.0.1:29908 | 0.0207552   |                     2 |                   32 |           6.1951e-06  |   0.663264 |
| lambda_98275_00001 | PENDING

2023-07-07 15:06:44,740	INFO tune.py:1111 -- Total run time: 86.76 seconds (86.37 seconds for the tuning loop).


== Status ==
Current time: 2023-07-07 15:06:44 (running for 00:01:26.62)
Using AsyncHyperBand: num_stopped=50
Bracket: Iter 320.000: 0.7750903094078844 | Iter 80.000: 0.6401759070205748 | Iter 20.000: 0.5680069106329511
Logical resource usage: 8.0/8 CPUs, 1.0/1 GPUs
Result logdir: C:\Users\Shamit\ray_results\tune_analyzing_results_001
Number of trials: 50/50 (50 TERMINATED)
+--------------------+------------+-----------------+-------------+-----------------------+----------------------+-----------------------+------------+
| Trial name         | status     | loc             |   params/lr |   params/network_size |   params/num_neurons |   params/weight_decay |   accuracy |
|--------------------+------------+-----------------+-------------+-----------------------+----------------------+-----------------------+------------|
| lambda_98275_00000 | TERMINATED | 127.0.0.1:29908 | 0.0207552   |                     2 |                   32 |           6.1951e-06  |   0.793937 |
| lambda_98275_

In [11]:
# Alternative when `tuner` is no longer in memory: restore the experiment from
# disk instead (note that `experiment_path` is only assigned in a later cell).
# restored_tuner = tune.Tuner.restore(experiment_path, trainable=trainable_with_resources)
# restored_tuner.get_results()

# Fetch the results of the fit above; the bare last expression displays the
# `errors` attribute of the result grid as the cell output.
results_grid = tuner.get_results()
results_grid.errors

[]

In [31]:
%%javascript
// Disable the notebook front end's keyboard manager via the `Jupyter` global.
// NOTE(review): presumably here so keystrokes typed into the embedded
// TensorBoard view below are not handled as notebook shortcuts - confirm.
Jupyter.keyboard_manager.disable()

<IPython.core.display.Javascript object>

In [32]:

# Directory of this experiment's results: <storage_path>/<exp_name>
experiment_path = f"{storage_path}/{exp_name}"
print(f"Loading results from {experiment_path}...")
# (Re)load the TensorBoard notebook extension, then open TensorBoard on the
# experiment directory; IPython expands $experiment_path to the variable's value.
%reload_ext tensorboard
%tensorboard --logdir $experiment_path

Loading results from ./ray_results/tune_analyzing_results_001...


Reusing TensorBoard on port 6006 (pid 23640), started 0:16:40 ago. (Use '!kill 23640' to kill it.)