# Configure environment

In [None]:
"""
Choose the dataset name (the dataset should be inside the folder /dataset in csv format)
The default dataset is: openml_203ds_datasets_matching
"""
dataset_name = "openml_203ds_datasets_matching"

"""
choose integer number of ratio negative/positive to sample (0 will use all negative pairs)
"""
neg_sample = 2
"""
Choose one split trategy ["isolation","random"] : 
- random will randomly spread positive node pairs in 80-20 fashion
- isolation will isolate 1 node from some topics in test (none pair in train will see these nodes).
The positive pairs will be splitted almost in 80-20%, like in the random case.
"""
strategy = "random"
"""
Choose to use the selected strategy to create a new split 
or reuse a previously created one (useful to repeat exact same experiment)
"""
create_new_split = False

"""
You can choose to use one of ["FASTTEXT","BERT"] as initial word_embedding encoding for the nodes in the datasets
"""
word_embedding_encoding = "FASTTEXT"

"""
These are the default values

dataset_name = "openml_203ds_datasets_matching"
neg_sample = 2
strategy = "random"
create_new_split = False #assumes splitted files exists already
word_embedding_encoding = "FASTTEXT"
"""
print("Env variables set")

# Import the step3 project modules used by the rest of the notebook.
from step3 import step3_gcnsm
from step3.step3_gcnsm import (
    confusion_matrix,
    train,
    cross_validation,
    test_mask,
    train_mask,
    g,
)
from step3 import step3_gcn_nn_concatenate as gcn_nn
from step3 import step3_gcn_loss as gcn_loss
from step3 import step3_gcn_training as gcn_training
from step3 import step3_plot_results as plot

# NOTE(review): while the load_env call below stays commented out, nothing
# visible in this notebook passes dataset_name, neg_sample, strategy,
# create_new_split or word_embedding_encoding to step3_gcnsm. Confirm how
# step3_gcnsm picks up its settings.
# step3_gcnsm.load_env(ds_name=dataset_name,ns=neg_sample,st=strategy,sp=create_new_split,we=word_embedding_encoding)
print("\n SETUP IS READY")

In [None]:
# Presumably shows the available NN architecture options (compare the list in
# the next section) - TODO confirm against step3_gcn_nn_concatenate.
gcn_nn.get_options()

# Choose NN architecture and loss function, then run tests

### Config and run training
### NN architectures: 

{<br>
    "0": "Bert_300", <br>
    "1": "Bert_300_300_200", <br>
    "2": "Bert_768", <br>
    "3": "Fasttext2_150", <br>
    "4": "Fasttext3GCN_300", <br>
    "5": "Fasttext_150", <br>
    "6": "Fasttext_150_150_100", <br>
    "7": "Fasttext_300" <br>
}
### Loss functions: 
{<br>
    "0": "ContrastiveLoss", <br>
    "1": "CosineEmbeddingLoss", <br>
}

### Optimizer
{<br>
    "adam" (default)<br>
    "sgd"<br> 
}


### Loss function parameter examples: format -> [margin]+[aggregation_function]
{<br>
    0.9+mean, <br>
    0.7+mean, <br>
    0.5+mean, <br>
    0.3+mean, <br>
    0.9+sum, <br>
    0.7+sum, <br>
    0.5+sum, <br>
    0.3+sum, <br>
}

### batch_splits examples: 
{<br>
    64, <br>
    128, <br>
}
### learning rate examples (lr): 
{<br>
    1e-3, <br>
    1e-4, <br>
}

### Examples

In [None]:
# Example 1: load a model from a path, then train it.
# training = gcn_training.Training()
# training.load_state(path="./models/[file_name].pt")
# train(training,iterations=N)

# Example 2: train a new model and specify its parameters.
# training = gcn_training.Training()
# training.set_training(
#             net_name= gcn_nn.get_option_name(),  # number of the option for the NN architecture
#             batch_splits= ,  # number of sets (batch size = dataset size / batch_splits)
#             lr= ,  # learning rate for training (e.g. 1e-3)
#             loss_name=gcn_loss.get_option_name(),  # number of the option for the loss function
#             loss_parameters= ,  # loss function parameters separated by '+', e.g. "0.0+mean" for cosine and contrastive
#             optimizer_name= )  # "adam" or "sgd" (default: "adam")
# train(training,iterations=N)

# Print the confusion matrix and results using the training object.
# confusion_matrix(training.net, step3_gcnsm.g, step3_gcnsm.g.ndata['vector'], step3_gcnsm.test_mask,training.loss_name,threshold = 0.2)

### Test suite

In [None]:
## Write the individual tests you want to run here


### 10-fold Cross Validation

In [None]:
# Example: configure a new model, then run cross validation on it.
# training_object = gcn_training.Training()
# training_object.set_training(
#             net_name= gcn_nn.get_option_name(),  # number of the option for the NN architecture
#             batch_splits= ,  # number of sets (batch size = dataset size / batch_splits)
#             lr= ,  # learning rate for training (e.g. 1e-3)
#             loss_name=gcn_loss.get_option_name(),  # number of the option for the loss function
#             loss_parameters= ,  # loss function parameters separated by '+', e.g. "0.0+mean" for cosine and contrastive
#             optimizer_name= )  # "adam" or "sgd" (default: "adam")

# Second argument: number of training iterations per fold.
# cross_validation(training_object,num_of_training_iterations_per_fold)

## Plot results

<p>This will plot charts of loss/accuracy for all the results that match the parameters options under the /results folder</p>

#### Parameters options

<p>Choose one value for each parameter and pass them to the corresponding plot function in the following order:</p>

<b>1) neg_sample</b> = [1,2,3,4...etc] <br>
<b>2) db_name</b> = ["openml_203ds_datasets_matching"] <br>
<b>3) strategy</b> = ["isolation","random"] <br>
<b>4) archi</b> = ["Fasttext_150","Fasttext_300","Bert_300","Bert_768"] <br>
<b>5) optimizer</b> = ["adam","sgd"] <br>
<b>6) loss_functions</b> = ["ContrastiveLoss","CosineEmbeddingLoss"] <br>

#### Type of chart
<b>plot_by_loss_parameters:</b> groups in one chart the different results for loss functions parameters (margin) <br>
<b>plot_by_split </b>: groups in one chart the different results for size of batch splits <br>
<b>plot_cv </b>: plot the result of cross validation runs that were found

### Plot functions call example

In [None]:
# Call plot_by_loss_parameters once for every (optimizer, neg_sample)
# combination listed below. Positional arguments follow the order
# neg_sample, db_name, strategy, archi, optimizer, loss_function.
opti = ["sgd"]
samp = [8]
for o in opti:
    for s in samp:
        plot.plot_by_loss_parameters(
            s,
            "openml_203ds_datasets_matching",
            "isolation",
            "Fasttext_150",
            o,
            "ContrastiveLoss",
        )
        # Alternative chart types (uncomment to use):
        # plot.plot_by_split(2,"openml_203ds_datasets_matching","isolation","Fasttext_150","adam","ContrastiveLoss")
        # plot.plot_cv(2,"openml_203ds_datasets_matching","isolation","Fasttext_150","adam","ContrastiveLoss")

In [None]:
# Plot the cross-validation results found for: neg_sample=2, the
# "openml_203ds_datasets_matching" dataset, the "random" strategy, the
# "Fasttext_150" architecture, the "adam" optimizer and "ContrastiveLoss".
plot.plot_cv(2,"openml_203ds_datasets_matching","random","Fasttext_150","adam","ContrastiveLoss")