### model_name = 'VPrashant/cypher-gen' Use for cypher query generaion from text

In [3]:
import torch
from datasets import load_dataset

from transformers import Trainer, TrainingArguments
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [23]:
## Variables
model_name = "google-t5/t5-small"
train_data = "data/train.csv"

In [None]:
# Load model and tokenizer
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)
tokenizer = T5Tokenizer.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
# Prepare data
data = load_dataset('csv', data_files={'train': train_data})
data = data['train'].train_test_split(test_size=0.2, seed=42)

In [7]:
def preprocess(batch):
    # Handle all examples in the batch
    input_ids = []
    attention_mask = []
    labels = []
    
    for prompt, query in zip(batch['prompt'], batch['query']):
        # Tokenize input and output
        inputs = tokenizer("translate to Cypher: " + prompt, truncation=True, padding='max_length', max_length=128)
        outputs = tokenizer(query, truncation=True, padding='max_length', max_length=128)
        
        # Collect tokenized inputs and labels
        input_ids.append(inputs['input_ids'])
        attention_mask.append(inputs['attention_mask'])
        labels.append(outputs['input_ids'])
    
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


In [9]:
tokenized_data = data.map(preprocess, batched=True)
tokenized_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [10]:
# Fine-tune the model
training_args = TrainingArguments(
    output_dir='./cypher-model',
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=300,
    weight_decay=0.01,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],
    tokenizer=tokenizer
)

trainer.train()


  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,0.692326
2,No log,0.120393
3,No log,0.025228
4,No log,0.014331
5,No log,0.009798
6,No log,0.006446
7,No log,0.004081
8,0.542300,0.003055
9,0.542300,0.002015
10,0.542300,0.001351


TrainOutput(global_step=19500, training_loss=0.014313454917954424, metrics={'train_runtime': 2698.9754, 'train_samples_per_second': 57.689, 'train_steps_per_second': 7.225, 'total_flos': 5268179622297600.0, 'train_loss': 0.014313454917954424, 'epoch': 300.0})

In [11]:
model.save_pretrained(model_name)
tokenizer.save_pretrained(model_name)

('google-t5/t5-small/tokenizer_config.json',
 'google-t5/t5-small/special_tokens_map.json',
 'google-t5/t5-small/spiece.model',
 'google-t5/t5-small/added_tokens.json')

In [22]:
def test(test_input, model_name: str="google-t5/t5-small"):
    # Load fine-tuned model and tokenizer
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    # Set the model to evaluation mode
    model.eval()

    # Check if GPU is available and move model to GPU if possible
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Tokenize the input
    test_encoding = tokenizer(
        test_input,
        return_tensors="pt",
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    test_encoding = {key: val.to(device) for key, val in test_encoding.items()}  # Move input to device

    # Generate Cypher query
    with torch.no_grad():  # Disable gradient calculations for inference
        output = model.generate(
            input_ids=test_encoding['input_ids'],
            attention_mask=test_encoding['attention_mask'],
            max_length=128
        )
    
    # Decode the generated query
    generated_query = tokenizer.decode(output[0], skip_special_tokens=True)

    # Print the generated Cypher query
    print("Input:", test_input)
    print("Generated Cypher Query:", generated_query)

# Call the test function
test_input = "give me a list of 10 genes associated with disease by overall direct indirect and score more than 0.5"
# test_input = "translate English to German: That is good."

test(test_input)


Input: give me a list of 10 genes associated with disease by overall direct indirect and score more than 0.5
Generated Cypher Query: MATCH (g:Gene)-[r:AssociationByOverallDirectIndirect]-(d:Disease) WHERE r.score > 0.5 RETURN g, d, r LIMIT 10
