In [None]:
# Mount Google Drive into the Colab filesystem so later cells can read the
# dataset files stored under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration,TrainingArguments, Trainer
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import gc  # Import the garbage collection module
import torch.nn.functional as F


In [None]:
# Use the %pip magic (not !pip) so packages are installed into the environment
# of the running kernel rather than whatever `pip` the subshell resolves to.
# sentencepiece is required by T5Tokenizer; accelerate is required by Trainer.
%pip install sentencepiece
%pip install -U accelerate



In [None]:
import os
import json

# Locations of the Spider files on the mounted Google Drive.
json_file_path = '/content/drive/My Drive/DL/sql/spider/spider/train_spider.json'
dev_json_file_path = '/content/drive/My Drive/DL/sql/spider/spider/dev.json'
base_folder = '/content/drive/My Drive/DL/sql/spider/spider/database'


def _load_json(path):
    """Return the parsed content of the JSON file at ``path``."""
    with open(path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)


def _load_schemas(db_ids, database_root):
    """Read the SQL schema text for each database id.

    For every ``db_id`` the files ``<database_root>/<db_id>/schema.sql`` and
    ``<database_root>/<db_id>/<db_id>.sql`` are looked up, in that order.
    When both exist the second one wins, because it is read last and
    overwrites the first entry.

    Args:
        db_ids: iterable of database ids (folder names under ``database_root``).
        database_root: directory that holds one sub-folder per database.

    Returns:
        dict mapping ``db_id`` to the decoded file content. Ids for which
        neither file exists are absent from the dict.
    """
    schemas = {}
    for db_id in db_ids:
        for file_name in ('schema.sql', db_id + '.sql'):
            file_path = os.path.join(database_root, db_id, file_name)
            if os.path.exists(file_path):
                with open(file_path, 'rb') as binary_file:
                    # errors='replace' keeps one badly encoded file from
                    # aborting the whole load; valid UTF-8 decodes unchanged.
                    schemas[db_id] = binary_file.read().decode('utf-8', errors='replace')
    return schemas


# Load the training and validation examples. The validation split must come
# from dev.json; reading the training file here would make evaluation
# meaningless.
train_data = _load_json(json_file_path)
val_data = _load_json(dev_json_file_path)

# Database ids referenced by the training examples.
db_ids_set = set(item['db_id'] for item in train_data)

# Validation examples may reference databases that never occur in the training
# split, so their schemas are loaded as well; otherwise those samples would
# have no schema available.
val_db_ids = set(item['db_id'] for item in val_data)

# NOTE(review): the whole .sql file is stored as the "schema". The recorded
# maximum prompt length further down is hundreds of millions of characters,
# which suggests some files contain more than table definitions - confirm and
# consider keeping only the CREATE TABLE statements.
schema_dict = _load_schemas(db_ids_set | val_db_ids, base_folder)


In [None]:
# Build one prompt per training example: the natural-language question followed
# by the schema text of the database it refers to (None when no schema was loaded).
input_texts = [
    f"translate SQL: {sample['question']} into SQL with context schema: {schema_dict.get(sample['db_id'])}"
    for sample in train_data
]



In [None]:
# Character length of the longest prompt built above.
max_size = max(len(text) for text in input_texts)
print(max_size)


322070258


In [None]:
from torch.utils.data import Dataset

class SQLDatasetWithSchema(Dataset):
    """Text-to-SQL dataset that pairs each question with its database schema.

    Each item is a dict with two 1-D tensors:
    ``input_ids`` (the tokenized prompt) and ``labels`` (the tokenized query).
    Samples for which ``is_valid_sample`` returns False are skipped.

    Args:
        data: sequence of dicts with the keys ``'db_id'``, ``'question'`` and
            ``'query'``.
        schema_dict: mapping from ``db_id`` to the schema text.
        tokenizer: object whose ``encode(text, return_tensors='pt',
            max_length=..., truncation=True)`` returns a ``(1, seq_len)`` tensor.
        max_input_length: truncation length for the prompt (default 2048).
            NOTE(review): this is longer than the 512 tokens T5 checkpoints
            are usually used with - confirm it is intended.
        max_target_length: truncation length for the SQL query (default 512).
    """

    def __init__(self, data, schema_dict, tokenizer, max_input_length=2048, max_target_length=512):
        self.data = data
        self.schema_dict = schema_dict
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

        # Positions in `data` of the samples that pass `is_valid_sample`.
        self.valid_indices = [idx for idx, sample in enumerate(self.data) if self.is_valid_sample(sample)]

    def __len__(self):
        """Return the number of valid samples."""
        return len(self.valid_indices)

    def __getitem__(self, idx):
        """Return the tokenized prompt and target of the ``idx``-th valid sample."""
        # Translate the dataset index into an index of the underlying data.
        sample = self.data[self.valid_indices[idx]]
        schema = self.schema_dict.get(sample['db_id'])

        input_text = f"translate SQL: {sample['question']} into SQL with context schema: {schema}"
        target_text = sample['query']

        input_ids = self.tokenizer.encode(
            input_text, return_tensors='pt', max_length=self.max_input_length, truncation=True
        )
        target_ids = self.tokenizer.encode(
            target_text, return_tensors='pt', max_length=self.max_target_length, truncation=True
        )

        # squeeze(0) removes only the leading batch dimension. A bare squeeze()
        # would also collapse a one-token sequence to a 0-d tensor, which
        # cannot be padded into a batch.
        return {
            'input_ids': input_ids.squeeze(0),
            'labels': target_ids.squeeze(0),
        }

    def is_valid_sample(self, sample):
        """Return True when the sample has a known schema, a question and a query."""
        schema = self.schema_dict.get(sample['db_id'])
        question = sample['question']
        query = sample['query']
        return schema is not None and question is not None and query is not None



In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained t5-small checkpoint onto the chosen device, together
# with its tokenizer.
model = T5ForConditionalGeneration.from_pretrained('t5-small')
model = model.to(device)
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Adam optimizer over all model parameters.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

# Wrap the training and validation examples in the custom dataset class.
train_dataset = SQLDatasetWithSchema(data=train_data, schema_dict=schema_dict, tokenizer=tokenizer)
val_dataset = SQLDatasetWithSchema(data=val_data, schema_dict=schema_dict, tokenizer=tokenizer)

def collate_fn(batch):
    """Pad a list of dataset items into batch-first tensors.

    Args:
        batch: list of dicts, each with a 1-D ``'input_ids'`` tensor and
            optionally a 1-D ``'labels'`` tensor.

    Returns:
        dict with
        ``'input_ids'``: ``(batch, max_input_len)`` tensor padded with the
        tokenizer's pad id;
        ``'attention_mask'``: same shape, 1 for real tokens and 0 for padding;
        ``'labels'``: ``(batch, max_label_len)`` tensor padded with -100, only
        present when the first item has a ``'labels'`` key.
    """
    pad_id = tokenizer.pad_token_id

    # batch_first=True yields (batch, seq); the pad_sequence default is
    # (seq, batch), which the model would misread as seq-many samples.
    input_ids = pad_sequence(
        [item['input_ids'] for item in batch], batch_first=True, padding_value=pad_id
    )
    collated = {
        'input_ids': input_ids,
        # Mask padded positions so the encoder does not attend to them.
        'attention_mask': (input_ids != pad_id).long(),
    }

    if 'labels' in batch[0]:
        # -100 is the index the cross-entropy loss ignores, so padded label
        # positions do not contribute to the training loss.
        collated['labels'] = pad_sequence(
            [item['labels'] for item in batch], batch_first=True, padding_value=-100
        )

    return collated

# Batch both splits with the shared collate function; only the training
# split is shuffled.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=8,
    collate_fn=collate_fn,
)



OutOfMemoryError: ignored

In [None]:
# Training configuration.
# The recorded run of this cell ended in an OutOfMemoryError, so the per-device
# batch is reduced from 16 to 4 and gradient accumulation raised from 4 to 16.
# The effective batch size stays at 4 * 16 = 64, so the number of optimizer
# steps (and thus the meaning of save_steps / eval_steps) is unchanged.
training_args = TrainingArguments(
    output_dir='./text_to_sql_model_with_schema',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=16,
    num_train_epochs=2,
    save_steps=1000,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_dir='./logs',
    logging_steps=100,
    logging_first_step=True,
    load_best_model_at_end=True,  # Reload the best checkpoint when training finishes
)

# evaluation_strategy="steps" and load_best_model_at_end=True both need an
# evaluation set, so the validation dataset is passed explicitly.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,  # Pads each batch of dataset items
)

# Train the model
trainer.train()

# Save the trained model and its tokenizer side by side so both can be
# reloaded from the same directory.
model.save_pretrained('text_to_sql_model_with_schema')
tokenizer.save_pretrained('text_to_sql_model_with_schema')


OutOfMemoryError: ignored