In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics import accuracy_score, f1_score
import logging

# 1. Prepare the dataset with additional user_data
data = {
    "question": [
        "What is the user's full name?",
        "When was the user born?",
        "What is the user's gender?",
        "What is the user's current education qualification?",
        "Which college does the user study in?",
        "When will the user graduate?",
        "What programming languages does the user know?",
        "Which frameworks and libraries is the user familiar with?",
        "Which tools and platforms does the user use?",
        "What soft skills does the user have?",
        "What are the user's key projects?",
        "Where can I find the user's GitHub profile?",
        "Where can I find the user's LinkedIn profile?",
        "What certifications does the user have?",
        "What are the user's research interests?",
        "What is the user's contact email?",
        "What competitions has the user participated in?",
        "What online courses has the user completed?",
        "What internships has the user done?"
    ],
    "sql_query": [
        "SELECT full_name FROM users WHERE id = 'Prem Kumar G';",
        "SELECT dob FROM users WHERE id = 'Prem Kumar G';",
        "SELECT gender FROM users WHERE id = 'Prem Kumar G';",
        "SELECT degree FROM users WHERE id = 'Prem Kumar G';",
        "SELECT college FROM users WHERE id = 'Prem Kumar G';",
        "SELECT graduation_year FROM users WHERE id = 'Prem Kumar G';",
        "SELECT programming_languages FROM skills WHERE user_id = 'Prem Kumar G';",
        "SELECT frameworks_libraries FROM skills WHERE user_id = 'Prem Kumar G';",
        "SELECT tools_platforms FROM skills WHERE user_id = 'Prem Kumar G';",
        "SELECT soft_skills FROM skills WHERE user_id = 'Prem Kumar G';",
        "SELECT project_name FROM projects WHERE user_id = 'Prem Kumar G';",
        "SELECT github_link FROM profiles WHERE user_id = 'Prem Kumar G';",
        "SELECT linkedin_link FROM profiles WHERE user_id = 'Prem Kumar G';",
        "SELECT certifications FROM achievements WHERE user_id = 'Prem Kumar G';",
        "SELECT research_interests FROM users WHERE id = 'Prem Kumar G';",
        "SELECT email FROM contacts WHERE user_id = 'Prem Kumar G';",
        "SELECT competitions FROM achievements WHERE user_id = 'Prem Kumar G';",
        "SELECT courses FROM education WHERE user_id = 'Prem Kumar G';",
        "SELECT internships FROM experience WHERE user_id = 'Prem Kumar G';"
    ],
    "user_data": [
        {"id": "Prem Kumar G", "full_name": "Prem Kumar G", "dob": "2000-01-01", "gender": "Male", "degree": "B.Tech", "college": "XYZ University", "graduation_year": "2022", "programming_languages": ["Python", "Java"], "frameworks_libraries": ["TensorFlow", "PyTorch"], "tools_platforms": ["GitHub", "VS Code"], "soft_skills": ["Teamwork", "Communication"], "projects": ["Project A", "Project B"], "github_link": "https://github.com/premkumar", "linkedin_link": "https://linkedin.com/in/premkumar", "certifications": ["Machine Learning - Coursera"], "research_interests": ["Natural Language Processing", "Deep Learning"], "email": "premkumar@example.com", "competitions": ["Hackathon 2021"], "courses": ["Deep Learning"], "internships": ["Data Science Internship"]},
        # Repeat the user data as needed
    ]
}

# Ensure the user_data list matches the length of the question list
num_questions = len(data["question"])
data["user_data"] = data["user_data"] * (num_questions // len(data["user_data"]))  # Repeat user_data to match question length

# Load dataset into pandas DataFrame
df = pd.DataFrame(data)

# 2. Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['question'], df['sql_query'], test_size=0.2, random_state=42)

# 3. Load Pretrained T5 Model and Tokenizer
model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Function to generate SQL query from user query
def generate_sql_query(question):
    model.eval()
    input_tokens = tokenizer(question, return_tensors='pt', padding=True, truncation=True)
    output = model.generate(**input_tokens)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Custom dataset class
class SQLDataset(Dataset):
    def __init__(self, questions, queries, user_data, tokenizer, max_length=128):
        self.questions = questions
        self.queries = queries
        self.user_data = user_data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions.iloc[idx]
        sql_query = self.queries.iloc[idx]
        user_info = self.user_data[idx]
        input_tokens = self.tokenizer(question, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
        target_tokens = self.tokenizer(sql_query, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)

        return {
            'input_ids': input_tokens['input_ids'].squeeze(),
            'attention_mask': input_tokens['attention_mask'].squeeze(),
            'labels': target_tokens['input_ids'].squeeze(),
            'user_info': user_info  # Include user-specific data
        }

# Create DataLoader
train_dataset = SQLDataset(X_train, y_train, data['user_data'], tokenizer)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# 4. Setup Logging
logging.basicConfig(level=logging.INFO)

# 5. Train the model with checkpointing
def train_model():
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    epochs = 3
    best_loss = float('inf')

    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        avg_loss = total_loss / len(train_loader)
        logging.info(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

        # Save model checkpoint if it's the best so far
        if avg_loss < best_loss:
            best_loss = avg_loss
            model.save_pretrained("t5_sql_parser_best")
            logging.info(f"Saved model at epoch {epoch + 1} with loss {best_loss:.4f}")

# Train the model
train_model()

# 6. Evaluate the model with multiple metrics
def evaluate_model():
    model.eval()
    predictions = []

    with torch.no_grad():
        for i in range(len(X_test)):
            question = X_test.iloc[i]
            input_tokens = tokenizer(question, return_tensors='pt', padding=True, truncation=True)

            # Generate SQL query
            output = model.generate(**input_tokens)
            decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
            predictions.append(decoded_output)

    # BLEU score evaluation
    bleu_scores = []
    for pred, true in zip(predictions, y_test):
        bleu_scores.append(sentence_bleu([true.split()], pred.split()))

    avg_bleu_score = sum(bleu_scores) / len(bleu_scores)
    logging.info(f"Average BLEU score: {avg_bleu_score:.2f}")

    # Accuracy and F1 score evaluation
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average='weighted')
    logging.info(f"Accuracy: {accuracy:.2f}, F1 Score: {f1:.2f}")

    return predictions

# Get the predictions
evaluate_model()

# Example usage of query function
user_query = "What programming languages does the user know?"
print("Generated SQL Query:""What is the user's full name?", generate_sql_query(user_query))

# 7. Function to respond with full details
def respond_to_query(query):
    """Respond with the user's full details based on the query."""
    # Define a mapping of keywords in the query to user data fields
    query_to_field_map = {
        "full name": "full_name",
        "born": "dob",
        "gender": "gender",
        "degree": "degree",
        "college": "college",
        "graduation": "graduation_year",
        "programming languages": "programming_languages",
        "frameworks": "frameworks_libraries",
        "tools": "tools_platforms",
        "soft skills": "soft_skills",
        "projects": "projects",
        "github": "github_link",
        "linkedin": "linkedin_link",
        "certifications": "certifications",
        "research interests": "research_interests",
        "email": "email",
        "competitions": "competitions",
        "courses": "courses",
        "internships": "internships"
    }

    for keyword, field in query_to_field_map.items():
        if keyword.lower() in query.lower():
            # Return the corresponding user data field value
            return data["user_data"][0].get(field, "Not available.")

    return "Sorry, I couldn't find a response to that query."

# Example usage of the respond_to_query function
query = "What is the user's full name?"
print("Response:", respond_to_query(query))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Generated SQL Query:What is the user's full name? 
Response: Prem Kumar G
