In [None]:
!pip install transformers
!pip install pinecone-client
!pip install sentence_transformers
!pip install sentencepiece
!pip install keras-nlp -q
!pip install tensorflow --upgrade
!pip install torch
!pip install pandas 

In [None]:
import tensorflow as tf
from transformers import T5Tokenizer, T5ForConditionalGeneration, TFAutoModelForQuestionAnswering
from transformers import pipeline, set_seed
import pandas as pd
from tensorflow import keras

In [None]:
# General hyperparameters
# Number of examples per training batch (passed to the DataLoader).
BATCH_SIZE = 32
# NOTE(review): not referenced anywhere in the visible notebook — confirm whether it is still needed.
NUM_BATCHES = 500
EPOCHS = 1   # Can be set to a higher value for better results
# Token budget used when tokenizing model inputs and targets (padding/truncation length).
MAX_SEQUENCE_LENGTH = 128
# Upper bound on the length of a generated answer (max_length given to generate()).
MAX_GENERATION_LENGTH = 200

In [None]:
# Mount Google Drive at /content/drive; the dialogue CSV is read from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Question/answer pairs stored on the mounted Google Drive.
csv_file_path = "/content/drive/My Drive/dialogue_pairs.csv"
ubuntu_dialogues = pd.read_csv(csv_file_path)

# Fail fast if the expected columns are absent, instead of discovering it
# part-way through the slow per-row embedding/retrieval loop further down.
missing_columns = {"Question", "Answer"} - set(ubuntu_dialogues.columns)
if missing_columns:
    raise ValueError(
        f"{csv_file_path} is missing required column(s): {sorted(missing_columns)}"
    )

In [None]:
import os
from getpass import getpass

import pinecone

# Connect to the vector database.
# The API key is a secret and must never be stored in the notebook: it is read
# from the PINECONE_API_KEY environment variable, with an interactive prompt as
# a fallback (the prompt does not echo the value).
# NOTE: the key that used to be hard-coded in this cell has been exposed and
# should be revoked in the Pinecone console.
PINECONE_INDEX_NAME = "ubuntu-ir-jina"
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
pinecone.init(
    api_key=pinecone_api_key,
    environment=os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter"),
)

# Check that the index exists so a wrong project/environment is reported here
# rather than as an opaque error on the first query.
available_indexes = pinecone.list_indexes()
if PINECONE_INDEX_NAME not in available_indexes:
    raise RuntimeError(
        f"Pinecone index {PINECONE_INDEX_NAME!r} not found; "
        f"available indexes: {list(available_indexes)}"
    )
index = pinecone.Index(PINECONE_INDEX_NAME)

In [None]:
from transformers import AutoModel
from numpy.linalg import norm


def cos_sim(a, b):
    """Return the cosine similarity of ``a`` and ``b``.

    Computed as the matrix product of ``a`` with the transpose of ``b``,
    divided by the product of their norms (``numpy.linalg.norm`` with its
    default arguments).
    """
    similarity = a @ b.T
    scale = norm(a) * norm(b)
    return similarity / scale


# Load pre-trained model for generation of word embeddings
# Embedding model used to turn questions into query vectors (its encode() method is called in later cells).
# NOTE(review): trust_remote_code=True allows Python code shipped with the checkpoint to run locally —
# confirm the repository is trusted, and consider pinning a specific revision.
embedding_model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True) # trust_remote_code is needed to use the encode method

In [None]:
def retrieve_context(question):
    """Return the text of the stored passage most similar to ``question``.

    The question is embedded with ``embedding_model`` and the single nearest
    neighbour is requested from the Pinecone ``index``. The passage text is
    read from the match's ``metadata['text']`` field.

    Returns:
        The passage as a ``str``, or an empty string when the index returns
        no match (so callers always receive a string).
    """
    # encode() is given a one-element batch; row 0 is taken so that Pinecone
    # receives a flat list of floats instead of a nested [[...]] list.
    # (assumes encode() returns one row per input text — TODO confirm)
    query_vector = embedding_model.encode([question])[0].tolist()
    result = index.query(vector=query_vector, top_k=1, include_metadata=True)
    matches = result['matches']
    if not matches:
        return ""
    return str(matches[0]['metadata']['text'])


# Retrieve the topmost similar passage for every question in the dataset.
context_list = [retrieve_context(question) for question in ubuntu_dialogues['Question']]

ubuntu_dialogues['Context'] = context_list

In [None]:
# Initialize the T5 tokenizer and model
# Both are loaded from the same checkpoint name so the tokenizer vocabulary matches the model.
model_name = "t5-small"  # You can choose a different T5 variant
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch
#Preprocessing of input to model 
class QADataSet(Dataset):
    """Torch dataset that tokenizes question/context/answer rows for T5.

    Each row of ``dataframe`` must provide the columns ``Question``,
    ``Context`` and ``Answer``. The model input is built with the same
    ``"question: ... context: ..."`` prompt that the inference cell uses, so
    the model is trained on the format it is later queried with.

    Args:
        dataframe: pandas DataFrame holding the rows described above.
        tokenizer: tokenizer exposing ``encode_plus`` (e.g. ``T5Tokenizer``).
        max_length: padding/truncation length for inputs and targets.
            Defaults to the notebook-wide ``MAX_SEQUENCE_LENGTH``.
    """

    def __init__(self, dataframe, tokenizer, max_length=None):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        # The global default is resolved here, not in the signature, so it is
        # only looked up when no explicit length is supplied.
        self.max_length = MAX_SEQUENCE_LENGTH if max_length is None else max_length

    def __len__(self):
        """Return the number of rows (examples) in the dataset."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return it as a dict of ``torch.long`` tensors.

        Keys: ``input_ids`` / ``attention_mask`` for the prompt, and
        ``decoder_input_ids`` / ``decoder_attention_mask`` for the tokenized
        answer (the key names are kept for compatibility with the training loop).
        """
        row = self.dataframe.iloc[index]
        # Same prompt layout as at inference time.
        source_text = f"question: {row['Question']} context: {row['Context']}"
        inputs = self.tokenizer.encode_plus(
            source_text, padding='max_length', truncation=True, max_length=self.max_length)
        targets = self.tokenizer.encode_plus(
            row['Answer'], padding='max_length', truncation=True, max_length=self.max_length)
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'decoder_input_ids': torch.tensor(targets['input_ids'], dtype=torch.long),
            'decoder_attention_mask': torch.tensor(targets['attention_mask'], dtype=torch.long),
        }

# Wrap the dialogue frame in the dataset and serve it in shuffled batches of BATCH_SIZE examples.
dataset = QADataSet(ubuntu_dialogues, tokenizer)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
import torch.optim as optim
from torch.nn import CrossEntropyLoss
# Fine-tune T5 to map the tokenized prompt to the tokenized answer.
optimizer = optim.Adam(model.parameters(), lr=5e-5)
# Padding positions in the targets are excluded from the loss.
criterion = CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Batches are moved to whatever device the model currently lives on.
device = model.device

# from_pretrained() hands the model back in eval mode; switch to train mode so
# dropout is active while fine-tuning.
model.train()
for epoch in range(EPOCHS):
    running_loss = 0.0
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # The dataset stores the tokenized answer under 'decoder_input_ids';
        # here it is the prediction target.
        labels = batch['decoder_input_ids'].to(device)

        # Only `labels` is passed: the model derives the decoder inputs by
        # shifting the labels one step to the right. Feeding the unshifted
        # targets as decoder_input_ids would show the decoder the very token
        # it is asked to predict at each position.
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels)
        loss = criterion(outputs.logits.reshape(-1, outputs.logits.size(-1)),
                         labels.reshape(-1))

        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Guard against an empty dataloader (would otherwise divide by zero).
    epoch_loss = running_loss / max(len(dataloader), 1)
    print(f'Epoch {epoch+1} Loss: {epoch_loss:.4f}')

# Back to eval mode so later generation runs without dropout.
model.eval()

In [None]:
# Test the fine-tuned model with a question
question = "How do I SSH into an external server?"

# Retrieve the most similar stored passage to use as context.
# encode() is given a one-element batch; row 0 is the flat vector sent to Pinecone.
# (assumes encode() returns one row per input text — TODO confirm)
embedding = embedding_model.encode([question])[0].tolist()
result = index.query(
    vector=embedding,
    top_k=1,
    include_metadata=True
)
matches = result['matches']
# Fall back to an empty context when the index returns no match.
context = str(matches[0]['metadata']['text']) if matches else ""
input_text = f"question: {question} context: {context}"

# truncation=True is needed for max_length to take effect; without it a long
# retrieved context is passed through at full length.
encoded_inputs = tokenizer(
    input_text,
    return_tensors="pt",
    max_length=MAX_SEQUENCE_LENGTH,
    padding=True,
    truncation=True,
).to(model.device)

model.eval()  # make sure dropout is disabled while generating
answer = model.generate(
    input_ids=encoded_inputs['input_ids'],
    attention_mask=encoded_inputs['attention_mask'],
    num_beams=4,
    max_length=MAX_GENERATION_LENGTH,
    early_stopping=True,
)

# Decode the answer
generated_answer = tokenizer.decode(answer[0], skip_special_tokens=True)

print(f"Generated Answer: {generated_answer}")

In [None]:
# Save the fine-tuned model
# T5ForConditionalGeneration is a PyTorch model and has no Keras-style
# save_weights(); save_pretrained() writes the weights and config to a
# directory that from_pretrained() can load again. The tokenizer is saved
# alongside so the directory is self-contained.
model.save_pretrained('fine_tuned_t5_weights')
tokenizer.save_pretrained('fine_tuned_t5_weights')