In [None]:
!pip install keras-nlp -q
!pip install transformers
!pip install tensorflow --upgrade
!pip install sentence_transformers
!pip install pinecone-client
!pip install transformers

In [None]:
import keras_nlp
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_datasets as tfds
import time
import pandas as pd
from transformers import GPT2Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow import keras

# Make "mixed_float16" the global Keras dtype policy; it applies to layers
# created after this point.
policy = keras.mixed_precision.Policy("mixed_float16")
keras.mixed_precision.set_global_policy(policy)

In [None]:
import getpass
import os

import pinecone

# Connect to the vector database.
# The API key must not be committed with the notebook: read it from the
# PINECONE_API_KEY environment variable, or prompt for it when it is not set.
# Any key that was previously published in this file should be revoked.
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass.getpass("Pinecone API key: ")
pinecone.init(api_key=pinecone_api_key, environment="gcp-starter")
pinecone.list_indexes()
index = pinecone.Index("ubuntu-ir-jina")

In [None]:
from transformers import AutoModel
from numpy.linalg import norm
# Cosine-similarity helper plus the pre-trained model used to embed text.
def cos_sim(a, b):
    """Return `a @ b.T` divided by the product of the norms of `a` and `b`."""
    return (a @ b.T) / (norm(a) * norm(b))


# trust_remote_code is needed to use the encode method
model = AutoModel.from_pretrained(
    'jinaai/jina-embeddings-v2-base-en',
    trust_remote_code=True,
)

In [None]:
# General hyperparameters
BATCH_SIZE = 32
NUM_BATCHES = 500
EPOCHS = 1  # Can be set to a higher value for better results
MAX_SEQUENCE_LENGTH = 128
MAX_GENERATION_LENGTH = 200

# Name of the keras-nlp GPT-2 preset to load.
GPT2_PRESET = "gpt2_base_en"

# LoRA-specific hyperparameters
RANK = 4  # Rank of the low-rank update matrices.
ALPHA = 32.0  # Scaling numerator; the LoRA branch is scaled by ALPHA / RANK.

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset CSV is read from there.
drive.mount('/content/drive')

In [None]:
# Read the dialogue-pairs CSV from the mounted Drive into a DataFrame.
csv_file_path = "/content/drive/My Drive/dialogue_pairs.csv"
ubuntu_dialogues = pd.read_csv(csv_file_path)

In [None]:
# Number of dialogue pairs used to build the fine-tuning set.
NUM_DIALOGUE_PAIRS = 1000

# Filter out rows with missing values, then keep the first NUM_DIALOGUE_PAIRS pairs.
ubuntu_dialogues = ubuntu_dialogues.dropna(subset=["Question", "Answer"])
ubuntu_dialogues = ubuntu_dialogues.head(NUM_DIALOGUE_PAIRS)

text_list = []
# For every pair, retrieve the single closest stored passage from the vector DB
# and build one "Context / Question / Answer" training string.
for _, row in ubuntu_dialogues.iterrows():
    question = row['Question']
    answer = row['Answer']
    # encode() receives a one-element batch; take its only row so the query
    # vector is a flat list of floats rather than a nested [[...]] list.
    embedding = model.encode([question])[0].tolist()
    result = index.query(
        vector=embedding,
        top_k=1,
        include_metadata=True,
    )
    matches = result['matches']
    # An empty result would raise IndexError on matches[0]; fall back to an
    # empty context so the pair is still kept.
    context = str(matches[0]['metadata']['text']) if matches else ""
    text_list.append(f"Context: {context} Question: {str(question)} Answer: {str(answer)}")


In [None]:
# Obtain the model's response to a query.
def generate_text(model, input_text, max_length=200):
    """Generate a completion for `input_text`, print it with timing, and return it.

    Args:
        model: Object exposing `generate(text, max_length=...)`.
        input_text: Prompt passed to `model.generate`.
        max_length: Forwarded to `model.generate` as `max_length`.

    Returns:
        The value returned by `model.generate`.
    """
    start = time.time()

    output = model.generate(input_text, max_length=max_length)
    print("\nOutput:")
    print(output)

    end = time.time()
    print(f"Total Time Elapsed: {end - start:.2f}s")

    # Callers keep the result (and write it to a file), so it has to be
    # returned; previously the function implicitly returned None.
    return output


In [None]:
def get_optimizer_and_loss():
    """Create the optimizer and loss used for fine-tuning.

    Returns:
        tuple: `(optimizer, loss)` where `optimizer` is a
        `keras.optimizers.AdamW` instance and `loss` is
        `keras.losses.SparseCategoricalCrossentropy(from_logits=True)`.
    """
    optimizer = keras.optimizers.AdamW(
        learning_rate=5e-5,
        weight_decay=0.01,
        epsilon=1e-6,
        global_clipnorm=1.0,  # Gradient clipping.
    )
    # Exclude layernorm and bias terms from weight decay.
    # All patterns are passed in ONE call: every call to
    # exclude_from_weight_decay replaces the previously configured exclusions,
    # so separate calls would leave only the last pattern active.
    optimizer.exclude_from_weight_decay(var_names=["bias", "gamma", "beta"])

    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    return optimizer, loss

In [None]:
import math


class LoraLayer(keras.layers.Layer):
    """Wraps a frozen dense projection and adds a low-rank (LoRA) branch to it.

    The wrapped `original_layer` is always frozen. Two extra layers are created:
    `A` (a bias-free `Dense` with `rank` units) and `B` (an `EinsumDense` built
    with the original layer's `equation` and `output_shape`, initialised to
    zeros). While the layer is trainable, its output is
    `original_layer(x) + B(A(x)) * (alpha / rank)`.

    Args:
        original_layer: Layer to wrap. Its config must provide `name`,
            `equation` and `output_shape` (all three are read in `__init__`).
        rank: Number of units of `A`, i.e. the rank of the update.
        alpha: Scaling numerator; the LoRA branch is multiplied by `alpha / rank`.
        trainable: Whether the LoRA branch is trainable and added in `call`.
        **kwargs: Forwarded to `keras.layers.Layer`; any `name` is discarded
            because the original layer's name is reused.
    """

    def __init__(
        self,
        original_layer,
        rank=8,
        alpha=32,
        trainable=False,
        **kwargs,
    ):
        # We want to keep the name of this layer the same as the original
        # dense layer.
        original_layer_config = original_layer.get_config()
        name = original_layer_config["name"]

        kwargs.pop("name", None)

        super().__init__(name=name, trainable=trainable, **kwargs)

        self.rank = rank
        self.alpha = alpha

        self._scale = alpha / rank

        # Derived from the last two entries of the original `output_shape`.
        self._num_heads = original_layer_config["output_shape"][-2]
        self._hidden_dim = self._num_heads * original_layer_config["output_shape"][-1]

        # Layers.

        # Original dense layer.
        self.original_layer = original_layer
        # No matter whether we are training the model or are in inference mode,
        # this layer should be frozen.
        self.original_layer.trainable = False

        # LoRA dense layers.
        self.A = keras.layers.Dense(
            units=rank,
            use_bias=False,
            # Note: the original paper mentions that normal distribution was
            # used for initialization. However, the official LoRA implementation
            # uses "Kaiming/He Initialization".
            kernel_initializer=keras.initializers.VarianceScaling(
                scale=math.sqrt(5), mode="fan_in", distribution="uniform"
            ),
            trainable=trainable,
            name="lora_A",
        )
        # B has the same `equation` and `output_shape` as the original layer.
        # `equation = abc,cde->abde`, where `a`: batch size, `b`: sequence
        # length, `c`: `hidden_dim`, `d`: `num_heads`,
        # `e`: `hidden_dim//num_heads`. The only difference is that in layer `B`,
        # `c` represents `rank`.
        # B starts at zero, so the LoRA branch contributes nothing until trained.
        self.B = keras.layers.EinsumDense(
            equation=original_layer_config["equation"],
            output_shape=original_layer_config["output_shape"],
            kernel_initializer="zeros",
            trainable=trainable,
            name="lora_B",
        )

    def call(self, inputs):
        """Return the original layer's output, plus the scaled LoRA branch when trainable."""
        original_output = self.original_layer(inputs)
        if self.trainable:
            # If we are fine-tuning the model, we will add LoRA layers' output
            # to the original layer's output.
            lora_output = self.B(self.A(inputs)) * self._scale
            return original_output + lora_output

        # If we are in inference mode, we "merge" the LoRA layers' weights into
        # the original layer's weights - more on this in the text generation
        # section!
        return original_output


In [None]:
# This resets "peak" memory usage to "current" memory usage.
# reset_memory_stats raises if the named device does not exist, so only call
# it when a GPU is actually visible to TensorFlow.
if tf.config.list_physical_devices("GPU"):
    tf.config.experimental.reset_memory_stats("GPU:0")

# Load the original model, using the preset name and sequence length defined
# with the other hyperparameters instead of repeating the literals here.
preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset(
    GPT2_PRESET,
    sequence_length=MAX_SEQUENCE_LENGTH,
)
lora_model = keras_nlp.models.GPT2CausalLM.from_preset(
    GPT2_PRESET,
    preprocessor=preprocessor,
)

In [None]:
for layer_idx in range(lora_model.backbone.num_layers):
    # Look up this decoder block's self-attention layer.
    decoder_layer = lora_model.backbone.get_layer(f"transformer_layer_{layer_idx}")
    self_attention_layer = decoder_layer._self_attention_layer

    # Replace the query and value dense layers with LoRA-wrapped versions.
    for attr_name in ("_query_dense", "_value_dense"):
        dense_layer = getattr(self_attention_layer, attr_name)
        # Skip projections that are already wrapped so that re-running this
        # cell does not try to wrap a LoraLayer inside another LoraLayer.
        if isinstance(dense_layer, LoraLayer):
            continue
        setattr(
            self_attention_layer,
            attr_name,
            LoraLayer(
                dense_layer,
                rank=RANK,
                alpha=ALPHA,
                trainable=True,
            ),
        )

In [None]:
# Run one forward pass on a sample sentence (presumably so that all layers,
# including the newly added LoRA ones, get built). The trailing `;` keeps the
# cell from displaying the returned tensor.
sample_inputs = preprocessor(["LoRA is very useful for quick LLM finetuning"])[0]
lora_model(sample_inputs);

In [None]:
# Walk every layer of the model; for leaf layers (those whose flattened layer
# list contains only themselves) enable training only for the LoRA adapters.
for layer in lora_model._flatten_layers():
    is_leaf = len(list(layer._flatten_layers())) == 1
    if not is_leaf:
        continue
    layer.trainable = layer.name in ("lora_A", "lora_B")

In [None]:
# Print the model summary to inspect the layers and parameter counts.
lora_model.summary()

In [None]:
# Build the optimizer/loss pair and compile the model with them, tracking
# accuracy as a weighted metric.
optimizer, loss = get_optimizer_and_loss()
lora_model.compile(optimizer=optimizer, loss=loss, weighted_metrics=["accuracy"])

In [None]:
# Fine-tune on the retrieval-augmented training strings. The epoch count and
# batch size come from the hyperparameter cell rather than being repeated here.
lora_model.fit(
    text_list,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

In [None]:
for layer_idx in range(lora_model.backbone.num_layers):
    self_attention_layer = lora_model.backbone.get_layer(
        f"transformer_layer_{layer_idx}"
    )._self_attention_layer

    # Merge the query and value LoRA adapters into their frozen dense kernels.
    for lora_layer in (
        self_attention_layer._query_dense,
        self_attention_layer._value_dense,
    ):
        A_weights = lora_layer.A.kernel  # (hidden_dim, RANK) (a, b)
        B_weights = lora_layer.B.kernel  # (RANK, num_heads, head_dim) (b, c, d)
        increment_weights = tf.einsum("ab,bcd->acd", A_weights, B_weights) * (ALPHA / RANK)
        lora_layer.original_layer.kernel.assign_add(increment_weights)

        # LoraLayer.call still adds B(A(x)) * scale while the layer is
        # trainable, which would apply the update a second time on top of the
        # merged kernel. Zeroing B makes that branch contribute nothing and
        # also makes re-running this cell a no-op.
        lora_layer.B.kernel.assign(tf.zeros_like(B_weights))

In [None]:
# Evaluation questions.
questions = [
    'How do I upgrade an ubuntu server?',
    'How do I ssh into an external server?',
    'How do I create a new user account?',
    'How do I copy files over into an external server?',
    'How do I reset a forgotten Ubuntu account password?',
    'How do I install software on Ubuntu?',
    'How do I check the hardware specifications of a server?',
    'how can I delete a non-empty directory?',
    'How do I edit the /etc/fstab file?',
    'How do I edit a read-only file in Ubuntu?',
    'How can I rename a file in the terminal?',
    'How can I synchronize the time in Ubuntu?',
    'How do I configure Samba as a file server?',
    'What is the difference between the "mv" and "cp" commands?',
    'How do I install Nvidia drivers on Ubuntu?',
]
# Prompts of the form "Context: ... Question: ... Answer:" for each question.
queries = []

for question in questions:
    # encode() receives a one-element batch; take its only row so the query
    # vector is a flat list of floats rather than a nested [[...]] list.
    embedding = model.encode([question])[0].tolist()
    result = index.query(
        vector=embedding,
        top_k=1,
        include_metadata=True,
    )
    matches = result['matches']
    # An empty result would raise IndexError on matches[0]; fall back to an
    # empty context instead.
    context = str(matches[0]['metadata']['text']) if matches else ""
    queries.append(f"Context: {context} Question: {str(question)} Answer:")

In [None]:
# Collect each bare question (no retrieved context) followed by the value
# returned by generate_text for it.
generated_texts = []
for question in questions:
    generated_text = generate_text(lora_model, question)
    generated_texts += [question, generated_text]

# Write every non-empty entry on its own line.
with open('generated_texts.txt', 'w') as file:
    file.writelines(text + '\n' for text in generated_texts if text)

# Trigger a browser download of the file from Google Colab.
from google.colab import files
files.download('generated_texts.txt')

In [None]:
# Create a list to store generated texts
generated_texts = []

# Loop through each prompt (context + question), generate the text, and append
# the question and the generated text to the list.
for query in queries:
    generated_text = generate_text(lora_model, query)
    # The retrieved context may itself contain "Question:" or "Answer:", so
    # split from the right: the last occurrences are the ones added by the
    # prompt template.
    question_part = query.rsplit('Question:', 1)[1].rsplit('Answer:', 1)[0]
    generated_texts.append(question_part)
    generated_texts.append(generated_text)

# Save the generated texts to a text file
with open('generated_texts_context.txt', 'w') as file:
    for text in generated_texts:
        if text:
            file.write(text + '\n')

# Download the text file from Google Colab
from google.colab import files
files.download('generated_texts_context.txt')

In [None]:
# Model.save() requires a destination path; calling it without one raises a
# TypeError.
# NOTE(review): LoraLayer does not define get_config(); confirm that the saved
# file can be reloaded as expected, or switch to save_weights() if it cannot.
lora_model.save("lora_gpt2_ubuntu.keras")