In [None]:
import numpy as np
import pandas as pd
import nltk

In [None]:
#Downloading the Gutenberg corpus for use in the initial, barebones pipeline
#Only 18 texts
# Fetch the Gutenberg corpus through NLTK's downloader, then read every
# file in it into memory as one raw string per file id.
nltk.download('gutenberg')
from nltk.corpus import gutenberg

files = gutenberg.fileids()
texts = list(map(gutenberg.raw, files))


In [None]:
# Sanity check: number of corpus files, character count of the first text,
# then a short preview of how that text begins.
print(len(files), len(texts[0]), sep="\n")
print(texts[0][:100])

In [None]:
#Chunking the texts, initially we will do from scratch

#1. First tokenize
def tokenize(text):
    """Split ``text`` into tokens on runs of whitespace.

    Punctuation stays attached to the neighbouring word. An empty or
    whitespace-only string yields an empty list.
    """
    tokens = text.split()
    return tokens

# Flatten every text into one corpus-wide token stream (file boundaries are
# not preserved, so chunks built from it may straddle two texts).
tokenized = []
for text in texts:
    tokenized.extend(tokenize(text))

#print(len(tokenized))
#print(len(tokenized[0]))
#print(tokenized[0][:10])


#2. Now we chunk the tokens
def chunk(tokens, size, overlap): #Overlap to deal with boundary problem
    """Split ``tokens`` into windows of ``size`` items sharing ``overlap`` items.

    Consecutive windows start ``size - overlap`` positions apart. The last
    window may be shorter than ``size`` so that the tail of the input is
    always covered; an input shorter than ``size`` yields a single window.

    Args:
        tokens: Sliceable sequence of tokens.
        size: Maximum number of tokens per window; must be positive.
        overlap: Number of tokens shared by neighbouring windows; must be
            smaller than ``size``.

    Returns:
        A list of slices of ``tokens`` (empty when ``tokens`` is empty).

    Raises:
        ValueError: If ``size`` is not positive or ``overlap >= size``
            (the window would never advance).
    """
    stride = size - overlap
    if size <= 0 or stride <= 0:
        raise ValueError("size must be positive and overlap must be smaller than size")

    chunks = []
    for start in range(0, len(tokens), stride):
        chunks.append(tokens[start:start + size])
        # Once a window reaches the end of the input, any further window
        # would only repeat tokens that are already covered.
        if start + size >= len(tokens):
            break

    return chunks

# 300-token windows; neighbouring windows share 50 tokens.
# NOTE(review): the all-MiniLM-L6-v2 model card reports truncation beyond
# 256 word pieces, so 300-word chunks are presumably cut when embedded - confirm.
chunked = chunk(tokenized, size=300, overlap=50)

print(len(chunked))
print(len(chunked[0]))

In [None]:
#Create embeddings from the chunks
from sentence_transformers import SentenceTransformer
import tqdm

# Keep the sentence encoder under its own name: later cells rebind `model`
# to the causal LM, which would otherwise break every embed() call made
# after that point.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
model = embedding_model  # legacy alias for the original variable name

def embed(chunks, batch_size=32):
    """Encode chunks into unit-normalised sentence embeddings.

    Args:
        chunks: Iterable whose items are either token sequences (joined with
            single spaces before encoding) or ready-made strings (encoded
            as-is, rather than being split into characters by the join).
        batch_size: Number of texts encoded per forward pass.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns with
        ``convert_to_numpy=True`` and ``normalize_embeddings=True``: one
        embedding row per input chunk.
    """
    passages = [
        item if isinstance(item, str) else " ".join(item)
        for item in chunks
    ]
    embeddings = embedding_model.encode(
        passages,
        convert_to_numpy=True,
        normalize_embeddings=True,
        # The original passed batch_size=True, which is coerced to 1 and
        # encodes every chunk in its own forward pass.
        batch_size=batch_size,
        show_progress_bar=True,
    )
    return embeddings

# Embed every chunk, then report the matrix dimensions
# (number of chunks, embedding width).
embeddings = embed(chunks=chunked)

n_vectors = embeddings.shape[0]
print(n_vectors)
print(embeddings.shape[1])


In [None]:
#Build Vector Index
import faiss

def create_vector_index(embeddings):
    """Build an exact inner-product FAISS index over ``embeddings``.

    Args:
        embeddings: 2-D array-like with one vector per row. With
            unit-normalised rows the inner product equals cosine similarity.

    Returns:
        A ``faiss.IndexFlatIP`` containing every row of ``embeddings``, in
        input order (row ``i`` gets id ``i``).
    """
    # FAISS only accepts C-contiguous float32 matrices; coerce here so that
    # float64 or sliced inputs do not fail inside the native add() call.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    dim = vectors.shape[1]
    vector_index = faiss.IndexFlatIP(dim)
    vector_index.add(vectors)
    return vector_index

# Index all chunk embeddings; row i of the index corresponds to chunked[i].
vector_index = create_vector_index(embeddings=embeddings)

#print(vector_index.ntotal)


def query_vector_index(vector_index, query, k, chunks):
    """Return the chunks whose embeddings are most similar to ``query``.

    Args:
        vector_index: FAISS index whose row ids line up with ``chunks``.
        query: Query text, either a string or an already-tokenised sequence.
        k: Maximum number of chunks to return.
        chunks: The chunk list the index was built from.

    Returns:
        Up to ``k`` chunks, best match first. Fewer are returned when the
        index holds fewer than ``k`` vectors.
    """
    # embed() joins each item with spaces, so a raw string must be tokenised
    # first; otherwise it is joined character by character ("T h e ...").
    query_tokens = query.split() if isinstance(query, str) else list(query)
    query_embedding = embed([query_tokens])

    # Never request more neighbours than the index holds: FAISS pads the
    # result with -1, which would silently select chunks[-1].
    k = min(k, vector_index.ntotal)
    if k <= 0:
        return []

    _, indices = vector_index.search(query_embedding, k)
    return [chunks[i] for i in indices[0] if i >= 0]


#query = "The quick brown fox jumps over the lazy dog."
#retrieved_chunks = query_vector_index(vector_index, query, 5, chunked)
#for chunk in retrieved_chunks:
#    print(chunk)


In [None]:
#Construct the prompt that is sent to the LLM to answer the question

def construct_prompt(query, retrieved_chunks):
    """Assemble the LLM prompt from the question and the retrieved context.

    The prompt is an instruction line ending with ``query``, followed by each
    chunk (its tokens joined with single spaces) on a line of its own.
    """
    header = "Use the following chunks to answer the question: " + query + "\n"
    context_lines = [" ".join(tokens) + "\n" for tokens in retrieved_chunks]
    return header + "".join(context_lines)

# End-to-end retrieval demo: fetch the 5 nearest chunks for a sample query
# and render the prompt that will be sent to the LLM.
query = "The quick brown fox jumps over the lazy dog."
retrieved_chunks = query_vector_index(vector_index, query, k=5, chunks=chunked)
prompt = construct_prompt(query=query, retrieved_chunks=retrieved_chunks)

print(prompt)


In [None]:
#Load the LLM

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import torch
import accelerate


# Load the instruction-tuned LLM, wrap it in a text-generation pipeline and
# answer the RAG prompt built in the previous cell.
print("Finished Importing")

# 4-bit quantisation settings.
# NOTE(review): this config is only referenced by the disabled load inside
# the string literal below; the active load does not use it.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                            # or load_in_8bit=True
    bnb_4bit_compute_dtype=torch.float16,         # float16 compute
    bnb_4bit_quant_type="nf4",                    # or "fp4"
    bnb_4bit_use_double_quant=True                # enable double quantization
)


#model_name = "distilgpt2"
model_name = "tiiuae/falcon-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The bare string below is a disabled alternative (quantised load). It is
# evaluated as an expression and discarded; nothing in it runs.
"""
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config, #Load quantized model due to hardward constraints
    device_map="auto",
    low_cpu_mem_usage=True 
)
"""
# NOTE(review): `model` was bound to the SentenceTransformer in an earlier
# cell; this statement rebinds the name to the causal LM.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,     # fp16 weights
    device_map="auto",             # automatic device placement (presumably MPS on this machine - confirm)
    low_cpu_mem_usage=True,
    offload_folder="hf_offload",   # folder used when weights are offloaded to disk
)


print("Finished Loading Model")

llm = pipeline('text-generation', model=model, tokenizer=tokenizer)

print("Finished setting up pipeline ... now querying")

# return_full_text=False: only the continuation is returned, not the prompt.
response = llm(prompt, max_new_tokens=256, return_full_text=False)

print("Recieved Response: \n")

print(response[0]['generated_text'])

In [None]:
# Smoke test of the pipeline with a trivial prompt.
# NOTE(review): unlike the max_new_tokens used in the other cells, max_length
# presumably caps prompt + generated tokens together - confirm against the
# transformers generation docs.
response = llm("say hi", max_length=1000)
print(response[0]['generated_text'])

In [None]:
from huggingface_hub import snapshot_download
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import torch
import accelerate

model_name = "tiiuae/falcon-7b-instruct"

# Materialise the repository snapshot under ./hf_cache, then load the model
# and tokenizer from that local directory instead of by hub id.
local_dir = snapshot_download(
    repo_id=model_name,
    local_files_only=False,
    cache_dir="hf_cache",
)

# Same load options as before, plus trust_remote_code so that custom
# modelling code shipped in the repository may be used.
model = AutoModelForCausalLM.from_pretrained(
    local_dir,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    offload_folder="hf_offload",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(local_dir, trust_remote_code=True)

# Rebuild the generation pipeline around the freshly loaded objects.
llm = pipeline('text-generation', tokenizer=tokenizer, model=model)

In [None]:
# Tokenize the RAG prompt once, show its token ids, and generate directly
# with the model (bypassing the pipeline wrapper).
# The inputs are moved to the model's device so generate() does not receive
# tensors on a different device than the weights.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
tokens = inputs.input_ids  # previously `tokens = print(...)`, which stored None
print(tokens)
response = model.generate(
    **inputs,
    max_new_tokens=256
)

In [None]:
# Re-query the existing `llm` pipeline with a trivial prompt.
# NOTE(review): nothing is loaded or constructed in this cell; the two
# progress messages below are leftovers from the original load cell.
print("Finished Loading Model")

#llm = pipeline('text-generation', model=model, tokenizer=tokenizer)

print("Finished setting up pipeline ... now querying")

#response = llm(prompt, max_new_tokens=256, return_full_text=False)
# return_full_text=False: only the continuation is returned, not the prompt.
response = llm("hi", max_new_tokens=256, return_full_text=False)

print("Recieved Response: \n")

print(response[0]['generated_text'])

In [None]:
# Show the RAG prompt again for reference.
print(prompt)