In [1]:
import numpy as np
import pandas as pd
import nltk

In [2]:
#Downloading the Guteberg corpus for use in the initial, barebones pipeline
#Only 18 stories
nltk.download('gutenberg')
from nltk.corpus import gutenberg

files = gutenberg.fileids()
texts = [gutenberg.raw(fileid) for fileid in files]


[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/shanejayasundera/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [3]:
print(len(files))
print(len(texts[0]))

print(texts[0][:100])

18
887071
[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a


In [4]:
#Chunking the texts, initially we will do from scratch

#1. First tokenize
def tokenize(text):
    return text.split()

tokenized = []
for text in texts:
    tokenized += tokenize(text)

#print(len(tokenized))
#print(len(tokenized[0]))
#print(tokenized[0][:10])


#2. Now we chunk the tokens
def chunk(tokens, size, overlap): #Overlap to deal with boundary problem
    chunks = []
    for i in range(0, len(tokens)-size, overlap):
        if i+size > len(tokens):
            chunks.append(tokens[i:])
        else:
            chunks.append(tokens[i:i+size])
    
    return chunks

chunked = chunk(tokenized, 300, 50)

print(len(chunked))
print(len(chunked[0]))

42699
300


In [5]:
#Create embeddings from the chunks
from sentence_transformers import SentenceTransformer
import tqdm

model = SentenceTransformer('all-MiniLM-L6-v2')

def embed(chunks):
    embeddings = model.encode(chunks, convert_to_numpy=True, normalize_embeddings=True)
    return embeddings

embeddings = embed(chunked)


print(len(embeddings))
print(len(embeddings[0]))


  from .autonotebook import tqdm as notebook_tqdm
  return forward_call(*args, **kwargs)


42699
384


In [6]:
#Build Vector Index
import faiss

def create_vector_index(embeddings):
    D = embeddings.shape[1]
    vector_index = faiss.IndexFlatIP(D)
    vector_index.add(embeddings)
    return vector_index

vector_index = create_vector_index(embeddings)

#print(vector_index.ntotal)


def query_vector_index(vector_index, query, k, chunks):
    query_embedding = embed([query])
    distances, indices = vector_index.search(query_embedding, k)
    retrieved_chunks = [chunks[i] for i in indices[0]]
    return retrieved_chunks


#query = "The quick brown fox jumps over the lazy dog."
#retrieved_chunks = query_vector_index(vector_index, query, 5, chunked)
#for chunk in retrieved_chunks:
#    print(chunk)


In [7]:
#Construct the prompt that is sent to the LLM to answer the question

def construct_prompt(query, retrieved_chunks):
    prompt = "Use the following chunks to answer the question: " + query + "\n"
    for i in range(len(retrieved_chunks)):
        chunk = " ".join(retrieved_chunks[i])
        prompt += chunk + "\n"
    return prompt

query = "The quick brown fox jumps over the lazy dog."
retrieved_chunks = query_vector_index(vector_index, query, 5, chunked)
prompt = construct_prompt(query, retrieved_chunks)

print(prompt)


Use the following chunks to answer the question: The quick brown fox jumps over the lazy dog.
the fox was, and the fox greeted them, and said, "The tailor and his wife were here this morning, and if you will loose me, I am swifter than you, and I will follow them, and overtake them." They therefore set the fox free. The lion and the fox and the army of Dublin went on then, trying to catch the tailor, and they kept going until they came to the place where the old white garraun was, and the old white garraun told them that the tailor and his wife were there in the morning, and "Loose me," said he; "I am swifter than you, and I'll overtake them." They released the old white garraun then, and the old white garraun, the fox, the lion, and the army of Dublin pursued the tailor and his wife, and it was not long before they came up with them. When the tailor saw them coming, he got out of the coach with his wife, and he sat down on the ground. When the old white garraun saw the tailor sitting 

In [None]:
#Load the LLM

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import torch
import accelerate


print("Finished Importing")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                            # or load_in_8bit=True
    bnb_4bit_compute_dtype=torch.float16,         # float16 compute
    bnb_4bit_quant_type="nf4",                    # or "fp4", "fp4-dq"
    bnb_4bit_use_double_quant=True                # enable double quantization
)


#model_name = "distilgpt2"
model_name = "tiiuae/falcon-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
"""
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config, #Load quantized model due to hardward constraints
    device_map="auto",
    low_cpu_mem_usage=True
)
"""
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,     # fp16 weights
    device_map="auto",             # places layers on MPS automatically
    low_cpu_mem_usage=True,
    offload_folder="hf_offload",   # offload large layers to disk
)


print("Finished Loading Model")

llm = pipeline('text-generation', model=model, tokenizer=tokenizer)

print("Finished setting up pipeline ... now querying")

response = llm(prompt, max_new_tokens=256, return_full_text=False)

print("Recieved Response: \n")

print(response[0]['generated_text'])

Finished Importing


OSError: tiiuae/falcon-7b-instruct-optimized is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`

In [None]:
response = llm("say hi", max_length=1000)
print(response[0]['generated_text'])

In [None]:
from huggingface_hub import snapshot_download
from tqdm.notebook import tqdm

local_dir = snapshot_download(
    repo_id="tiiuae/falcon-7b-instruct",
    cache_dir="hf_cache",
    local_files_only=False,
)
model = AutoModelForCausalLM.from_pretrained(local_dir, torch_dtype=torch.float16,
                                             device_map="auto", low_cpu_mem_usage=True,
                                             offload_folder="hf_offload")
tokenizer = AutoTokenizer.from_pretrained(local_dir)

Fetching 19 files:   0%|          | 0/19 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:  22%|##1       | 2.78G/12.7G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:  37%|###6      | 2.59G/7.07G [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:  20%|##        | 2.50G/12.4G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:  41%|####1     | 3.12G/7.61G [00:00<?, ?B/s]

weight.bin:   9%|9         | 2.90G/30.6G [00:00<?, ?B/s]