<a href="https://colab.research.google.com/github/scottmishra/FalconLLM_Training/blob/main/FalconLLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers einops accelerate langchain bitsandbytes

In [None]:
# Print the NVIDIA GPU status for this runtime, as a quick check that a GPU
# is attached before the model is loaded in the next cell.
!nvidia-smi

In [None]:
from langchain import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline

import torch

## Hugging Face Hub id of the model to load (see its model card on the Hub).
model = "tiiuae/falcon-7b-instruct"

## Tokenizer matching the model above.
tokenizer = AutoTokenizer.from_pretrained(model)

## Build the text-generation pipeline.
## NOTE: this assignment rebinds the name `pipeline` from the transformers
## factory function to the pipeline *instance*. The import at the top of this
## cell restores the factory on re-run, and later cells read `pipeline`, so the
## name is kept as-is.
pipeline = pipeline(
    "text-generation",  ## task of the pipeline
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    ## Executes the custom modelling code shipped in the model repository;
    ## only enable this for repositories you trust.
    trust_remote_code=True,
    device_map="auto",  ## manages cpu and gpu memory, comes from accelerate
    ## Limit on *generated* tokens only. The previous `max_length=200` also
    ## counted the prompt tokens, so long prompts ate into the output budget.
    max_new_tokens=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

## Running this cell downloads the model into the Colab runtime; the 7B model
## plus its checkpoints take about 15 GB.
## Colab's download speed keeps this to about 2 minutes, but it can take
## longer if you run it on your own machine.

In [None]:
## Wrap the transformers pipeline built in the previous cell so that LangChain
## can drive it as an LLM. model_kwargs is meant to let us adjust the setup of
## the model a bit.
## At this point the LLM is already loaded (the previous cell did that).
## NOTE(review): when a ready-made pipeline is passed in, `model_kwargs` looks
## like it is only stored on the wrapper, so temperature=0 may have no effect
## on generation -- confirm against the installed langchain version.
llm = HuggingFacePipeline(
    pipeline=pipeline,
    model_kwargs={"temperature": 0},
)

In [None]:
## Simplest way to query the model: call the LangChain wrapper directly with a
## prompt string. Generation takes just a few seconds.
capital_query = "What is capital of India?"
llm(capital_query)

In [None]:
from langchain import PromptTemplate, LLMChain

## Prompt with a single {question} placeholder; the text is sent to the model
## verbatim, including the leading newline.
## NOTE(review): "Questions:" and "Example why" look like typos for
## "Question:" / "Explain why" -- left untouched because they are part of the
## prompt the model sees; confirm intent before changing.
template = (
    "\n"
    "You are an intelligent chatbot. Provide a truthful answer to the following question.\n"
    "Questions: {question}\n"
    "Answer:"
)
question = "Example why the Chicago Cubs are so beloved in a nursery rhyme"

## Bind the template to the LLM, then run the chain on the question.
prompt = PromptTemplate(input_variables=["question"], template=template)
llm_chain = LLMChain(llm=llm, prompt=prompt)

print(llm_chain.run(question))

# Starting a QLoRA fine-tuning approach