In [1]:
import torch

In [2]:
import langchain

In [3]:
from langchain.embeddings import HuggingFaceEmbeddings



In [4]:
# Model identifiers of the embedding models this notebook knows about.
EMB_SBERT_MPNET_BASE = "sentence-transformers/all-mpnet-base-v2"
EMB_INSTRUCTOR_XL = "hkunlp/instructor-xl"



In [5]:
# Model identifiers of the language models this notebook knows about,
# grouped by model family.

# FLAN-T5 (smallest to largest)
LLM_FLAN_T5_SMALL = "google/flan-t5-small"
LLM_FLAN_T5_BASE = "google/flan-t5-base"
LLM_FLAN_T5_LARGE = "google/flan-t5-large"
LLM_FLAN_T5_XL = "google/flan-t5-xl"
LLM_FLAN_T5_XXL = "google/flan-t5-xxl"

# FastChat-T5
LLM_FASTCHAT_T5_XL = "lmsys/fastchat-t5-3b-v1.0"

# Falcon (base and instruct variants)
LLM_FALCON_7B = "tiiuae/falcon-7b"
LLM_FALCON_7B_INSTRUCT = "tiiuae/falcon-7b-instruct"
LLM_FALCON_40B = "tiiuae/falcon-40b"
LLM_FALCON_40B_INSTRUCT = "tiiuae/falcon-40b-instruct"

# Llama 2
LLM_LLAMA2_70B_INSTRUCT = "upstage/Llama-2-70b-instruct"

In [6]:
# Directory handed to `from_pretrained(..., cache_dir=...)` when loading tokenizers.
cache_dir = '/work/rc/projects/chatbot/models'



In [7]:
# Notebook-wide settings: the embedding model, the LLM to load, and whether
# the LLM is loaded with `load_in_8bit`.
config = dict(
    persist_directory=None,
    load_in_8bit=False,
    embedding=EMB_SBERT_MPNET_BASE,
    llm=LLM_FALCON_40B_INSTRUCT,
)

In [8]:
    
import os
# Point the transformers and sentence-transformers caches at the shared
# model directory by setting both environment variables in one step.
os.environ.update({
    'TRANSFORMERS_CACHE': '/work/rc/projects/chatbot/models',
    'SENTENCE_TRANSFORMERS_HOME': '/work/rc/projects/chatbot/models',
})

In [9]:
from transformers import AutoTokenizer
from transformers import pipeline

2023-10-14 03:04:40.671121: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
def create_llama2_70b_instruct(load_in_8bit=True, max_new_tokens=500):
    """Build a text-generation pipeline for the model named by LLM_LLAMA2_70B_INSTRUCT.

    Args:
        load_in_8bit: Value stored under the ``load_in_8bit`` key of the
            ``model_kwargs`` handed to ``pipeline``.
        max_new_tokens: Maximum number of tokens to generate, ignoring the
            number of tokens in the prompt. Defaults to 500, the value that
            used to be hard-coded.

    Returns:
        Whatever ``pipeline(task="text-generation", ...)`` returns.
    """
    model = LLM_LLAMA2_70B_INSTRUCT

    tokenizer = AutoTokenizer.from_pretrained(model, cache_dir=cache_dir)
    return pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        do_sample=True,  # use sampling; greedy decoding otherwise
        max_new_tokens=max_new_tokens,
        model_kwargs={
            # Keep the weights in the same cache directory as the tokenizer
            # instead of relying on an environment variable having been set
            # before transformers was imported.
            "cache_dir": cache_dir,
            "device_map": "auto",
            "load_in_8bit": load_in_8bit,
            # NOTE(review): max_length and temperature are generation settings
            # passed at model-load time; confirm they take effect with the
            # installed transformers version.
            "max_length": 512,
            "temperature": 0.01,
            "torch_dtype": torch.bfloat16,
        },
    )





In [11]:
def create_falcon_40b(load_in_8bit=True, max_new_tokens=500):
    """Build a text-generation pipeline for the model named by LLM_FALCON_40B.

    Args:
        load_in_8bit: Value stored under the ``load_in_8bit`` key of the
            ``model_kwargs`` handed to ``pipeline``.
        max_new_tokens: Maximum number of tokens to generate, ignoring the
            number of tokens in the prompt. Defaults to 500, the value that
            used to be hard-coded.

    Returns:
        Whatever ``pipeline(task="text-generation", ...)`` returns.
    """
    model = LLM_FALCON_40B

    tokenizer = AutoTokenizer.from_pretrained(model, cache_dir=cache_dir)
    return pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        do_sample=True,  # use sampling; greedy decoding otherwise
        max_new_tokens=max_new_tokens,
        model_kwargs={
            # Keep the weights in the same cache directory as the tokenizer
            # instead of relying on an environment variable having been set
            # before transformers was imported.
            "cache_dir": cache_dir,
            "device_map": "auto",
            "load_in_8bit": load_in_8bit,
            # NOTE(review): max_length and temperature are generation settings
            # passed at model-load time; confirm they take effect with the
            # installed transformers version.
            "max_length": 512,
            "temperature": 0.01,
            "torch_dtype": torch.bfloat16,
        },
    )




In [12]:
def create_falcon_40b_instruct(load_in_8bit=True, max_new_tokens=100):
    """Build a text-generation pipeline for the model named by LLM_FALCON_40B_INSTRUCT.

    Args:
        load_in_8bit: Value stored under the ``load_in_8bit`` key of the
            ``model_kwargs`` handed to ``pipeline``.
        max_new_tokens: Maximum number of tokens to generate, ignoring the
            number of tokens in the prompt. Defaults to 100, the value that
            used to be hard-coded.

    Returns:
        Whatever ``pipeline(task="text-generation", ...)`` returns.
    """
    model = LLM_FALCON_40B_INSTRUCT

    tokenizer = AutoTokenizer.from_pretrained(model, cache_dir=cache_dir)
    return pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        do_sample=True,  # use sampling; greedy decoding otherwise
        max_new_tokens=max_new_tokens,
        model_kwargs={
            # Keep the weights in the same cache directory as the tokenizer
            # instead of relying on an environment variable having been set
            # before transformers was imported.
            "cache_dir": cache_dir,
            "device_map": "auto",
            "load_in_8bit": load_in_8bit,
            # NOTE(review): max_length and temperature are generation settings
            # passed at model-load time; confirm they take effect with the
            # installed transformers version.
            "max_length": 512,
            "temperature": 0.01,
            "torch_dtype": torch.bfloat16,
        },
    )



In [13]:
def create_falcon_7b(load_in_8bit=True, max_new_tokens=100):
    """Build a text-generation pipeline for the model named by LLM_FALCON_7B.

    Args:
        load_in_8bit: Value stored under the ``load_in_8bit`` key of the
            ``model_kwargs`` handed to ``pipeline``.
        max_new_tokens: Maximum number of tokens to generate, ignoring the
            number of tokens in the prompt. Defaults to 100, the value that
            used to be hard-coded.

    Returns:
        Whatever ``pipeline(task="text-generation", ...)`` returns.
    """
    model = LLM_FALCON_7B

    tokenizer = AutoTokenizer.from_pretrained(model, cache_dir=cache_dir)
    return pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        do_sample=True,  # use sampling; greedy decoding otherwise
        max_new_tokens=max_new_tokens,
        model_kwargs={
            # Keep the weights in the same cache directory as the tokenizer
            # instead of relying on an environment variable having been set
            # before transformers was imported.
            "cache_dir": cache_dir,
            "device_map": "auto",
            "load_in_8bit": load_in_8bit,
            # NOTE(review): max_length and temperature are generation settings
            # passed at model-load time; confirm they take effect with the
            # installed transformers version.
            "max_length": 512,
            "temperature": 0.01,
            "torch_dtype": torch.bfloat16,
        },
    )





In [14]:
def create_falcon_7b_instruct(load_in_8bit=True, max_new_tokens=500):
    """Build a text-generation pipeline for the model named by LLM_FALCON_7B_INSTRUCT.

    Args:
        load_in_8bit: Value stored under the ``load_in_8bit`` key of the
            ``model_kwargs`` handed to ``pipeline``.
        max_new_tokens: Maximum number of tokens to generate, ignoring the
            number of tokens in the prompt. Defaults to 500, the value that
            used to be hard-coded.

    Returns:
        Whatever ``pipeline(task="text-generation", ...)`` returns.
    """
    model = LLM_FALCON_7B_INSTRUCT

    tokenizer = AutoTokenizer.from_pretrained(model, cache_dir=cache_dir)
    return pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        do_sample=True,  # use sampling; greedy decoding otherwise
        max_new_tokens=max_new_tokens,
        model_kwargs={
            # Keep the weights in the same cache directory as the tokenizer
            # instead of relying on an environment variable having been set
            # before transformers was imported.
            "cache_dir": cache_dir,
            "device_map": "auto",
            "load_in_8bit": load_in_8bit,
            # NOTE(review): max_length and temperature are generation settings
            # passed at model-load time; confirm they take effect with the
            # installed transformers version.
            "max_length": 512,
            "temperature": 0.01,
            "torch_dtype": torch.bfloat16,
        },
    )




In [15]:

def create_flan_t5_base(load_in_8bit=True, max_new_tokens=100):
    """Build a text2text-generation pipeline for "google/flan-t5-base".

    The returned Hugging Face pipeline is meant to be wrapped for use with
    LangChain.

    Args:
        load_in_8bit: Value stored under the ``load_in_8bit`` key of the
            ``model_kwargs`` handed to ``pipeline``.
        max_new_tokens: Maximum number of tokens to generate. Defaults to
            100, the value that used to be hard-coded.

    Returns:
        Whatever ``pipeline(task="text2text-generation", ...)`` returns.
    """
    model = "google/flan-t5-base"
    tokenizer = AutoTokenizer.from_pretrained(model, cache_dir=cache_dir)
    return pipeline(
        task="text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        model_kwargs={
            # Keep the weights in the same cache directory as the tokenizer
            # instead of relying on an environment variable having been set
            # before transformers was imported.
            "cache_dir": cache_dir,
            "device_map": "auto",
            "load_in_8bit": load_in_8bit,
            # NOTE(review): max_length and temperature are generation settings
            # passed at model-load time; confirm they take effect with the
            # installed transformers version.
            "max_length": 512,
            "temperature": 0.,
        },
    )
        

In [16]:
# Build the FLAN-T5 base pipeline only when the config selects that model;
# otherwise `llm` is left untouched by this cell.
load_in_8bit = config["load_in_8bit"]
if LLM_FLAN_T5_BASE == config["llm"]:
    llm = create_flan_t5_base(load_in_8bit)

In [17]:
# Build the Falcon-40B pipeline only when the config selects that model;
# otherwise `llm` is left untouched by this cell.
load_in_8bit = config["load_in_8bit"]
if LLM_FALCON_40B == config["llm"]:
    llm = create_falcon_40b(load_in_8bit)
    

In [18]:
# Build the Falcon-40B-instruct pipeline only when the config selects that
# model; otherwise `llm` is left untouched by this cell.
load_in_8bit = config["load_in_8bit"]
if LLM_FALCON_40B_INSTRUCT == config["llm"]:
    llm = create_falcon_40b_instruct(load_in_8bit)
    



A Jupyter Widget

In [19]:
# Build the Falcon-7B-instruct pipeline only when the config selects that
# model; otherwise `llm` is left untouched by this cell.
load_in_8bit = config["load_in_8bit"]
if LLM_FALCON_7B_INSTRUCT == config["llm"]:
    llm = create_falcon_7b_instruct(load_in_8bit)
    

In [20]:
# Build the Falcon-7B pipeline only when the config selects that model;
# otherwise `llm` is left untouched by this cell.
load_in_8bit = config["load_in_8bit"]
if LLM_FALCON_7B == config["llm"]:
    llm = create_falcon_7b(load_in_8bit)
    





In [21]:
# Build the Llama-2-70B-instruct pipeline only when the config selects that
# model; otherwise `llm` is left untouched by this cell.
load_in_8bit = config["load_in_8bit"]
if LLM_LLAMA2_70B_INSTRUCT == config["llm"]:
    llm = create_llama2_70b_instruct(load_in_8bit)
    




In [22]:
from langchain import HuggingFacePipeline



# Wrap the transformers pipeline held in `llm` in LangChain's
# HuggingFacePipeline so it can be used as the LLM of a chain.
hf_llm = HuggingFacePipeline(
    pipeline=llm,
    model_kwargs={'temperature': 0},
)



In [23]:
from langchain import PromptTemplate,  LLMChain

# Prompt text for the chain. `{question}` is the only placeholder; the
# "Content" section below is hard-coded reference text that the instructions
# tell the model to answer from, ending with the URL it was taken from.
template = """
<Instructions>
Important:
You are an intelligent chatbot. Answer the question with the facts listed in Content below. If there isn't enough information below, say you don't know.
ALWAYS return a "SOURCES" part in your answer, except for small-talk conversations.

Question: {question}
Content:   

Job Scheduling Policies and Priorities#
In an HPC environment, efficient job scheduling is crucial for allocating computing resources and ensuring optimal cluster utilization. Job scheduling policies and priorities determine the order in which jobs are executed and the resources they receive. Understanding these policies is essential for maximizing job efficiency and minimizing wait times.

Scheduling Policies#

FIFO (First-In-First-Out)#
Jobs are executed in the order they are submitted. Although simple, this policy may lead to long wait times for large, resource-intensive jobs if smaller jobs are constantly being submitted.


Fair Share#
This policy ensures that all users receive a fair share of cluster resources over time. Users with high resource usage may experience reduced priority, allowing others to access resources more regularly.


Priority-Based#
Jobs are assigned priorities based on user-defined criteria or system-wide rules. Higher-priority jobs are executed before lower-priority ones, allowing for resource allocation based on user requirements.



Job Priorities#

User Priority#
Users can assign priority values to their jobs. Higher values result in increased job priority and faster access to resources.


Resource Requirements#
Jobs with larger resource requirements may be assigned higher priority, as they require more significant resources to execute efficiently.


Walltime Limit#
Source: https://rc-docs.northeastern.edu/en/latest/runningjobs/jobscheduling.html

"""
# Sample question used to exercise the chain.
question = "What is the Scheduling Policies for HPC cluster?"

# The template exposes a single placeholder, "question".
prompt = PromptTemplate(input_variables=["question"], template=template)

# Bind the prompt to the wrapped Hugging Face model.
llm_chain = LLMChain(llm=hf_llm, prompt=prompt)

# The chain is intentionally not run in this cell; the message only confirms
# that the setup above completed.
print("done")

done


In [24]:
import time
# Time one end-to-end chain invocation. perf_counter() is a monotonic,
# high-resolution clock meant for measuring intervals, so the result cannot
# be skewed by system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()

print(llm_chain.run(question))

end_time = time.perf_counter()
elapsed_time = end_time - start_time  # seconds
print("Time take : " , elapsed_time)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
The current implementation of Falcon calls `torch.scaled_dot_product_attention` directly, this will be deprecated in the future in favor of the `BetterTransformer` API. Please install the latest optimum library with `pip install -U optimum` and call `model.to_bettertransformer()` to benefit from `torch.scaled_dot_product_attention` and future performance optimizations.

KeyboardInterrupt

