In [1]:
!nvidia-smi

Wed Jan 17 04:50:27 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro RTX 5000     Off  | 00000000:00:05.0 Off |                  Off |
| 33%   27C    P0    41W / 230W |      1MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
import torch

from transformers import BitsAndBytesConfig
from torch import bfloat16

# Our 4-bit configuration to load the LLM with less GPU memory
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization
    bnb_4bit_quant_type='nf4',  # Normalized float 4
    bnb_4bit_use_double_quant=True,  # Second quantization after the first
    bnb_4bit_compute_dtype=bfloat16  # Computation type
)

model_id = "Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_id, revision='main', use_fast=True, trust_remote_code=True)
model_config = AutoConfig.from_pretrained(
    model_id
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto'
)
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )


In [3]:
stop_list = ['\nHuman:', '\n```\n']

stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]

stop_token_ids = [torch.LongTensor(x).to('cuda') for x in stop_token_ids]
stop_token_ids

[tensor([    1, 29871,    13, 29950,  7889, 29901], device='cuda:0'),
 tensor([    1, 29871,    13, 28956,    13], device='cuda:0')]

In [4]:
from transformers import StoppingCriteria, StoppingCriteriaList

class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [5]:
from transformers import pipeline

generate_text = pipeline(
    model=model, 
    tokenizer=tokenizer,
    return_full_text=True,
    task='text-generation',
    stopping_criteria=stopping_criteria,
    temperature=0.1,
    max_new_tokens=512,
    repetition_penalty=1.1
)

In [11]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

In [6]:
from langchain.document_loaders import WebBaseLoader

web_links = [
    "https://cloud.google.com/architecture/big-data-analytics/analytics-lakehouse",
    "https://cloud.google.com/architecture/data-mesh",
    "https://cloud.google.com/architecture/design-self-service-data-platform-data-mesh",
    "https://cloud.google.com/architecture/describe-organize-data-products-resources-data-mesh",
    "https://cloud.google.com/architecture/build-data-products-data-mesh",
    "https://cloud.google.com/architecture/discover-consume-data-products-data-mesh",
    "https://cloud.google.com/architecture/big-data-analytics/data-warehouse",
    "https://cloud.google.com/architecture/marketing-data-warehouse-on-gcp",
    "https://cloud.google.com/architecture/cicd-pipeline-for-data-processing",
    "https://cloud.google.com/architecture/cicd-pipeline-for-data-processing/deployment",
    "https://cloud.google.com/architecture/automatically-apply-sensitivity-tags-in-data-catalog",
    "https://cloud.google.com/architecture/partners/building-custom-data-integrations-using-fivetran-and-cloud-functions",
    "https://cloud.google.com/architecture/partners/continuous-data-replication-bigquery-striim",
    "https://cloud.google.com/architecture/ingesting-clinical-and-operational-data-with-cloud-data-fusion",
    "https://cloud.google.com/architecture/performing-etl-from-relational-database-into-bigquery",
    "https://cloud.google.com/architecture/tracking-provenance-and-lineage-metadata-for-healthcare-data",
    "https://cloud.google.com/architecture/using-apache-hive-on-cloud-dataproc",
    "https://cloud.google.com/architecture/using-apache-hive-on-cloud-dataproc/deployment",
    "https://cloud.google.com/architecture/partners/using-fivetran-and-elt-with-bigquery",
    "https://cloud.google.com/architecture/data-pipeline-mongodb-gcp",
    "https://cloud.google.com/architecture/data-pipeline-mongodb-gcp/deployment",
    "https://cloud.google.com/architecture/analyzing-fhir-data-in-bigquery",
    "https://cloud.google.com/architecture/build-visualize-demand-forecast-prediction-datastream-dataflow-bigqueryml-looker",
    "https://cloud.google.com/architecture/reference-patterns/overview",
    "https://cloud.google.com/architecture/data-science-with-r-on-gcp-eda",
    "https://cloud.google.com/architecture/genomic-data-processing-reference-architecture",
    "https://cloud.google.com/architecture/geospatial-analytics-architecture",
    "https://cloud.google.com/architecture/propensity-modeling-gaming",
    "https://cloud.google.com/architecture/set-up-regulatory-reporting-architecture-bigquery",
    "https://cloud.google.com/architecture/build-smart-api-predict-customer-purchase-apigee-bigquery-ml-cloud-spanner",
    "https://cloud.google.com/architecture/build-smart-api-predict-customer-purchase-apigee-bigquery-ml-cloud-spanner/deployment",
] 

loader = WebBaseLoader(web_links)
documents = loader.load()

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(documents)

In [8]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# storing embeddings in the vector store
vectorstore = FAISS.from_documents(all_splits, embeddings)

In [13]:
from langchain.chains import ConversationalRetrievalChain

chain = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), return_source_documents=True)

In [26]:
chat_history = []

query = "What I need to implement a data mesh architecture?"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])



 You need to be familiar with the concepts described in the series, including architecture and functions in a data mesh, designing a self-service data platform, describing and organizing data products and resources, building data products, and discovering and consuming data products. Additionally, you should understand the key terms used in the architecture, such as data products, data governance standards, and distributed teams.


In [29]:
chat_history = [(query, result["answer"])]

query = "What GCP products we need?"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])



  Of course! Google Cloud Dataflow is a fully managed service that makes it easy to process and analyze large amounts of data using Apache Beam. With Dataflow, you can create powerful data processing pipelines that can handle complex tasks like data transformation, filtering, and aggregation. Additionally, Dataflow provides built-in support for data validation, monitoring, and security, making it a great choice for organizations looking to streamline their data processing workflows. Would you like to learn more about how Dataflow can help with your organization's data processing needs?


In [30]:
import pprint
pprint.pprint(result['source_documents'])

[Document(page_content="What's next\n\nFind Google Dataflow Templates on\nGitHub \nif you want to customize the templates.\nLearn more about MongoDB Atlas and Google Cloud solutions on\nCloud skill boost.\nLearn more about the Google Cloud products used in this reference\narchitecture:\n\nBigQuery \nPub/Sub \nDataflow \nCompute Engine \n\n\n\nFor more reference architectures, diagrams, and best practices, explore the\nCloud Architecture Center.\n\n\n\nContributors\nAuthors:\n\nSaurabh Kumar | ISV\nPartner Engineer\nVenkatesh Shanbhag | Senior\nSolutions Architect (MongoDB)\n\nOther contributors:\n\nJun Liu | Supportability Tech\nLead\nMaridu Raju Makaraju | Supportability\nTech Lead\nSergei Lilichenko | Solutions\nArchitect\nShan Kulandaivel | Group Product Manager\n\nTo see nonpublic LinkedIn profiles, sign in to LinkedIn.\n\n\n\n\n\n\n  \n    \n    Send feedback", metadata={'source': 'https://cloud.google.com/architecture/data-pipeline-mongodb-gcp/deployment', 'title': 'Deploy a data