In [1]:
# Sanity-check the runtime's GPU before loading a multi-gigabyte model:
# nvidia-smi reports the driver/CUDA versions and each GPU's memory and utilisation.
# (The leading "!" runs the rest of the line as a shell command.)
! nvidia-smi

Sat Jun 15 16:02:06 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
# Install the libraries this notebook depends on:
#   langchain-community - LangChain integrations (provides HuggingFacePipeline)
#   transformers        - model, tokenizer and pipeline APIs
#   accelerate          - needed for device_map="auto" model placement
#   bitsandbytes, einops - optional helpers (quantisation / tensor ops)
# %pip (rather than "! pip") installs into the environment of the running kernel,
# so the packages are importable from the cells below.
# NOTE(review): versions are unpinned; pin them once a known-good set is identified.
%pip install -q langchain-community transformers accelerate bitsandbytes einops

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.6/974.6 kB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.6/315.6 kB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.2/125.2 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━

In [None]:
# Authenticate with the Hugging Face Hub so that later downloads can use your account.
# NOTE(review): presumably required because the Llama 2 checkpoint is access-restricted - confirm.
# The command asks for an access token; enter it at the prompt and never paste it into a cell.
! huggingface-cli login

In [4]:
# Libraries used throughout the notebook.
# HuggingFacePipeline is imported from langchain_community (the package installed
# earlier) rather than through the deprecated `langchain.llms` re-export.
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer

import torch
import transformers

In [5]:
# Hugging Face Hub ID of the checkpoint to load: Meta's Llama 2, 7B parameters,
# chat variant (per the "-chat-hf" suffix).
# Note: despite its name, `model` holds only this ID string, not loaded weights.
model = "meta-llama/Llama-2-7b-chat-hf"

In [None]:
# Load the tokenizer published alongside the checkpoint named by `model`
# (`model` is the Hub ID string, so this fetches the tokenizer files for that repo).
tokenizer = AutoTokenizer.from_pretrained(model)

In [7]:
# Build the Hugging Face text-generation pipeline that LangChain wraps in the next cell.
# Besides the task/model/tokenizer/placement arguments, the remaining keyword
# arguments are generation settings that apply to every call of the pipeline.
#
# Changes from the earlier version of this cell:
#   * trust_remote_code=True was dropped: it permits code shipped in the Hub repo to
#     run locally, and Llama 2 is implemented inside transformers, so it is not needed.
#   * temperature is set here so that the sampling temperature the notebook asks for
#     is actually part of the pipeline's generation settings.
pipeline = transformers.pipeline(
    "text-generation",  # Task: text generation
    model=model,  # Hub ID of the checkpoint to load
    tokenizer=tokenizer,  # Tokenizer loaded above for the same checkpoint
    # NOTE(review): bfloat16 has no native support on pre-Ampere GPUs such as the T4
    # shown in the nvidia-smi output; consider torch.float16 there - confirm.
    torch_dtype=torch.bfloat16,
    device_map="auto",  # Let accelerate place the weights on the available device(s)
    max_length=1024,  # Upper bound on total tokens (prompt + generated), not just new text
    do_sample=True,  # Sample instead of greedy decoding, for more varied outputs
    temperature=0.9,  # Sampling temperature; higher values give more random output
    top_k=10,  # Sample only from the 10 most probable next tokens
    num_return_sequences=1,  # One completion per prompt
    eos_token_id=tokenizer.eos_token_id,  # Stop generating at the end-of-sequence token
    truncation=True,  # Truncate over-long inputs instead of failing
)

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [8]:
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline so it can be
# called through the standard LangChain LLM interface (e.g. `llm.invoke(...)`).
# NOTE(review): when a ready-made pipeline is passed like this, `model_kwargs` appears
# to be stored on the wrapper but not forwarded to generation, so this temperature may
# have no effect - confirm; it likely has to be set on the transformers pipeline itself.
llm = HuggingFacePipeline(pipeline=pipeline,model_kwargs={"temperature":0.9})

  warn_deprecated(


In [9]:
# Prompt sent to the model.
query = "tell me about large language models"

# Run the prompt through the LangChain wrapper, then print the generated text.
# print() is used (rather than a bare expression) so newlines in the text render as line breaks.
response = llm.invoke(query)
print(response)

tell me about large language models?

Large language models are artificial intelligence (AI) models that are trained on vast amounts of text data to generate language outputs that are coherent and natural-sounding. These models have become increasingly popular in recent years due to their ability to generate text that is often indistinguishable from human-written text. In this answer, we will explore how large language models work, their potential applications, and some of the challenges and limitations associated with them.

How do large language models work?

Large language models are based on a type of AI called deep learning, which involves training artificial neural networks to perform specific tasks. In the case of language models, the neural networks are trained on vast amounts of text data, such as books, articles, and websites. The networks learn to predict the next word in a sequence of text given the previous words, and this process is repeated millions of times until the ne