### Lets start learning about the Opensource models available on Hugging Face platform

In [1]:
## First login to HF
from huggingface_hub import login
from google.colab import userdata
hf_token = userdata.get('HF_TOKEN')

login(hf_token, add_to_git_credential=True)

#### Install some misisng libraries

In [2]:
!pip install -q --upgrade bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25h

#### Lets Look into some Instruct / Chat LLM Models

In [3]:
LLAMA = "meta-llama/Llama-3.2-1B-Instruct"
PHI = "microsoft/Phi-4-mini-instruct"
QWEN = "Qwen/Qwen3-4B-Instruct-2507"
DEEPSEEK = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

#### We will try to load the models in 4bit Quantized form for saving memory. Lets create quantization config for same

In [4]:
import torch
from transformers import BitsAndBytesConfig

quant_conf = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [5]:
message = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Tell a joke for a room full of Data Scientists"},
]

### LLAMA

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [15]:
# Tokenizer for Llama
llama_tokenizer = AutoTokenizer.from_pretrained(LLAMA)
llama_tokenizer.pad_token = llama_tokenizer.eos_token

# Convert the message to prompt and generate the imput for model
input = llama_tokenizer.apply_chat_template(message, return_tensors="pt").to("cuda")
input

tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   2437,   4723,    220,   2366,     20,    271,   2675,    527,
            264,  11190,  18328,     13, 128009, 128006,    882, 128007,    271,
          41551,    264,  22380,    369,    264,   3130,   2539,    315,   2956,
          57116, 128009]], device='cuda:0')

In [11]:
llama_model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map="auto", quantization_config=quant_conf)
llama_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm

In [16]:
output = llama_model.generate(input, max_new_tokens=80)
output

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   2437,   4723,    220,   2366,     20,    271,   2675,    527,
            264,  11190,  18328,     13, 128009, 128006,    882, 128007,    271,
          41551,    264,  22380,    369,    264,   3130,   2539,    315,   2956,
          57116, 128009, 128006,  78191, 128007,    271,   8586,    596,    264,
            828,  32505,  22380,    369,    264,   3130,   2539,    315,   2956,
          57116,   1473,  10445,   1550,    279,  10550,    733,    311,  15419,
           1980,  18433,    433,    574,   8430,    264,   2697,    330,  99323,
              1,    323,   1047,    264,   2763,    315,    330,  31716,      1,
            828,      0,    320,    456,    433,     30,   1093,    264,  10550,
             11,    719,   1101,    264,   1514,    389,   4339,    696,     40,
           3987,    420,  22

In [17]:
## Lest see output by decoding with tokenizer
llama_tokenizer.batch_decode(output)

['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 02 Nov 2025\n\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTell a joke for a room full of Data Scientists<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHere\'s a data-driven joke for a room full of Data Scientists:\n\nWhy did the dataset go to therapy?\n\nBecause it was feeling a little "sparse" and had a lot of "missing" data! (get it? like a dataset, but also a play on words)\n\nI hope this joke "folds" a smile on your faces!<|eot_id|>']

In [18]:
### Cleanup the variables to free memory
del llama_model
del llama_tokenizer
del input

### Lets create a function to see all the models in action


In [7]:
from transformers import TextStreamer

def gen_llm(model: str, message: list, quant: bool=True, max_new_tokens: int = 80):
  tokenizer = AutoTokenizer.from_pretrained(model)
  tokenizer.pad_token = tokenizer.eos_token

  # Text Streamer for streaming output
  streamer = TextStreamer(tokenizer)

  # Input Prompt
  input = tokenizer.apply_chat_template(message, return_tensors="pt").to("cuda")

  # Load Model
  if quant:
    model = AutoModelForCausalLM.from_pretrained(model, quantization_config=quant_conf).to("cuda")
  else:
    model = AutoModelForCausalLM.from_pretrained(model).to("cuda")

  # Generate Response
  output = model.generate(input, max_new_tokens=max_new_tokens, streamer = streamer)

In [20]:
gen_llm(PHI, message)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/15.5M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/249 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.77G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

<|system|>You are a helpful assistant.<|end|><|user|>Tell a joke for a room full of Data Scientists<|end|><|endoftext|><reponame>michaelbarron/portfolio
import { expect } from 'chai';
import { expect } from 'chai';
import { expect } from 'chai';
import { expect } from 'chai';
import { expect } from 'chai';
import { expect } from 'chai';
import { expect } from 'chai';
import { expect } from 'chai';
import { expect


In [21]:
gen_llm(QWEN, message)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Tell a joke for a room full of Data Scientists<|im_end|>
<|im_start|>Assistant 😄

Sure! Here's a *data scientist-approved* joke for a room full of stats wizards, ML engineers, and model validators:

---

**Why did the data scientist break up with the machine learning model?**  

Because it was *always* trying to predict the future…  
and never *actually* knew the past. 😂

*(Bonus points if you added:


In [9]:
gen_llm(DEEPSEEK, message, quant=False, max_new_tokens=500)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


<｜begin▁of▁sentence｜>You are a helpful assistant.<｜User｜>Tell a joke for a room full of Data Scientists. They have a lot of data, but they don't want to be evaluated. So, maybe a joke about data privacy or security? Or something related to the data being processed without being monitored or monitored?
</think>

Data scientists, don't worry, we're safe!<｜end▁of▁sentence｜>
