In [None]:
import torch

def gpu_memory_report(device_index: int = 0) -> list:
    """Build the CUDA / GPU memory summary as a list of printable lines.

    Args:
        device_index: Index of the CUDA device to inspect (defaults to 0).

    Returns:
        A list of strings. The first entry always reports the CUDA version
        PyTorch was built against (``None`` for CPU-only builds). When a CUDA
        device is available, four more entries report total, allocated,
        reserved ("cached") and remaining memory in GiB; otherwise a single
        notice explains that the statistics were skipped.
    """
    bytes_per_gib = 1024**3

    # torch.version.cuda is None on CPU-only builds, so it must go through
    # str() before being concatenated.
    lines = ["CUDA version used is : " + str(torch.version.cuda)]

    # Querying device properties without a usable GPU raises, so bail out early.
    if not torch.cuda.is_available():
        lines.append("No CUDA device is available; GPU memory statistics skipped.")
        return lines

    total = torch.cuda.get_device_properties(device_index).total_memory
    allocated = torch.cuda.memory_allocated(device_index)
    reserved = torch.cuda.memory_reserved(device_index)

    lines.append("Total GPU memory is : " + str(total / bytes_per_gib) + " GB")
    lines.append("Total GPU memory used is : " + str(allocated / bytes_per_gib) + " GB")
    lines.append("Total GPU memory cached is : " + str(reserved / bytes_per_gib) + " GB")
    # NOTE: "free" here is total minus what this process's tensors occupy; it
    # does not account for the allocator's cache or for other processes.
    lines.append("Total GPU memory free is : " + str((total - allocated) / bytes_per_gib) + " GB")
    return lines


print("\n".join(gpu_memory_report()))

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
import gc
import torch
from transformers import BitsAndBytesConfig

# SECURITY: never commit a Hugging Face token to a notebook. The token is read
# from the HF_TOKEN environment variable instead; any token that was previously
# hardcoded here must be considered leaked and revoked in the Hub settings.
# When HF_TOKEN is unset this is None and the libraries are expected to fall
# back to locally stored credentials (e.g. from `huggingface-cli login`).
access_token = os.environ.get("HF_TOKEN")
model_id = "mistralai/Mistral-7B-Instruct-v0.3"

cache_dir = "./models"
os.makedirs(cache_dir, exist_ok=True)

# Release whatever an earlier cell loaded before allocating a new model.
# The old objects must be dereferenced first, then garbage-collected, and only
# then can the CUDA caching allocator hand the freed blocks back to the driver.
model = None
tokenizer = None
gc.collect()
torch.cuda.empty_cache()

# Configuration for 4-bit quantization (weights in 4 bits, compute in fp16)
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# Load the model with 4-bit quantization; device_map="auto" lets the library
# decide where each layer is placed.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir=cache_dir,
    token=access_token,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
)

# Tokenizer matching the checkpoint above, stored in the same local cache.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    cache_dir=cache_dir,
    token=access_token,
)

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
import gc
import torch
from transformers import BitsAndBytesConfig

# SECURITY: never commit a Hugging Face token to a notebook. The token is read
# from the HF_TOKEN environment variable instead; any token that was previously
# hardcoded here must be considered leaked and revoked in the Hub settings.
# When HF_TOKEN is unset this is None and the libraries are expected to fall
# back to locally stored credentials (e.g. from `huggingface-cli login`).
access_token = os.environ.get("HF_TOKEN")
model_id = "bofenghuang/vigogne-2-7b-instruct"

cache_dir = "./models"
os.makedirs(cache_dir, exist_ok=True)

# Release whatever an earlier cell loaded before allocating a new model.
# The old objects must be dereferenced first, then garbage-collected, and only
# then can the CUDA caching allocator hand the freed blocks back to the driver.
model = None
tokenizer = None
gc.collect()
torch.cuda.empty_cache()

# Configuration for 4-bit quantization (weights in 4 bits, compute in fp16)
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# Load the model with 4-bit quantization; device_map="auto" lets the library
# decide where each layer is placed.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir=cache_dir,
    token=access_token,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
)

# Tokenizer matching the checkpoint above, stored in the same local cache.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    cache_dir=cache_dir,
    token=access_token,
)

In [None]:
input_text = r"""Explique en français qui est Thomas Pesquet et ce qu'il a fait"""

# The model was loaded with device_map="auto", so ask the model where its
# inputs should live instead of guessing from torch.cuda.is_available().
device = model.device

# Calling the tokenizer (rather than .encode) also yields the attention mask,
# which generate() needs to be unambiguous because pad_token_id == eos_token_id.
encoded = tokenizer(input_text, return_tensors="pt").to(device)
input_ids = encoded["input_ids"]

with torch.no_grad():
    output = model.generate(
        input_ids=input_ids,
        attention_mask=encoded["attention_mask"],
        # Budget is expressed in *new tokens*. The previous
        # `max_length=2048 + len(input_text)` added a character count to a
        # token count, which are different units.
        max_new_tokens=2048,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        no_repeat_ngram_size=2,  # Prevent repetition of 2-grams
        temperature=0.7,         # Control the creativity of the model
        top_k=50,                # Top-k sampling
        top_p=0.9,               # Nucleus sampling
    )

# The decoded text contains the prompt followed by the model's continuation.
print(tokenizer.decode(output[0], skip_special_tokens=True))

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
import gc
import torch
from transformers import BitsAndBytesConfig

# SECURITY: never commit a Hugging Face token to a notebook. The token is read
# from the HF_TOKEN environment variable instead; any token that was previously
# hardcoded here must be considered leaked and revoked in the Hub settings.
# When HF_TOKEN is unset this is None and the libraries are expected to fall
# back to locally stored credentials (e.g. from `huggingface-cli login`).
access_token = os.environ.get("HF_TOKEN")
model_id = "google/gemma-2b-it"

cache_dir = "./models"
os.makedirs(cache_dir, exist_ok=True)

# Release whatever an earlier cell loaded before allocating a new model.
# The old objects must be dereferenced first, then garbage-collected, and only
# then can the CUDA caching allocator hand the freed blocks back to the driver.
model = None
tokenizer = None
gc.collect()
torch.cuda.empty_cache()

# Configuration for 4-bit quantization (weights in 4 bits, compute in fp16)
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# Load the model with 4-bit quantization; device_map="auto" lets the library
# decide where each layer is placed.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir=cache_dir,
    token=access_token,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
)

# Tokenizer matching the checkpoint above, stored in the same local cache.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    cache_dir=cache_dir,
    token=access_token,
)

In [1]:
from typing import Dict, List, Optional
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, TextStreamer

# Checkpoint and pinned revision used by the chat helper below.
model_name_or_path = "bofenghuang/vigogne-2-7b-chat"
revision = "v2.0"

# Slow (non-fast) tokenizer with right-side padding, and the fp16 model with
# automatic device placement.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, revision=revision, padding_side="right", use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, revision=revision, torch_dtype=torch.float16, device_map="auto")

# Streams decoded tokens to stdout as they are generated, hiding the prompt.
# `timeout` was dropped: it is a TextIteratorStreamer option; TextStreamer has
# no such parameter and would only forward it to tokenizer.decode() as an
# unrelated keyword.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)


def chat(
    query: str,
    history: Optional[List[Dict]] = None,
    temperature: float = 0.7,
    top_p: float = 1.0,
    top_k: int = 0,
    repetition_penalty: float = 1.1,
    max_new_tokens: int = 1024,
    **kwargs,
):
    """Run one chat turn against the module-level ``model`` and ``tokenizer``.

    The reply is streamed to stdout through the module-level ``streamer``
    while it is generated, and is also returned as a string.

    Args:
        query: The user's message for this turn.
        history: Previous turns as ``{"role": ..., "content": ...}`` dicts.
            ``None`` starts a new conversation. The list passed in is not
            modified; use the returned history for the next turn.
        temperature: Sampling temperature. A value ``<= 0`` switches to
            greedy decoding (``do_sample=False``).
        top_p: Nucleus-sampling threshold (used only when sampling).
        top_k: Top-k cutoff (used only when sampling).
        repetition_penalty: Penalty applied to already generated tokens.
        max_new_tokens: Maximum number of tokens to generate for the reply.
        **kwargs: Extra ``GenerationConfig`` fields; these take precedence
            over the values derived from the named parameters.

    Returns:
        A ``(generated_text, history)`` tuple where ``history`` is a new list
        containing the prior turns plus this user query and the reply.
    """
    # Copy so the caller's list is never mutated behind their back.
    history = list(history) if history is not None else []
    history.append({"role": "user", "content": query})

    input_ids = tokenizer.apply_chat_template(history, add_generation_prompt=True, return_tensors="pt").to(model.device)
    input_length = input_ids.shape[1]

    do_sample = temperature > 0.0
    config_kwargs = {
        "do_sample": do_sample,
        "repetition_penalty": repetition_penalty,
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        # Sampling-only knobs are omitted for greedy decoding, where they are
        # meaningless (and a non-positive temperature is not a valid value).
        config_kwargs.update(temperature=temperature, top_p=top_p, top_k=top_k)
    # Merge instead of splatting next to explicit keywords so that a caller
    # override does not raise "got multiple values for keyword argument".
    config_kwargs.update(kwargs)

    generated_outputs = model.generate(
        input_ids=input_ids,
        # Single unpadded sequence: every position is a real token. Passing the
        # mask explicitly avoids ambiguity since pad_token_id == eos_token_id.
        attention_mask=torch.ones_like(input_ids),
        generation_config=GenerationConfig(**config_kwargs),
        streamer=streamer,
        return_dict_in_generate=True,
    )

    # Keep only the newly generated part, i.e. drop the prompt tokens.
    generated_tokens = generated_outputs.sequences[0, input_length:]
    generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    history.append({"role": "assistant", "content": generated_text})

    return generated_text, history


# Three-round demo conversation: each call receives the history returned by
# the previous one, starting from an empty conversation (history=None).
history = None
for prompt in (
    "Un escargot parcourt 100 mètres en 5 heures. Quelle est sa vitesse ?",
    "Quand il peut dépasser le lapin ?",
    "Écris une histoire imaginative qui met en scène une compétition de course entre un escargot et un lapin.",
):
    response, history = chat(prompt, history=history)


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]



La vitesse de l'escargot peut être calculée en divisant la distance parcourue (d) par le temps (t) :

vitesse = d 

KeyboardInterrupt: 