In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub interactively; later cells call push_to_hub().
notebook_login()

# RAG

In [None]:
!pip install -q langchain sentence-transformers langchain-community

In [None]:
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

In [None]:
# Load the Dolly 15k dataset from the Hugging Face Hub, using its "context"
# column as the page content of each loaded document.
dataset_name = "databricks/databricks-dolly-15k"
page_content_column = "context"  
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)
data = loader.load()

In [None]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=150)
# so each piece can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
docs = text_splitter.split_documents(data)

In [None]:
# Sentence-embedding model used to vectorise the chunks: runs on CPU and
# returns un-normalised embeddings.
modelPath = "sentence-transformers/all-MiniLM-l6-v2"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)
embeddings_model = HuggingFaceEmbeddings(model_name=modelPath, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

In [None]:
# Embed a sample query; the cell output shows the value returned by embed_query().
embeddings_model.embed_query("hello")

In [None]:
!pip install faiss-cpu faiss-gpu

In [None]:
# Embed every chunk and build a FAISS index over the vectors.
# (The embedding object in this notebook is `embeddings_model`; `embeddings` was never defined.)
db = FAISS.from_documents(docs, embeddings_model)

In [None]:
# Query the index and print the page content of the first document returned.
question = "what is python"
searchDocs = db.similarity_search(question)
print(searchDocs[0].page_content)

In [None]:
# Persist the index to the "MyRAG" folder so it can be reloaded with FAISS.load_local().
db.save_local("MyRAG")

In [None]:
# Reload the saved index with the same embedding model that built it.
# allow_dangerous_deserialization=True is required because the index metadata is
# pickled; only load folders you created yourself.
new_db = FAISS.load_local("MyRAG", embeddings_model, allow_dangerous_deserialization=True)

In [None]:
# Run the same query against the reloaded index. Keep the list of documents in
# `searchDocs` (as in the earlier search cell) and read page_content only once;
# the original indexed into a string and raised AttributeError.
question = "what is python"
searchDocs = new_db.similarity_search(question)
print(searchDocs[0].page_content)

# understand images

In [None]:
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
from PIL import Image
import requests
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## quantize

In [None]:
!pip install -q -U accelerate bitsandbytes

In [None]:
# Hub id of the PaliGemma checkpoint that the following cells load with 4-bit quantization.
model_id = "google/paligemma-3b-mix-224"

In [None]:
from transformers import BitsAndBytesConfig

# bitsandbytes settings: 4-bit loading, "nf4" quantization type, double
# quantization enabled, bfloat16 as compute dtype.
_nf4_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
nf4_config = BitsAndBytesConfig(**_nf4_options)

In [None]:
# Load PaliGemma with the NF4 quantization config, plus its matching processor.
model = PaliGemmaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    quantization_config=nf4_config,
)
processor = AutoProcessor.from_pretrained(model_id)

In [None]:
# Upload the quantized model to the Hub repository "Arthur-LAGACHERIE/PaliGemma-4bit".
model.push_to_hub("Arthur-LAGACHERIE/PaliGemma-4bit")

In [None]:
# Upload the processor to the same repository so model and processor can be loaded from one id.
processor.push_to_hub("Arthur-LAGACHERIE/PaliGemma-4bit")

## use

In [None]:
!pip install -q bitsandbytes accelerate

In [None]:
# Hub id of the quantized PaliGemma repository pushed in the "quantize" section.
model_id = "Arthur-LAGACHERIE/PaliGemma-4bit"

In [None]:
# Load the quantized checkpoint in eval mode together with its processor.
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval()
processor = AutoProcessor.from_pretrained(model_id)

In [None]:
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
# Use a timeout and check the HTTP status so a network problem fails here with a
# clear error instead of hanging or handing PIL an error page.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

prompt = "describe the photo"
# The processor returns CPU tensors; move them to the device the model lives on
# so generate() does not fail with a device mismatch.
model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
# Length of the prompt tokens, used below to strip the prompt from the output.
input_len = model_inputs["input_ids"].shape[-1]

with torch.inference_mode():
    generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
    # Keep only the newly generated tokens.
    generation = generation[0][input_len:]
    decoded = processor.decode(generation, skip_special_tokens=True)
    print(decoded)

## quantize gemma

In [None]:
# Hub id of the Gemma checkpoint that the following cells quantize.
model_id = "google/gemma-2-2b-it"

In [None]:
from transformers import BitsAndBytesConfig

# Same bitsandbytes settings as for PaliGemma: 4-bit "nf4" quantization with
# double quantization and bfloat16 compute dtype.
nf4_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [None]:
# AutoModelForCausalLM is only imported much further down the notebook, so
# import it here to keep this cell runnable top-to-bottom.
from transformers import AutoModelForCausalLM

# Load Gemma with the NF4 quantization config defined above.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    quantization_config=nf4_config,
)

In [None]:
# AutoTokenizer is first imported in a later section; import it here so this
# cell works on a fresh kernel.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Upload the quantized Gemma model and its tokenizer to the same Hub repository.
model.push_to_hub("Arthur-LAGACHERIE/Gemma-2-2b-4bit")
tokenizer.push_to_hub("Arthur-LAGACHERIE/Gemma-2-2b-4bit")

# Stable Diffusion v1.5 (text-to-image)

In [None]:
!pip install diffusers

In [None]:
import torch
from diffusers import StableDiffusionPipeline
from diffusers import DPMSolverMultistepScheduler
import random

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# NOTE: model_id is not used below; the pipeline is built from the single-file
# checkpoint URL passed to from_single_file().
model_id = "runwayml/stable-diffusion-v1-5"

pipe = StableDiffusionPipeline.from_single_file("https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors")

# Replace the pipeline's scheduler with DPMSolverMultistepScheduler, reusing the existing scheduler config.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to(device)

In [None]:
prompt = "a lion on the beach"
# Torch generator on `device`, seeded with a random integer in [0, 1500] (randint is inclusive).
generator = torch.Generator(device).manual_seed(random.randint(0, 1500))

In [None]:
# Run the pipeline for 50 steps, asking for four images; `image` holds the
# returned list of images and is displayed as the cell output.
image = pipe(
    prompt,
    generator=generator,
    num_inference_steps=50,
    num_images_per_prompt=4,
).images
image

In [None]:
# Display the first of the generated images.
image[0]

# audio parler

In [None]:
!pip install -q git+https://github.com/huggingface/parler-tts.git

In [None]:
import torch
from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
from transformers import AutoTokenizer
from threading import Thread
import IPython
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Load the Parler-TTS mini tokenizer and model; the model is moved to `device` and cast to float16.
tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-v1").to(device, dtype=torch.float16)

In [None]:
# `prompt` and `description` are tokenized separately: the description tokens
# are passed as input_ids and the prompt tokens as prompt_input_ids.
prompt = "hello, how are you? I'm fine thank you, and you?"
description = "A man speaker"

input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
prompt_input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
# Move the result to CPU, cast to float32 and drop singleton dimensions before playback.
audio_arr = generation.cpu().float().numpy().squeeze()
IPython.display.Audio(audio_arr, rate=model.config.sampling_rate)

# audio gTTS

In [None]:
!pip install gTTS

In [26]:
from gtts import gTTS
import IPython
# Build a gTTS object for the given sentence.
tts = gTTS('hello, how are you? I am fine thank you')

In [27]:
# Display one autoplaying audio widget per item yielded by tts.stream().
for i in tts.stream():
    IPython.display.display(IPython.display.Audio(i, autoplay=True))

# whisper

In [1]:
!sudo apt-get install -y libasound-dev portaudio19-dev libportaudio2 libportaudiocpp0

Reading package lists... Done
Building dependency tree       
Reading state information... Done
Note, selecting 'libasound2-dev' instead of 'libasound-dev'
The following additional packages will be installed:
  libjack-dev libjack0 libuuid1 uuid-dev
Suggested packages:
  libasound2-doc jackd1 portaudio19-doc
Recommended packages:
  uuid-runtime
The following packages will be REMOVED:
  libjack-jackd2-0
The following NEW packages will be installed:
  libasound2-dev libjack-dev libjack0 libportaudio2 libportaudiocpp0
  portaudio19-dev uuid-dev
The following packages will be upgraded:
  libuuid1
1 upgraded, 7 newly installed, 1 to remove and 68 not upgraded.
Need to get 645 kB of archives.
After this operation, 3340 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 libjack0 amd64 1:0.125.0-3build2 [93.3 kB]
Get:2 http://archive.ubuntu.com/ubuntu focal-updates/main amd64 libuuid1 amd64 2.34-0.1ubuntu9.6 [20.0 kB]
Get:3 http://archive.ubun

In [2]:
!pip install pyaudio keyboard

Collecting pyaudio
  Downloading PyAudio-0.2.14.tar.gz (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting keyboard
  Downloading keyboard-0.13.5-py3-none-any.whl.metadata (4.0 kB)
Downloading keyboard-0.13.5-py3-none-any.whl (58 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.1/58.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyaudio
  Building wheel for pyaudio (pyproject.toml) ... [?25ldone
[?25h  Created wheel for pyaudio: filename=PyAudio-0.2.14-cp310-cp310-linux_x86_64.whl size=27851 sha256=764893d3e7a4b105f38b20aa960bdfa241ca27b4e2dcef4b0ce1c06e7a4b486d
  Stored in directory: /root/.cache/pip/wheels/d6/21/f4/0b51d41ba79e51b16295cbb096ec49f334792814d5

In [9]:
!pip install recorder

Collecting recorder
  Downloading recorder-0.0.2.tar.gz (3.2 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: recorder
  Building wheel for recorder (setup.py) ... [?25ldone
[?25h  Created wheel for recorder: filename=recorder-0.0.2-py3-none-any.whl size=3330 sha256=6ee392ccf3b121d16482689c939aac9b7927d1b8b7fbc2c5b2232c8020e1b717
  Stored in directory: /root/.cache/pip/wheels/2a/cc/34/15787083f8e19b106ba25f0aa0dcbe7fcc09956837a4a9ed25
Successfully built recorder
Installing collected packages: recorder
Successfully installed recorder-0.0.2


In [None]:
from recorder import Recorder
# Record from input device 0 into 'out.mp3'. The first argument (5) is
# presumably the duration in seconds — TODO confirm against the recorder package.
r = Recorder(input_device_index=0)
r.record(5, output='out.mp3')

In [2]:
import IPython
# Play back the file written by the Recorder cell ('out.mp3'). The original
# pointed at "out.wav", which no cell in this notebook writes.
IPython.display.display(IPython.display.Audio("out.mp3", autoplay=True))

In [2]:
import pyaudio
import wave

In [None]:
import pyaudio
import wave

# Recording parameters.
CHUNK = 1024                 # frames read per stream.read() call
FORMAT = pyaudio.paInt16     # sample format passed to PyAudio
CHANNELS = 2
RATE = 44100                 # sampling rate in Hz
RECORD_SECONDS = 5
WAVE_OUTPUT_FILENAME = "voice.wav"

p = pyaudio.PyAudio()
frames = []

try:
    # Read the sample width while the PyAudio session is still open.
    sample_width = p.get_sample_size(FORMAT)

    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK,
                    input_device_index=0)
    try:
        print("* recording")

        # Number of CHUNK-sized reads needed to cover RECORD_SECONDS of audio.
        # The read result is appended directly instead of going through a
        # `data` variable, so the dataset loaded earlier into `data` is not clobbered.
        for _ in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
            frames.append(stream.read(CHUNK))

        print("* done recording")
    finally:
        # Always release the input stream, even if reading failed.
        stream.stop_stream()
        stream.close()
finally:
    # Always shut down the PyAudio session.
    p.terminate()

# The context manager guarantees the WAV file is finalised and closed.
with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(sample_width)
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))

# chatbot

In [None]:
# SECURITY: a Hugging Face access token was pasted here in plain text and has
# been removed. Revoke that token in your Hub account settings and authenticate
# through notebook_login() (next cell) or an environment variable / Kaggle
# secret instead of storing credentials in the notebook.

In [1]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login for the chatbot section.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
!pip install -q sentence-transformers faiss-cpu langchain langchain-community bitsandbytes accelerate gTTS diffusers

In [22]:
# record audio
import pyaudio

# generate audio
from gtts import gTTS
import IPython

# generate image 
from diffusers import StableDiffusionPipeline
from diffusers import DPMSolverMultistepScheduler
import random
import matplotlib.pyplot as plt

# understand images
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
from PIL import Image
import requests

# RAG
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# chatbot 
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
from transformers import TextIteratorStreamer
from threading import Thread
import copy

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# Hub repository that the "quantize gemma" section pushed the quantized model and tokenizer to.
model_id = "Arthur-LAGACHERIE/Gemma-2-2b-4bit"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
# Left disabled: the model is not moved to `device` explicitly in this cell.
#model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/2.22G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [10]:
def download_embd_model(name, device="cpu"):
    """Build a HuggingFaceEmbeddings wrapper for the given model.

    Args:
        name: Model name or path passed to HuggingFaceEmbeddings as model_name.
        device: Device string forwarded through model_kwargs (default "cpu").

    Returns:
        The HuggingFaceEmbeddings instance, configured with
        normalize_embeddings=False.
    """
    return HuggingFaceEmbeddings(
        model_name=name,
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': False},
    )


def describe_image(text, model, processor):
    """Describe the image referenced between "<img>" tags in `text`.

    Args:
        text: User message containing "<img>path-or-URL<img>"; the part after
            the first "<img>" tag is used as the image location.
        model: Image-to-text model exposing generate() and a `device` attribute.
        processor: Matching processor used to build model inputs and to decode.

    Returns:
        The decoded description, with the prompt tokens and special tokens removed.

    Raises:
        requests.HTTPError: If the image URL answers with an error status.
    """
    img_path_or_URL = text.split("<img>")[1].strip()
    print("\033[91m=> searching image at", img_path_or_URL, "\033[0m")

    # Only treat the reference as a URL when it starts with an HTTP scheme; a
    # plain substring test would misroute local paths that contain "http".
    if img_path_or_URL.startswith(("http://", "https://")):
        # Timeout and status check: fail clearly instead of hanging or
        # handing PIL an error page.
        response = requests.get(img_path_or_URL, stream=True, timeout=30)
        response.raise_for_status()
        image = Image.open(response.raw)
    else:
        image = Image.open(img_path_or_URL)

    prompt = "describe this image"
    # The processor returns CPU tensors; move them to the model's device so
    # generate() does not hit a device mismatch.
    model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
    input_len = model_inputs["input_ids"].shape[-1]

    with torch.inference_mode():
        generation = model.generate(**model_inputs, max_new_tokens=128, do_sample=False)
        # Drop the prompt tokens and keep only the generated continuation.
        generation = generation[0][input_len:]
        decoded = processor.decode(generation, skip_special_tokens=True)

    return decoded
    
def generate_img(prompt, pipe):
    """Generate a single image for `prompt` with the given diffusion pipeline.

    Args:
        prompt: Text prompt passed to the pipeline.
        pipe: Pipeline object; it is called with the prompt and must expose a
            `device` attribute and return an object with an `images` list.

    Returns:
        The first image of the pipeline output.
    """
    # Create the generator on the pipeline's own device rather than on the
    # notebook-global `device`, so the function works for any pipeline it is
    # handed (e.g. load_sd(device=...) with a non-default device).
    seed = random.randint(0, 1500)
    generator = torch.Generator(pipe.device).manual_seed(seed)
    image = pipe(prompt, generator=generator, num_inference_steps=50, num_images_per_prompt=1).images[0]
    return image

def load_sd(device="cuda"):
    """Load the Stable Diffusion v1.5 pipeline from its single-file checkpoint.

    Args:
        device: Device the pipeline is moved to (default "cuda").

    Returns:
        The pipeline, with its scheduler replaced by a
        DPMSolverMultistepScheduler built from the original scheduler config.
    """
    checkpoint_url = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors"
    sd_pipe = StableDiffusionPipeline.from_single_file(checkpoint_url)
    sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
    return sd_pipe.to(device)


def generate_audio(text):
    """Convert `text` to speech with gTTS, save it as 'audio.mp3' and autoplay it."""
    mp3_path = 'audio.mp3'
    gTTS(text).save(mp3_path)
    player = IPython.display.Audio(mp3_path, autoplay=True)
    IPython.display.display(player)
    

def generate(model, tokenizer, use_rag=True, use_img_desc=True, use_img_gen=True, use_audio_gen=False, embd_model_name="sentence-transformers/all-MiniLM-l6-v2", RAG_folder="path/to/your/folder"):
    """Run an interactive chat loop on stdin/stdout until the user types "x".

    Each turn optionally retrieves context from a FAISS index, describes an
    image referenced as "<img>path-or-URL<img>", streams the model's answer,
    renders an image when the answer contains "<imggen>prompt<imggen>", and
    reads the answer aloud.

    Args:
        model: Causal language model exposing generate() and `device`.
        tokenizer: Tokenizer with a chat template matching `model`.
        use_rag: Load the FAISS index from `RAG_folder` and add the best hit as
            context. When False, the context is left empty.
        use_img_desc: Describe images referenced with "<img>" tags.
        use_img_gen: Generate an image when the answer contains a complete
            "<imggen>...<imggen>" pair. The system prompt mentions the tag
            regardless of this flag.
        use_audio_gen: Speak each answer with generate_audio().
        embd_model_name: Embedding model used to load the FAISS index.
        RAG_folder: Folder containing the saved FAISS index.

    Returns:
        None. All output goes to stdout / notebook display.
    """
    history = ""

    db = None
    if use_rag:
        print("\033[91m=> download RAG\033[0m")
        embeddings_model = download_embd_model(name=embd_model_name)
        # NOTE: allow_dangerous_deserialization unpickles the index metadata;
        # only point RAG_folder at an index you created yourself.
        db = FAISS.load_local(RAG_folder, embeddings_model, allow_dangerous_deserialization=True)

    # Optional heavy models are loaded lazily on first use and then reused.
    model_img = None
    processor = None
    pipe_img_gen = None

    question = input("User:")
    while question != "x":
        img_generated = False

        # RAG: use the best hit, or an empty context when RAG is disabled or
        # the search returns nothing.
        context = ""
        if db is not None:
            hits = db.similarity_search(question)
            if hits:
                context = hits[0].page_content

        # understand images
        img_desc = "None"
        if "<img>" in question and use_img_desc:
            if model_img is None:
                print("\033[91m=> download image descriptor\033[0m")
                model_id = "Arthur-LAGACHERIE/PaliGemma-4bit"
                model_img = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval()
                processor = AutoProcessor.from_pretrained(model_id)

            print("\033[91m=> get image description\033[0m")
            img_desc = describe_image(question, model_img, processor)
            print(img_desc)

            # Strip the "<img>" tags (the reference itself stays in the text)
            # and terminate the sentence; retrieved context is dropped for
            # image questions.
            question = question.replace("<img>", "") + " ."
            context = ""

        # Prompt
        prompt = """
### Context: {context}
### Image description: {img_desc}
### System: With the precedent context, discussion, and your personal knowledge answer at the following question.
If the question is to describe an image use the Image Description.
You can generate an image by writing "<imggen>prompt for generation<imggen>"
### Question: {question}
        """.format(context=context, question=question, img_desc=img_desc)

        chat = [
            { "role": "user", "content": prompt},
        ]
        turn_text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        history = history + turn_text

        # The templated text is expected to already contain the special tokens
        # it needs, so they must not be added again when tokenizing (assumes the
        # tokenizer's chat template emits BOS, as Gemma's does — confirm for
        # other tokenizers). Inputs go to the model's own device.
        model_inputs = tokenizer(history, return_tensors="pt", add_special_tokens=False).to(model.device)
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
        generation_kwargs = dict(model_inputs, streamer=streamer, max_new_tokens=1500)

        # generate() runs in a background thread while this thread consumes
        # the streamed text.
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        print("Asssistant: ", end="")
        thread.start()
        answer = ""
        for new_text in streamer:
            print(new_text, end="")
            answer += new_text
            history += new_text

            # A complete "<imggen>...<imggen>" pair has been streamed: render it once.
            if use_img_gen and not img_generated and answer.count("<imggen>") >= 2:
                if pipe_img_gen is None:
                    pipe_img_gen = load_sd()

                img_prompt = answer.split("<imggen>")[1]
                print("\033[91m=> generate image of {prompt}\033[0m".format(prompt=img_prompt))
                image = generate_img(img_prompt, pipe_img_gen)
                plt.imshow(image)
                plt.show()

                img_generated = True

        # The streamer is exhausted once generation ends; join so the worker
        # thread is fully finished before the next turn.
        thread.join()

        if use_audio_gen:
            spoken = answer.replace('<end_of_turn>', '')
            # gTTS rejects empty text, so skip speech for blank answers.
            if spoken.strip():
                generate_audio(spoken)

        question = input("User:")


In [None]:
# Start the interactive chat with spoken answers, using the FAISS index saved under /kaggle/working/MyRAG.
generate(model, tokenizer, use_audio_gen=True, RAG_folder="/kaggle/working/MyRAG")

[91m=> download RAG[0m


User: hello how are you


Asssistant: hello! I'm doing well, thank you. 😊 

How are you? 
<end_of_turn>

In [None]:
# Example message to type at the "User:" prompt of generate() (not Python code):
# describe the images <img>https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true<img>

In [None]:
# Example image-only message for the "User:" prompt of generate() (not Python code):
# <img>https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true<img>