In [3]:
import os
import json

# Root folder holding one sub-folder of JSON transcripts per candidate.
base_path = '../output'
candidate_folders = ['bullrich', 'massa', 'milei']

# Store all transcribed phrases for each candidate
transcriptions = {candidate: [] for candidate in candidate_folders}

for candidate in candidate_folders:
    folder_path = os.path.join(base_path, candidate)

    # os.listdir() returns entries in arbitrary order; sorting makes positional
    # lookups into transcriptions[candidate] reproducible across runs/machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue
        filepath = os.path.join(folder_path, filename)

        # Decode explicitly as UTF-8 instead of the platform's locale default,
        # so non-ASCII characters in the transcripts load identically everywhere.
        with open(filepath, 'r', encoding='utf-8') as f:
            content = json.load(f)

        # A file may hold a single transcript (dict) or several (list);
        # any other top-level JSON type is ignored.
        if isinstance(content, dict):
            transcriptions[candidate].append(content)
        elif isinstance(content, list):
            transcriptions[candidate].extend(content)

In [8]:
!pip install vllm

Collecting vllm
  Obtaining dependency information for vllm from https://files.pythonhosted.org/packages/26/ae/804920b9bb72503d2c7ee12f4781306472aa8c24185d03cc29e0a012675a/vllm-0.1.7-cp310-cp310-manylinux1_x86_64.whl.metadata
  Downloading vllm-0.1.7-cp310-cp310-manylinux1_x86_64.whl.metadata (6.7 kB)
Collecting ninja (from vllm)
  Downloading ninja-1.11.1-py2.py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (145 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m146.0/146.0 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
Collecting ray>=2.5.1 (from vllm)
  Obtaining dependency information for ray>=2.5.1 from https://files.pythonhosted.org/packages/82/e9/d7d85bdc8b1b3101c760d42a63493b8b4092c9ade9dce9f8240b328e488a/ray-2.7.0-cp310-cp310-manylinux2014_x86_64.whl.metadata
  Downloading ray-2.7.0-cp310-cp310-manylinux2014_x86_64.whl.metadata (13 kB)
Collecting transformers>=4.33.1 (from vllm)
  Obtaining dependency information for transformers>=4.33.1 from https://file

In [9]:
from vllm import LLM, SamplingParams

In [10]:
# Sampling configuration shared by every prompt in the batch.
sampling_params = SamplingParams(top_p=0.95, temperature=0.8)

# Short open-ended prompts used to smoke-test generation.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

In [11]:
# Build the vLLM engine for the facebook/opt-125m checkpoint; the recorded
# output below shows the model files being downloaded when this cell ran.
llm = LLM(model="facebook/opt-125m")

Downloading (…)lve/main/config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

INFO 09-17 19:24:34 llm_engine.py:72] Initializing an LLM engine with config: model='facebook/opt-125m', tokenizer='facebook/opt-125m', tokenizer_mode=auto, trust_remote_code=False, dtype=torch.float16, download_dir=None, load_format=auto, tensor_parallel_size=1, seed=0)


Downloading (…)okenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

INFO 09-17 19:24:40 llm_engine.py:199] # GPU blocks: 34634, # CPU blocks: 7281


In [12]:
# Run the whole prompt batch through the engine in one call.
outputs = llm.generate(prompts, sampling_params)

# Show each prompt next to the text of its first returned completion.
for request_output in outputs:
    first_completion = request_output.outputs[0]
    print(f"Prompt: {request_output.prompt!r}, Generated text: {first_completion.text!r}")

Processed prompts: 100%|██████████| 4/4 [00:00<00:00, 31.41it/s]

Prompt: 'Hello, my name is', Generated text: " Joel. I'm from Massachusetts and live in Melbourne, Australia.\nI'm"
Prompt: 'The president of the United States is', Generated text: ' about to be arrested in Europe for allegedly meddling in the 2016 election.\n\n'
Prompt: 'The capital of France is', Generated text: ' becoming a state of chaos with a significant urban and industrial boom. France’'
Prompt: 'The future of AI is', Generated text: ' not as simple as you think, and you have to understand it in order to'





In [30]:
# One prompt per segment of the first 'milei' transcript, whitespace-trimmed.
# The key is spelled out instead of reusing `candidate`, which is only a
# leftover loop variable from the loading cell (its final value is 'milei').
prompts = [
    segment['text'].strip()
    for segment in transcriptions['milei'][0]['segments']
]

In [42]:
# Template for one dialogue line: speaker label, then what was said.
base = "{Who}: {Said}"

# Index the transcript by the explicit 'milei' key: the "Milei" label below is
# hard-coded, so relying on the leaked loop variable `candidate` from the
# loading cell could silently pair the label with another candidate's data.
prompts = []
for segment in transcriptions['milei'][150]['segments']:
    # Segments flagged is_candidate are attributed to the candidate,
    # everything else to the interviewer.
    speaker = "Milei" if segment['is_candidate'] else "Host"
    prompts.append(base.format(Who=speaker, Said=segment['text'].strip()))

In [45]:
# Total number of characters across all prompts.
sum(map(len, prompts))

18051.0

In [47]:
!pip install ctransformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting ctransformers
  Obtaining dependency information for ctransformers from https://files.pythonhosted.org/packages/14/50/0b608e2abee4fc695b4e7ff5f569f5d32faf84a49e322034716fa157d1cf/ctransformers-0.2.27-py3-none-any.whl.metadata
  Downloading ctransformers-0.2.27-py3-none-any.whl.metadata (17 kB)
Collecting py-cpuinfo<10.0.0,>=9.0.0 (from ctransformers)
  Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)
Downloading ctransformers-0.2.27-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: py-cpuinfo, ctransformers
Successfully installed ctransformers-0.2

In [52]:
from ctransformers import AutoModelForCausalLM

# gpu_layers is how many layers get offloaded to the GPU; use 0 on a system
# without GPU acceleration.
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Airoboros-L2-13B-2_1-YaRN-64K-GGUF",
    model_type="llama",
    model_file="airoboros-l2-13b-2.1-yarn-64k.Q4_K_M.gguf",
    gpu_layers=50,
)

# Quick smoke test: complete a short prompt and show the result.
print(llm("AI is going to"))

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

OSError: libcudart.so.12: cannot open shared object file: No such file or directory

In [51]:
!pip uninstall torchaudio -y

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Found existing installation: torchaudio 2.0.2
Uninstalling torchaudio-2.0.2:
  Successfully uninstalled torchaudio-2.0.2
