In [1]:
from pathlib import Path
import os
import sys
import gc
import shutil
import json
import math
from collections import defaultdict
import numpy as np
import pandas as pd
import bitsandbytes
import accelerate
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from tqdm import tqdm
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple, NamedTuple, Callable, Iterable, Set, Optional, Any
import scml
from mylib.datageneration import Prompter
# Log the installed accelerate and bitsandbytes versions alongside the run's output.
print(f"accelerate={accelerate.__version__}, bitsandbytes={bitsandbytes.__version__}")

accelerate=0.29.2, bitsandbytes=0.43.1


In [2]:
# Run configuration: everything a reader might want to tune lives in this cell.
seed = 422  # seed for the run's random number generators
model_max_length = 4096  # maximum sequence length handed to the tokenizer
# Local directory holding the model checkpoint.
model_dir = Path("/mnt/c/huggingface/mistralai/Mixtral-8x7B-Instruct-v0.1")
# Number of essays to generate (30 * 6 = 180).
# NOTE(review): the meaning of the two factors is not documented here; confirm.
target_size = 30 * 6

In [3]:
# Start the timer that the final cell stops to report total elapsed time.
tim = scml.Timer()
tim.start()

percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]

# Turn off parallelism inside the Hugging Face tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Lift pandas display limits so wide frames and long text cells are shown in full.
for option_name in ("max_info_columns", "display.max_columns", "display.max_rows", "max_colwidth"):
    pd.set_option(option_name, 9999)

tqdm.pandas()
scml.seed_everything(seed)

# Prefer CUDA when available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    bytes_per_gib = 1024**3
    # Report the name and current memory usage of every visible GPU.
    for i in range(torch.cuda.device_count()):
        allocated_gib = round(torch.cuda.memory_allocated(i) / bytes_per_gib, 1)
        reserved_gib = round(torch.cuda.memory_reserved(i) / bytes_per_gib, 1)
        print(f"device={i}, {torch.cuda.get_device_name(i)}")
        print('Mem Allocated:', allocated_gib, 'GB')
        print('Mem Cached:   ', reserved_gib, 'GB')
else:
    print("cpu")

device=0, NVIDIA GeForce RTX 4070 Ti SUPER
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB
device=1, NVIDIA GeForce RTX 4070 Ti SUPER
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB


In [4]:
# Load the pool of student names; storing the parsed JSON as a set removes duplicates.
# JSON text is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's locale-dependent default encoding.
with open("input/student_names.json", encoding="utf-8") as f:
    student_names = set(json.load(f))
print(f"{len(student_names):,} student names")

104 student names


In [5]:
# Read the geocode table. Column names are supplied explicitly, so the first
# row of the file is treated as data rather than as a header.
df = pd.read_csv(
    "input/geocode-db.csv",
    names=["street", "lat", "lng"],
    low_memory=False,
)

# Title-case every street string. The count printed below is taken before
# de-duplication; the final `street_addresses` is a set.
street_addresses = []
for street in df["street"]:
    street_addresses.append(street.title())
print(f"{len(street_addresses):,} street addresses={street_addresses[:10]}")
street_addresses = set(street_addresses)

df.info()

11,562 street addresses=['Adis Road (Liv On Sophia)', 'Adis Road (Parc Sophia)', 'Admiralty Drive (Block 353A)', 'Admiralty Drive (Block 353B)', 'Admiralty Drive (Block 353C)', 'Admiralty Drive (Block 354A)', 'Admiralty Drive (Block 354C)', 'Admiralty Drive (Block 354D)', 'Admiralty Drive (Block 356B)', 'Admiralty Drive (Block 356C)']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11562 entries, 0 to 11561
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   street  11562 non-null  object 
 1   lat     11562 non-null  float64
 2   lng     11562 non-null  float64
dtypes: float64(2), object(1)
memory usage: 271.1+ KB


In [6]:
# Load the tokenizer from the local checkpoint directory, capped at the
# configured maximum sequence length.
tokenizer = AutoTokenizer.from_pretrained(
    str(model_dir),
    model_max_length=model_max_length,
)

# Reuse the end-of-sequence token for padding.
# NOTE(review): presumably the checkpoint ships without a pad token; confirm.
tokenizer.pad_token = tokenizer.eos_token

# Show the tokenizer configuration and the resulting pad token id.
print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")
print(f"pad_token_id={tokenizer.pad_token_id}")

LlamaTokenizerFast(name_or_path='/mnt/c/huggingface/mistralai/Mixtral-8x7B-Instruct-v0.1', vocab_size=32000, model_max_length=4096, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
model_input_names=['input_ids', 'attention_mask']
pad_token_id=2


In [7]:
%%time
# bitsandbytes settings: 4-bit NF4 weights with double quantization enabled
# and bfloat16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the causal LM from the local checkpoint with the quantization applied.
# device_map="auto" delegates placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(
    str(model_dir),
    quantization_config=quantization_config,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

CPU times: user 4min 9s, sys: 3min 34s, total: 7min 43s
Wall time: 14min 1s


In [8]:
# Print the module tree to inspect the loaded architecture and its layer types.
print(model)

MixtralForCausalLM(
  (model): MixtralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MixtralDecoderLayer(
        (self_attn): MixtralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MixtralRotaryEmbedding()
        )
        (block_sparse_moe): MixtralSparseMoeBlock(
          (gate): Linear4bit(in_features=4096, out_features=8, bias=False)
          (experts): ModuleList(
            (0-7): 8 x MixtralBlockSparseTop2MLP(
              (w1): Linear4bit(in_features=4096, out_features=14336, bias=False)
              (w2): Linear4bit(in_features=14336, out_features=4096, bias=False)
              (w3): Linear4bit(in_features=4096, out_

In [9]:
# Build the prompt generator from the run size and the two value pools loaded
# earlier, then print one sample prompt as a sanity check.
prompter = Prompter(
    street_addresses=street_addresses,
    student_names=student_names,
    target_size=target_size,
)
res = prompter.prompt()
print(res.prompt)

[INST] Write a short essay on the following topic:
        In Education Policy and Reform, how can the integration of a learning launch tool within the design thinking process enhance the development and testing of ideas, prototypes, and products?
        Intersperse the following information in different parts of the essay:
        1. Give an example that occurred near Pasir Ris Street 11 (Block 125), at a school frequented by Justin Kaiser
        2. Give an example citing someone named JustinKaiser
        3. Give an example citing: periscope.tv/officialjustiniamkaiser [/INST]
        


In [None]:
%%time
# Generate `target_size` essays, one prompt at a time, and collect one record
# per essay together with the values that were injected into its prompt.
rows = []
for _ in tqdm(range(target_size)):
    res = prompter.prompt()
    inputs = tokenizer(res.prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=1.0,
        top_p=0.95,
        top_k=100,
        repetition_penalty=1.1,
        pad_token_id=tokenizer.eos_token_id,
    )
    # The returned sequence starts with the prompt tokens, so keep only what
    # comes after them. Slicing by token count is used instead of splitting the
    # decoded text on "[/INST]": a split silently drops everything after any
    # "[/INST]" the model emits itself, and it depends on the prompt surviving
    # a decode round-trip unchanged.
    prompt_length = inputs["input_ids"].shape[1]
    completion_ids = outputs[0, prompt_length:]
    generated_text = tokenizer.decode(completion_ids, skip_special_tokens=True)
    generated_text = generated_text.strip()
    rows.append({
        "text": generated_text,
        "prompt": res.prompt,
        "street_address": res.street_address,
        "student_name": res.student_name,
        "username": res.username,
        "personal_url": res.personal_url,
        "model": model_dir.name,
    })

 62%|████████████████████████████████████████████████████████████████████████████████████████▊                                                       | 111/180 [3:19:09<2:12:45, 115.44s/it]

In [None]:
# Collect the per-essay records (a list of dicts) into one frame and
# summarize its columns and null counts.
df = pd.DataFrame(rows)
df.info()

In [None]:
# Preview the first ten generated rows as a quick visual check.
df.head(10)

In [None]:
# Make sure the destination directory exists, so the write cannot fail on a
# missing folder after the generation work has already been done.
output_path = Path("output/part.parquet")
output_path.parent.mkdir(parents=True, exist_ok=True)
# Save before validating, so the results are on disk even if the check below fails.
df.to_parquet(output_path)
# Every field of every record must be populated; report per-column null counts otherwise.
assert df.notna().all(axis=None), f"null values per column:\n{df.isna().sum()}"

In [None]:
# Stop the timer started in the setup cell and report the total elapsed time.
tim.stop()
print(f"Total time taken {str(tim.elapsed)}")