In [1]:
# Dependencies:

#!pip install transformers ipywidgets IPython

# If running on Colab:
#!git clone https://github.com/sam-paech/antislop-sampler.git
#!mv antislop-sampler/src .
#!mv antislop-sampler/slop_phrase_prob_adjustments.json .

In [2]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)
from src.antislop_generate import generate_antislop, chat_antislop

# Request the accelerated `hf_transfer` download path for Hugging Face Hub files.
# NOTE(review): this flag is set after `transformers` has already been imported;
# huggingface_hub may only read it at import time, in which case this line has
# no effect -- confirm before relying on it.
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = "1"

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Checkpoint to load (swap in any other causal-LM checkpoint you prefer).
model_name = "unsloth/Llama-3.2-1B-Instruct"
#model_name = "unsloth/Llama-3.2-3B-Instruct"

# Fetch the tokenizer, then the bfloat16 weights, and place the model on `device`.
# NOTE(review): trust_remote_code=True lets the checkpoint repo execute its own
# Python code; only keep it for repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
).to(device)
print('Model loaded')

Model loaded


In [3]:
import json

# File holding the phrase adjustments, and how many leading entries to keep.
BANNED_TOKENS_PATH = 'banned_tokens.json'
MAX_SLOP_PHRASES = 5000


def load_slop_phrase_adjustments(path, limit):
    """Load up to `limit` phrase adjustments from the JSON file at `path`.

    The file must contain a JSON array of two-item entries (presumably
    [phrase, adjustment] pairs -- verify against the file's producer); the
    first `limit` entries are converted into a dict.

    Returns an empty dict when `path` does not exist, so that later cells can
    always reference the result instead of failing with a NameError.
    """
    if not os.path.exists(path):
        print(f"'{path}' not found; continuing without slop phrase adjustments.")
        return {}
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        entries = json.load(f)
    return dict(entries[:limit])


slop_phrase_prob_adjustments = load_slop_phrase_adjustments(BANNED_TOKENS_PATH, MAX_SLOP_PHRASES)

In [4]:
# Story prompt. It deliberately ends mid-sentence (note the trailing space).
prompt = (
    "Write a story about Elara, the weaver of tapestries in future Technopolis. "
    "In the bustling city, a group of "
)

In [5]:
# Chat generation with streaming.
#
# Tokens are collected as they arrive and the whole list is re-decoded on each
# step; only the suffix that has not been printed yet is written out.
messages = [
    {"role": "user", "content": prompt}
]
tokens = []
text = ''        # everything printed so far
full_text = ''   # latest decode of all tokens received so far
for token in chat_antislop(
    model=model,
    tokenizer=tokenizer,
    messages=messages,
    max_new_tokens=600,
    # Antislop sampling may be less reliable at low temperatures.
    #temperature=1,    
    #min_p=0.1,
    temperature=0.01,
    # The adjustment_strength param scales how strongly the probability adjustments are applied.
    # A value of 1 means the values in slop_phrase_prob_adjustments (or the defaults) are used unmodified.
    # Reasonable values are 0 (disabled) thru 100+ (effectively banning the list).
    adjustment_strength=100.0,
    # Optional: Provide a list of slop phrases and probability adjustments
    slop_phrase_prob_adjustments=slop_phrase_prob_adjustments,
    enforce_json=False,
    antislop_enabled=True,
    ban_slop_first_tokens=False,
    streaming=True,
    stream_smoothing=True, # On by default; this will smooth out the stutters from backtracking.
):
    tokens.append(token)
    full_text = tokenizer.decode(tokens, skip_special_tokens=True)
    # A multi-byte character split across tokens decodes to U+FFFD until its
    # remaining bytes arrive. Printing it now would show a placeholder and
    # misalign the next slice, so wait for the character to complete.
    if full_text.endswith('\ufffd'):
        continue
    print(full_text[len(text):], end='', flush=True)
    text = full_text

# Emit whatever was still being held back when the stream ended.
print(full_text[len(text):], end='', flush=True)
text = full_text

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.


In the heart of the vast, sprawling city of Zha'thik, where futuristic skyscrapers pierced the sky and neon lights danced across the streets, there lived a young woman named Eldrida. She was known throughout the city as the 'Bridger of the Past,' a skilled artisan renowned for her extraordinary talent in transforming ancient wisdom into breathtaking works of art.

Eldrida's workshop, aptly named 'The Lost Threads,' was a cozy, dimly lit sanctuary in the midst of the chaotic city. The air was thick with the scent of old books, wood shavings, and the faint hint of magic. The walls were adorned with stunning, hand-embroidered samples of her previous work, each one telling a story of a bygone era.

One ordinary day, as Eldrida was busy gluing threads onto a new canvas, a group of five individuals entered her workshop. They were a diverse bunch, each with their own unique skills and stories to share. There was Arin, a young inventor with a passion for robotics; Zephyr, a charismatic perform