# RWKV-v4neo Fine-Tuning

[RWKV](https://github.com/BlinkDL/RWKV-LM) is an RNN with transformer-level performance.


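This notebook streamlines fine-tuning of RWKV-v4 models on Google Colab: it sets up the runtime, downloads a base checkpoint, tokenizes your text file, runs training, and then tries the tuned checkpoint with ChatRWKV and LangChain.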


## Setup

In [None]:
#@title Google Drive Options { display-mode: "form" }
save_models_to_drive = True #@param {type:"boolean"}
drive_mount = '/content/drive' #@param {type:"string"}
output_dir = 'rwkv-v4neo-rnn-pile-tuning' #@param {type:"string"}
tuned_model_name = 'tuned-python' #@param {type:"string"}

import os

# Import and mount Google Drive only when it was requested, so the toggle above
# really controls whether the Colab-only module is touched.
if save_models_to_drive:
    from google.colab import drive
    drive.mount(drive_mount)

# Root folder for this notebook's artifacts: on Drive when enabled, otherwise on the
# local runtime disk.
output_path = f"{drive_mount}/MyDrive/{output_dir}" if save_models_to_drive else f"/content/{output_dir}"

# One folder for the tuned model and one for base models; exist_ok keeps re-runs harmless.
os.makedirs(f"{output_path}/{tuned_model_name}", exist_ok=True)
os.makedirs(f"{output_path}/base_models/", exist_ok=True)

print(f"Saving models to {output_path}")

Mounted at /content/drive
Saving models to /content/drive/MyDrive/rwkv-v4neo-rnn-pile-tuning


In [None]:
# Show which GPU, driver and CUDA version are attached to this runtime.
!nvidia-smi

Thu Jul 13 06:21:39 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Fetch the RWKV-LM sources and make the RWKV-v4neo folder the working directory;
# later cells run train.py and read/write train.npy relative to it.
!git clone https://github.com/blinkdl/RWKV-LM
repo_dir = "/content/RWKV-LM/RWKV-v4neo"
%cd $repo_dir

Cloning into 'RWKV-LM'...
remote: Enumerating objects: 1830, done.[K
remote: Counting objects: 100% (825/825), done.[K
remote: Compressing objects: 100% (155/155), done.[K
remote: Total 1830 (delta 772), reused 685 (delta 670), pack-reused 1005[K
Receiving objects: 100% (1830/1830), 15.81 MiB | 17.13 MiB/s, done.
Resolving deltas: 100% (1160/1160), done.
/content/RWKV-LM/RWKV-v4neo


In [None]:
# Training dependencies. Only pytorch-lightning is pinned here; the other packages
# resolve to whatever version is current when the cell runs.
!pip install transformers pytorch-lightning==1.9 deepspeed wandb ninja

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m90.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytorch-lightning==1.9
  Downloading pytorch_lightning-1.9.0-py3-none-any.whl (825 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m825.8/825.8 kB[0m [31m49.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting deepspeed
  Downloading deepspeed-0.9.5.tar.gz (809 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m809.9/809.9 kB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wandb
  Downloading wandb-0.15.5-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m86.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ninja
  Downloading ninja-1.11.1-py2.py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (145 kB)
[2K     [

## Load Base Model




In [None]:
#@title Base Model Options
#@markdown Using any of the listed options will download the checkpoint from huggingface

base_model_name = "RWKV-4-Pile-169M" #@param ["RWKV-4-Pile-1B5", "RWKV-4-Pile-430M", "RWKV-4-Pile-169M"]
base_model_url = f"https://huggingface.co/BlinkDL/{base_model_name.lower()}"

if base_model_name == "RWKV-4-Pile-169M":
    n_layer = 12
    n_embd = 768
elif base_model_name == "RWKV-4-Pile-430M":
    n_layer = 24
    n_embd = 1024
elif base_model_name == "RWKV-4-Pile-1B5":
    n_layer = 24
    n_embd = 2048

!git lfs clone $base_model_url

from glob import glob
base_model_path = glob(f"{base_model_name.lower()}/{base_model_name}*.pth")[0]

print(f"Using {base_model_path} as base")

          with new flags from 'git clone'

'git clone' has been updated in upstream Git to have comparable
speeds to 'git lfs clone'.
Cloning into 'rwkv-4-pile-169m'...
remote: Enumerating objects: 59, done.[K
remote: Total 59 (delta 0), reused 0 (delta 0), pack-reused 59[K
Unpacking objects: 100% (59/59), 6.26 KiB | 458.00 KiB/s, done.
Using rwkv-4-pile-169m/RWKV-4-Pile-169M-20220807-8023.pth as base


## Generate Training Data

In [None]:
#@title Training Data Options
#@markdown `input_file` should be the path to a single file that contains the text you want to fine-tune with.
#@markdown Either upload a file to this notebook instance or reference a file in your Google drive.

import numpy as np
from transformers import PreTrainedTokenizerFast

# Tokenizer definition shipped inside the cloned RWKV-v4neo folder.
tokenizer = PreTrainedTokenizerFast(tokenizer_file=f'{repo_dir}/20B_tokenizer.json')

input_file = "/content/drive/MyDrive/table_train_3.txt" #@param {type:"string"}
output_file = 'train.npy'

print(f'Tokenizing {input_file} (VERY slow. please wait)')

# Read the whole corpus; the context manager closes the file handle deterministically.
with open(input_file, encoding="utf-8") as corpus:
    data_raw = corpus.read()
print(f'Raw length = {len(data_raw)}')

data_code = tokenizer.encode(data_raw)
print(f'Tokenized length = {len(data_code)}')

# Token ids are stored as uint16; refuse ids that would not fit rather than risk
# silently corrupting the training data.
if data_code and max(data_code) > np.iinfo(np.uint16).max:
    raise ValueError("Token id does not fit in uint16; cannot write the numpy training file")

out = np.array(data_code, dtype='uint16')
np.save(output_file, out, allow_pickle=False)

Tokenizing /content/drive/MyDrive/table_train_3.txt (VERY slow. please wait)
Raw length = 19417
Tokenized length = 5955


## Training

In [None]:
#@title Begin Training with these Options { display-mode: "form" }
n_epoch = 100 #@param {type:"integer"}
epoch_save_frequency = 1 #@param {type:"integer"}
batch_size =  50 #@param {type:"integer"}
ctx_len = 384 #@param {type:"integer"}
precision = 'fp16' #@param ['fp16', 'bf16', 'bf32'] {type:"string"}

epoch_save_path = f"{output_path}/{tuned_model_name}"


!python train.py \
--load_model $base_model_path \
--wandb "" \
--proj_dir $output_dir \
--data_file  "train.npy" \
--data_type "numpy" \
--vocab_size 50277 \
--ctx_len $ctx_len \
--epoch_steps 1000 \
--epoch_count $n_epoch \
--epoch_begin 0 \
--epoch_save $epoch_save_frequency \
--micro_bsz 8 \
--n_layer $n_layer \
--n_embd $n_embd \
--pre_ffn 0 \
--head_qk 0 \
--lr_init 1e-5 \
--lr_final 1e-5 \
--warmup_steps 0 \
--beta1 0.9 \
--beta2 0.999 \
--adam_eps 1e-8 \
--accelerator gpu \
--devices 1 \
--precision $precision \
--strategy deepspeed_stage_2 \
--grad_cp 0

########## work in progress ##########
[2023-07-13 06:23:00,246] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)

############################################################################
#
# RWKV-4 FP16 on 1x1 GPU, bsz 1x1x8=8, deepspeed_stage_2 
#
# Data = train.npy (numpy), ProjDir = rwkv-v4neo-rnn-pile-tuning
#
# Epoch = 0 to 99 (will continue afterwards), save every 1 epoch
#
# Each "epoch" = 1000 steps, 8000 samples, 3072000 tokens
#
# Model = 12 n_layer, 768 n_embd, 384 ctx_len
#
# Adam = lr 1e-05 to 1e-05, warmup 0 steps, beta (0.9, 0.999), eps 1e-08
#
# Found torch 2.0.1+cu118, recommend 1.13.1+cu117 or newer
# Found deepspeed 0.9.5, recommend 0.7.0 (faster than newer versions)
# Found pytorch_lightning 1.9.0, recommend 1.9.1 or newer
#
############################################################################

{'load_model': 'rwkv-4-pile-169m/RWKV-4-Pile-169M-20220807-8023.pth', 'wandb': '', 'proj_dir': 'rwkv-v4neo-rnn-pile-t

In [None]:
import shutil

# Checkpoint written by the training run, inside the cloned repository's project folder.
source_path = '/content/RWKV-LM/RWKV-v4neo/rwkv-v4neo-rnn-pile-tuning/rwkv-10.pth'
# Google Drive folder that receives the checkpoint.
destination_path = '/content/drive/MyDrive'

# Move the file. As the cell's last expression, the path returned by shutil.move is displayed.
shutil.move(src=source_path, dst=destination_path)


'/content/drive/MyDrive/rwkv-10.pth'

In [None]:
# Extra packages for the ChatRWKV inference cells below (ninja is used when the
# wkv_cuda extension is built at model load time).
!pip install ninja tokenizers



In [None]:
# ChatRWKV provides the rwkv package sources, the tokenizer file and the prompt files
# that the cells below reference by relative path.
!git clone https://github.com/BlinkDL/ChatRWKV


Cloning into 'ChatRWKV'...
remote: Enumerating objects: 1531, done.[K
remote: Counting objects: 100% (512/512), done.[K
remote: Compressing objects: 100% (166/166), done.[K
remote: Total 1531 (delta 425), reused 372 (delta 342), pack-reused 1019[K
Receiving objects: 100% (1531/1531), 28.71 MiB | 17.57 MiB/s, done.
Resolving deltas: 100% (848/848), done.


In [None]:
#@title Select/Download Model { display-mode: "form" }
import urllib
import os
#@markdown Full path of the checkpoint you'd like to use:
model_file = "/content/drive/MyDrive/rwkv-10.pth" #@param {type:"string"}
#@markdown `model_file` must point to an existing `.pth` checkpoint, for example the
#@markdown fine-tuned model that was moved to Google Drive above.
#@markdown Nothing is downloaded by this cell; it only checks that the file exists.

# The path is used exactly as entered. The earlier "search model_dir, else download a
# Raven model" logic was disabled, so its dead copies were removed from this cell.
model_path = model_file

if not os.path.exists(model_path):
    print(f"Model '{model_file}' doesn't exist")
    # FileNotFoundError is still an Exception, but names the missing path for the reader.
    raise FileNotFoundError(model_path)
else:
    print(f"Using {model_path} as base")


Using /content/drive/MyDrive/rwkv-10.pth as base


In [None]:
#@title Load Model {"display-mode": "form"}
import os, copy, types, gc, sys
sys.path.append('ChatRWKV/rwkv_pip_package/src')

import numpy as np

# Environment switches are set up front, before torch and rwkv.model are imported.
# The script-style `sys.argv[1]` GPU selection (under a bare `except`) was dropped:
# its value was unconditionally overwritten with '0' further down in the same cell.
os.environ["RWKV_JIT_ON"] = '1'
os.environ["RWKV_CUDA_ON"] = '1'
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

np.set_printoptions(precision=4, suppress=True, linewidth=200)
args = types.SimpleNamespace()

print('ChatRWKV v4 https://github.com/BlinkDL/ChatRWKV')

import torch
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

strategy = 'cuda fp16' #@param {"type": "string"}

#@markdown Strategy Examples:
#@markdown - `cpu fp32`
#@markdown - `cuda:0 fp16 -> cuda:1 fp16`
#@markdown - `cuda fp16i8 *10 -> cuda fp16`
#@markdown - `cuda fp16i8`
#@markdown - `cuda fp16i8 -> cpu fp32 *10`
#@markdown - `cuda fp16i8 *10+`

# Generation length settings used by on_message().
CHAT_LEN_SHORT = 40
CHAT_LEN_LONG = 150
FREE_GEN_LEN = 256

CHUNK_LEN = 256 # split input into chunks to save VRAM (shorter -> slower)

########################################################################################################

from rwkv.model import RWKV
from rwkv.utils import PIPELINE

print(f'Loading model - {model_path}')
model = RWKV(model=model_path, strategy=strategy)
pipeline = PIPELINE(model, "ChatRWKV/v2/20B_tokenizer.json")
# Special token ids for the 20B tokenizer loaded above.
END_OF_TEXT = 0
END_OF_LINE = 187
END_OF_LINE_DOUBLE = 535
# pipeline = PIPELINE(model, "cl100k_base")
# END_OF_TEXT = 100257
# END_OF_LINE = 198

# Running conversation state shared by run_rnn / save_all_stat / load_all_stat.
model_tokens = []
model_state = None

# Punctuation that must not be emitted twice in a row; each character has to encode to
# exactly one token (enforced by the assert).
AVOID_REPEAT = '，：？！'
AVOID_REPEAT_TOKENS = []
for i in AVOID_REPEAT:
    dd = pipeline.encode(i)
    assert len(dd) == 1
    AVOID_REPEAT_TOKENS += dd

def run_rnn(tokens, newline_adj = 0):
    """Feed `tokens` to the model and return the logits after the last one.

    The tokens are appended to the global `model_tokens` history and the global
    `model_state` is advanced. Input is processed in slices of CHUNK_LEN tokens.
    `newline_adj` is added to the END_OF_LINE logit, and if the most recent token is
    one of AVOID_REPEAT_TOKENS its own logit is pushed to a very large negative value.
    """
    global model_tokens, model_state

    tokens = list(map(int, tokens))
    model_tokens += tokens

    # Walk the input one chunk at a time; only the logits of the final chunk are kept.
    for offset in range(0, len(tokens), CHUNK_LEN):
        out, model_state = model.forward(tokens[offset:offset + CHUNK_LEN], model_state)

    out[END_OF_LINE] += newline_adj # adjust \n probability

    latest = model_tokens[-1]
    if latest in AVOID_REPEAT_TOKENS:
        out[latest] = -999999999
    return out

# Named snapshots of the conversation, keyed by '<name>_<srv>'.
all_state = {}

def save_all_stat(srv, name, last_out):
    """Store `last_out` plus deep copies of the current model state and token history."""
    key = f'{name}_{srv}'
    all_state[key] = {
        'out': last_out,
        'rnn': copy.deepcopy(model_state),
        'token': copy.deepcopy(model_tokens),
    }

def load_all_stat(srv, name):
    """Restore the global model state/tokens from a snapshot and return its stored logits.

    Raises KeyError when no snapshot named '<name>_<srv>' has been saved.
    """
    global model_tokens, model_state
    snapshot = all_state[f'{name}_{srv}']
    # Deep copies keep the stored snapshot untouched by later generation.
    model_state = copy.deepcopy(snapshot['rnn'])
    model_tokens = copy.deepcopy(snapshot['token'])
    return snapshot['out']

# Model only saw '\n\n' as [187, 187] before, but the tokenizer outputs [535] for it at the end
def fix_tokens(tokens):
    """Replace a trailing END_OF_LINE_DOUBLE token with two END_OF_LINE tokens.

    Any other input (including an empty sequence) is returned unchanged.
    """
    if len(tokens) == 0 or tokens[-1] != END_OF_LINE_DOUBLE:
        return tokens
    return tokens[:-1] + [END_OF_LINE, END_OF_LINE]

#@title Inference Setup {"display-mode": "form"}
#@markdown Inference properties:
# Default sampling temperature and nucleus threshold; on_message() lets a message
# override them with `-temp=` / `-top_p=`.
temp = 1.1 #@param {"type": "number"}
top_p = 0.7 #@param {"type": "number"}
# Penalties subtracted from the logits of tokens already produced in the current reply.
presence_penalty = 0.2 #@param {"type": "number"}
frequency_penalty = 0.2 #@param {"type": "number"}
# Run inference
# NOTE(review): `prompt` is not referenced by the code visible in this cell — confirm it is still needed.
from prompt_toolkit import prompt

# Prompt definition file that load_prompt() executes to obtain the chat persona.
PROMPT_FILE = 'ChatRWKV/v2/prompt/default/English-2.py'

def load_prompt(PROMPT_FILE):
    """Execute a prompt definition file and return (user, bot, interface, init_prompt).

    The file is run as Python and must define `user`, `bot`, `interface` and
    `init_prompt`. Each line of `init_prompt` is trimmed, and the result is wrapped so
    that it starts with one newline and ends with a blank line.

    NOTE(review): the file is run with exec(), so only load prompt files you trust.
    """
    namespace = {}
    with open(PROMPT_FILE, 'rb') as handle:
        source = handle.read()
    exec(compile(source, PROMPT_FILE, 'exec'), namespace)

    user = namespace['user']
    bot = namespace['bot']
    interface = namespace['interface']
    init_prompt = namespace['init_prompt']

    # Trim whitespace, ideographic spaces and carriage returns from every line.
    cleaned = [
        line.strip().strip('\u3000').strip('\r')
        for line in init_prompt.strip().split('\n')
    ]
    init_prompt = '\n' + '\n'.join(cleaned).strip() + '\n\n'
    return user, bot, interface, init_prompt

# Load the default persona and run its initial prompt through the model once.
user, bot, interface, init_prompt = load_prompt(PROMPT_FILE)
out = run_rnn(fix_tokens(pipeline.encode(init_prompt)))
# Keep the post-prompt state as 'chat_init' so '+reset' and '+qa' can return to it.
save_all_stat('', 'chat_init', out)
gc.collect()
torch.cuda.empty_cache()

# Seed the per-server 'chat' snapshot that on_message() loads for each normal message.
srv_list = ['dummy_server']
for s in srv_list:
    save_all_stat(s, 'chat', out)

def reply_msg(msg):
    """Print `msg` prefixed with the bot name and interface marker, then a blank line."""
    line = f'{bot}{interface} {msg}\n'
    print(line)

def on_message(message):
    """Handle one line of user input: run a '+' command or generate a chat reply.

    Supported inputs (matched after stripping, literal backslash-n becomes a newline):
      +reset          restore the 'chat_init' snapshot as the current chat
      +prompt PATH    load a new prompt file and feed its init prompt to the model
      +gen TEXT       free generation continuing TEXT (fresh model state)
      +i TEXT         free generation from an instruction template (fresh model state)
      +qq TEXT        free generation from a 'Q: ... A:' template (fresh model state)
      +qa TEXT        single question answered on top of the 'chat_init' state
      +++ / ++        continue / retry the last free generation
      +               regenerate the last chat reply
      anything else   treated as a chat message

    `-temp=<float>` and `-top_p=<float>` anywhere in the message override the default
    sampling settings for this call and are removed from the message text.
    Generated text is printed to stdout; nothing is returned.
    """
    global model_tokens, model_state, user, bot, interface, init_prompt

    srv = 'dummy_server'

    msg = message.replace('\\n','\n').strip()

    x_temp = temp
    x_top_p = top_p
    if ("-temp=" in msg):
        # Remove exactly the text the user typed. Re-formatting the parsed float
        # (e.g. '1.0' -> '1') used to leave stray characters such as '.0' in the message.
        temp_arg = msg.split("-temp=")[1].split(" ")[0]
        x_temp = float(temp_arg)
        msg = msg.replace("-temp=" + temp_arg, "")
        # print(f"temp: {x_temp}")
    if ("-top_p=" in msg):
        top_p_arg = msg.split("-top_p=")[1].split(" ")[0]
        x_top_p = float(top_p_arg)
        msg = msg.replace("-top_p=" + top_p_arg, "")
        # print(f"top_p: {x_top_p}")
    # Clamp the sampling settings to the supported range.
    if x_temp <= 0.2:
        x_temp = 0.2
    if x_temp >= 5:
        x_temp = 5
    if x_top_p <= 0:
        x_top_p = 0
    msg = msg.strip()

    if msg == '+reset':
        out = load_all_stat('', 'chat_init')
        save_all_stat(srv, 'chat', out)
        reply_msg("Chat reset.")
        return

    # use '+prompt {path}' to load a new prompt
    elif msg[:8].lower() == '+prompt ':
        print("Loading prompt...")
        try:
            PROMPT_FILE = msg[8:].strip()
            user, bot, interface, init_prompt = load_prompt(PROMPT_FILE)
            out = run_rnn(fix_tokens(pipeline.encode(init_prompt)))
            save_all_stat(srv, 'chat', out)
            print("Prompt set up.")
            gc.collect()
            torch.cuda.empty_cache()
        except Exception:
            # Best effort: any failure while loading is reported, but Ctrl-C still propagates.
            print("Path error.")

    elif msg[:5].lower() == '+gen ' or msg[:3].lower() == '+i ' or msg[:4].lower() == '+qa ' or msg[:4].lower() == '+qq ' or msg.lower() == '+++' or msg.lower() == '++':

        if msg[:5].lower() == '+gen ':
            new = '\n' + msg[5:].strip()
            # print(f'### prompt ###\n[{new}]')
            model_state = None
            model_tokens = []
            out = run_rnn(pipeline.encode(new))
            save_all_stat(srv, 'gen_0', out)

        elif msg[:3].lower() == '+i ':
            msg = msg[3:].strip().replace('\r\n','\n').replace('\n\n','\n')
            new = f'''
Below is an instruction that describes a task. Write a response that appropriately completes the request.

# Instruction:
{msg}

# Response:
'''
            # print(f'### prompt ###\n[{new}]')
            model_state = None
            model_tokens = []
            out = run_rnn(pipeline.encode(new))
            save_all_stat(srv, 'gen_0', out)

        elif msg[:4].lower() == '+qq ':
            new = '\nQ: ' + msg[4:].strip() + '\nA:'
            # print(f'### prompt ###\n[{new}]')
            model_state = None
            model_tokens = []
            out = run_rnn(pipeline.encode(new))
            save_all_stat(srv, 'gen_0', out)

        elif msg[:4].lower() == '+qa ':
            out = load_all_stat('', 'chat_init')

            real_msg = msg[4:].strip()
            new = f"{user}{interface} {real_msg}\n\n{bot}{interface}"
            # print(f'### qa ###\n[{new}]')

            out = run_rnn(pipeline.encode(new))
            save_all_stat(srv, 'gen_0', out)

        elif msg.lower() == '+++':
            try:
                out = load_all_stat(srv, 'gen_1')
                save_all_stat(srv, 'gen_0', out)
            except KeyError:
                # Nothing has been generated yet, so there is nothing to continue.
                return

        elif msg.lower() == '++':
            try:
                out = load_all_stat(srv, 'gen_0')
            except KeyError:
                # No previous free generation to retry.
                return

        begin = len(model_tokens)
        out_last = begin
        occurrence = {}
        for i in range(FREE_GEN_LEN+100):
            # Penalise tokens that were already produced in this generation.
            for n in occurrence:
                out[n] -= (presence_penalty + occurrence[n] * frequency_penalty)
            token = pipeline.sample_logits(
                out,
                temperature=x_temp,
                top_p=x_top_p,
            )
            if token == END_OF_TEXT:
                break
            if token not in occurrence:
                occurrence[token] = 1
            else:
                occurrence[token] += 1

            if msg[:4].lower() == '+qa ':# or msg[:4].lower() == '+qq ':
                out = run_rnn([token], newline_adj=-2)
            else:
                out = run_rnn([token])

            # Print only once the pending tokens decode to complete characters.
            xxx = pipeline.decode(model_tokens[out_last:])
            if '\ufffd' not in xxx: # avoid utf-8 display issues
                print(xxx, end='', flush=True)
                out_last = begin + i + 1
                if i >= FREE_GEN_LEN:
                    break
        print('\n')
        # send_msg = pipeline.decode(model_tokens[begin:]).strip()
        # print(f'### send ###\n[{send_msg}]')
        # reply_msg(send_msg)
        save_all_stat(srv, 'gen_1', out)

    else:
        if msg.lower() == '+':
            try:
                out = load_all_stat(srv, 'chat_pre')
            except KeyError:
                # No previous chat turn to regenerate.
                return
        else:
            out = load_all_stat(srv, 'chat')
            msg = msg.strip().replace('\r\n','\n').replace('\n\n','\n')
            new = f"{user}{interface} {msg}\n\n{bot}{interface}"
            # print(f'### add ###\n[{new}]')
            out = run_rnn(pipeline.encode(new), newline_adj=-999999999)
            save_all_stat(srv, 'chat_pre', out)

        begin = len(model_tokens)
        out_last = begin
        print(f'{bot}{interface}', end='', flush=True)
        occurrence = {}
        for i in range(999):
            # Newline bias schedule: forbidden on the first token, discouraged while the
            # reply is short, neutral up to CHAT_LEN_LONG, then increasingly encouraged.
            if i <= 0:
                newline_adj = -999999999
            elif i <= CHAT_LEN_SHORT:
                newline_adj = (i - CHAT_LEN_SHORT) / 10
            elif i <= CHAT_LEN_LONG:
                newline_adj = 0
            else:
                newline_adj = min(3, (i - CHAT_LEN_LONG) * 0.25) # MUST END THE GENERATION

            for n in occurrence:
                out[n] -= (presence_penalty + occurrence[n] * frequency_penalty)
            token = pipeline.sample_logits(
                out,
                temperature=x_temp,
                top_p=x_top_p,
            )
            # if token == END_OF_TEXT:
            #     break
            if token not in occurrence:
                occurrence[token] = 1
            else:
                occurrence[token] += 1

            out = run_rnn([token], newline_adj=newline_adj)
            out[END_OF_TEXT] = -999999999  # disable <|endoftext|>

            xxx = pipeline.decode(model_tokens[out_last:])
            if '\ufffd' not in xxx: # avoid utf-8 display issues
                print(xxx, end='', flush=True)
                out_last = begin + i + 1

            # A blank line in the decoded reply marks the end of the bot's turn.
            send_msg = pipeline.decode(model_tokens[begin:])
            if '\n\n' in send_msg:
                send_msg = send_msg.strip()
                break

        save_all_stat(srv, 'chat', out)


ChatRWKV v4 https://github.com/BlinkDL/ChatRWKV


Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
Creating extension directory /root/.cache/torch_extensions/py310_cu118/wkv_cuda...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py310_cu118/wkv_cuda/build.ninja...
Building extension module wkv_cuda...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module wkv_cuda...


Loading model - /content/drive/MyDrive/rwkv-10.pth
RWKV_JIT_ON 1 RWKV_CUDA_ON 1 RESCALE_LAYER 6

Loading /content/drive/MyDrive/rwkv-10.pth ...
Strategy: (total 12+1=13 layers)
* cuda [float16, float16], store 13 layers
0-cuda-float16-float16 1-cuda-float16-float16 2-cuda-float16-float16 3-cuda-float16-float16 4-cuda-float16-float16 5-cuda-float16-float16 6-cuda-float16-float16 7-cuda-float16-float16 8-cuda-float16-float16 9-cuda-float16-float16 10-cuda-float16-float16 11-cuda-float16-float16 12-cuda-float16-float16 
emb.weight                        f16      cpu  50277   768 
blocks.0.ln1.weight               f16   cuda:0    768       
blocks.0.ln1.bias                 f16   cuda:0    768       
blocks.0.ln2.weight               f16   cuda:0    768       
blocks.0.ln2.bias                 f16   cuda:0    768       
blocks.0.att.time_decay           f32   cuda:0    768       
blocks.0.att.time_first           f32   cuda:0    768       
blocks.0.att.time_mix_k           f16   cuda:0    

In [None]:
#@title Chat {"display-mode": "form"}

#@markdown Running this cell will start the chat. Simply type your message in the input

#@markdown Commands:
#@markdown - `+` to get an alternate chat reply
#@markdown - `+reset` to reset the chat
#@markdown - `+gen YOUR PROMPT` for a free single-round generation with any prompt
#@markdown - `+i YOUR INSTRUCT` for a free single-round generation with any instruct
#@markdown - `+qq YOUR QUESTION` / `+qa YOUR QUESTION` for a single question and answer
#@markdown - `+++` to continue the last free generation (only for `+gen` / `+i`)
#@markdown - `++` to retry the last free generation (only for `+gen` / `+i`)
#@markdown - `+prompt PATH` to load a different prompt file
#@markdown - add `-temp=1.0` and/or `-top_p=0.7` to a message to override sampling for that message

#@markdown Remember to `+reset` periodically to clean up the bot's memory.

#@markdown Interrupt the cell to leave the chat.

while True:
    try:
        # Show the user name from the loaded prompt so the label follows `+prompt` changes.
        msg = input(f'{user}{interface} ')
    except (KeyboardInterrupt, EOFError):
        # Interrupting the input (or a closed stdin) ends the chat without a traceback.
        print('\nChat ended.')
        break
    if len(msg.strip()) > 0:
        on_message(msg)
    else:
        print('Error: please say something')

Bob: +gen serial number 25 26 27 <start>"
sno<stop>
"name Ava Benjamin Charlotte <start>"name<stop>
"birthdate 1993 1996 1991 <start>"birthdate<stop>
"job Designer Engineer Lawyer <start>"job<stop>
"serial number 29 30 31 <start>"sno<stop>
"name Emma James Noah <start>"name<stop>
"birthdate 1995 1992 1997 <start>"birthdate<stop>
"job Doctor Architect Engineer <start>"job<stop>
"serial number 33 34 35 <start>"sno<stop>
"name William Mia Ethan <start>"name<stop>
"birthdate 1997 1994 1999 <start>"birthdate<stop>
"job Lawyer Designer Engineer <start>"job<stop>
"serial number 37 38 39 <start>"sno<stop>
"name Sophia Noah Emma <start>"name<stop>
"birthdate 1991 1996 1994 <start>"birthdate<stop>
"job Designer Engineer Doctor <start>"job<stop>
"serial number 41 42 43 <start>"sno<stop>
"name Benjamin Isabella David <start>"name<stop>
"birthdate 1988 1995

Bob: +gen "name Emma James Noah <start>"
name<stop>
"birthdate 1995 1992 1997 <start>"birthdate<stop>
"job Doctor Architect Engineer <start>"j

In [None]:
# NOTE(review): the triple-quoted block below is a plain string literal, so the locale
# call inside it never runs. It looks like a disabled workaround — confirm before removing.
'''import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')'''
# Packages for the LangChain example in the following cells.
!pip install langchain rwkv ninja



In [None]:
# Clone RWKV-LM into the current working directory; the LangChain cell below reads the
# tokenizer from the relative path RWKV-LM/RWKV-v4/20B_tokenizer.json.
!git clone https://github.com/BlinkDL/RWKV-LM

In [None]:
#@title Select/Download Model { display-mode: "form" }
import urllib
import os
#@markdown Full path of the checkpoint you'd like to use:
model_file = "/content/drive/MyDrive/rwkv-12.pth" #@param {type:"string"}
#@markdown `model_file` must point to an existing `.pth` checkpoint, for example a
#@markdown fine-tuned model stored on Google Drive.
#@markdown Nothing is downloaded by this cell; it only checks that the file exists.

# The path is used exactly as entered. The earlier "search model_dir, else download a
# Raven model" logic was disabled, so its dead copy was removed from this cell.
model_path = model_file

if not os.path.exists(model_path):
    print(f"Model '{model_file}' doesn't exist")
    # FileNotFoundError is still an Exception, but names the missing path for the reader.
    raise FileNotFoundError(model_path)
else:
    print(f"Using {model_path} as base")


In [None]:
#@title Load Model
import os
# Set before the langchain RWKV wrapper is imported and the model is constructed.
os.environ["RWKV_CUDA_ON"] = '1'
os.environ["RWKV_JIT_ON"] = '1'

# NOTE: this rebinds the names `RWKV`, `strategy` and `model` that the ChatRWKV cells
# above also define; re-run the earlier "Load Model" cell before going back to the chat.
from langchain.llms import RWKV

strategy = "cuda fp16i8 *20 -> cuda fp16" #@param {"type":"string"}
# tokens_path is relative to the current working directory (the RWKV-LM clone made above).
model = RWKV(model=model_path, strategy=strategy, tokens_path="RWKV-LM/RWKV-v4/20B_tokenizer.json")

RWKV_JIT_ON 1 RWKV_CUDA_ON 1 RESCALE_LAYER 6

Loading /content/drive/MyDrive/rwkv-11.pth ...
Strategy: (total 12+1=13 layers)
* cuda [float16, uint8], store 13 layers
* cuda fp16, store 0 layers
0-cuda-float16-uint8 1-cuda-float16-uint8 2-cuda-float16-uint8 3-cuda-float16-uint8 4-cuda-float16-uint8 5-cuda-float16-uint8 6-cuda-float16-uint8 7-cuda-float16-uint8 8-cuda-float16-uint8 9-cuda-float16-uint8 10-cuda-float16-uint8 11-cuda-float16-uint8 12-cuda-float16-uint8 
emb.weight                        f16      cpu  50277   768 
blocks.0.ln1.weight               f16   cuda:0    768       
blocks.0.ln1.bias                 f16   cuda:0    768       
blocks.0.ln2.weight               f16   cuda:0    768       
blocks.0.ln2.bias                 f16   cuda:0    768       
blocks.0.att.time_decay           f32   cuda:0    768       
blocks.0.att.time_first           f32   cuda:0    768       
blocks.0.att.time_mix_k           f16   cuda:0    768       
blocks.0.att.time_mix_v           f16   

In [None]:
#@title Chain
#@markdown A simple chain example. You first create the instruction template, and feed in your prompt as the instruction variable.

from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Instruction-style template; `{instruction}` is filled in by the chain at run time.
task = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
# Instruction:
{instruction}

# Response:
"""
instruction = "Function to get the count of unique values in a column" #@param {type:"string"}

# Wrap the template, bind it to the loaded model, and print the completion.
prompt = PromptTemplate(template=task, input_variables=["instruction"])
chain = LLMChain(llm=model, prompt=prompt)

print(chain.run(instruction))

#@markdown Documentation —
#@markdown [PromptTemplate](https://python.langchain.com/en/latest/modules/prompts/prompt_templates/examples/prompt_serialization.html),
#@markdown [LLMChain](https://python.langchain.com/en/latest/modules/chains/generic/llm_chain.html)

The response

# Function to calculate the sum of values in a column
#
# Function to compare the values of two columns
#
# Function to compare the values of two columns
#
# Function to compare the values of two columns
#
# Function to compare the values of two columns
#
# Function to compare the values of two columns
#
def get_sum_of_two_columns(df, column1, column2, new_column_name):
    df[new_column_name] = df[column1] + df[column2]
    return df

# Function to calculate the mean of a column
def calculate_mean_of_column(df, column_name):

