In [None]:
!pip install causal-conv1d>=1.2.0
!pip install mamba-ssm
!pip install git+https://github.com/huggingface/transformers@main
!pip install git+https://github.com/state-spaces/mamba.git
!pip install git+https://github.com/huggingface/accelerate

Collecting mamba-ssm
  Downloading mamba_ssm-1.2.0.post1.tar.gz (34 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting einops (from mamba-ssm)
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: mamba-ssm
  Building wheel for mamba-ssm (setup.py) ... [?25l[?25hdone
  Created wheel for mamba-ssm: filename=mamba_ssm-1.2.0.post1-cp310-cp310-linux_x86_64.whl size=137750683 sha256=b264292652a34fb9dd0ce880a34a4407ba7256a3338388d056769ec29a4581c9
  Stored in directory: /root/.cache/pip/wheels/22/6e/60/ddd5c574b5793a30028f2cabdacd2a3ec2276edaaa8c00fd35
Successfully built mamba-ssm
Installing collected packages: einops, mamba-ssm
Successfully installed einops-0.7.0 mamba-ssm-1.2.0.post1
Collecting git+https://github.com/huggingface/transformers@main
  Cloning https://github.com/huggingface/transformers (to revision mai

In [None]:
# Install the dependencies listed in requirements.txt (path is relative to the
# notebook's current working directory).
# NOTE(review): the captured output shows this file pinning accelerate==0.25.0,
# which would replace the git build of accelerate installed by the previous
# cell — confirm the intended install order.
!pip install -r requirements.txt

Collecting accelerate==0.25.0 (from -r requirements.txt (line 1))
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp==3.9.1 (from -r requirements.txt (line 2))
  Downloading aiohttp-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
Collecting attrs==23.1.0 (from -r requirements.txt (line 4))
  Downloading attrs-23.1.0-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Automat==22.10.0 (from -r requirements.txt (line 5))
  Downloading Automat-22.10.0-py2.py3-none-any.whl (26 kB)
Collecting buildtools==1.0.6 (from -r requirements.txt (line 6))
  Downloading buildtools-1.0.6.tar.gz (446 k

# Mamba 130M Model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
import sys
import json
import pandas as pd
import time
from tqdm import tqdm

def run_mamba(model, question):
    """Ask ``model`` a trivia question using a fixed 3-shot "Q:/A:" prompt.

    Relies on the notebook-level ``tokenizer`` global and on a CUDA device
    (the prompt ids are moved to the GPU before generation).

    NOTE(review): this function is re-declared identically in each later model
    section of the notebook; a single definition would be enough.

    Args:
        model: object exposing ``generate(input_ids=..., max_length=...,
            eos_token_id=...)`` that returns a batch of token-id sequences
            starting with the prompt ids.
        question: question text appended to the few-shot prompt.

    Returns:
        tuple ``(answer, num_tokens)``: the generated text up to the first
        blank line (stripped), and the number of *prompt* tokens.
    """
    n_shot_prompting = [
        {
            "question": "What is the capital of France?",
            "answer": "Paris"
        },
        {
            "question": "Who invented the segway?",
            "answer": "Dean Kamen"
        },
        {
            "question": "What is the fastest animal?",
            "answer": "Cheetah"
        }
    ]

    header = "You are a Trivia QA bot.\nAnswer the following question succinctly and accurately."
    shots = "\n\n".join(f"Q: {p['question']}\nA: {p['answer']}" for p in n_shot_prompting)
    text = f"{header}\n\n{shots}\n\nQ: {question}\nA:"

    input_ids = torch.LongTensor([tokenizer.encode(text)]).cuda()
    num_tokens = input_ids.shape[1]

    # NOTE(review): max_length looks like a cap on prompt + continuation, so
    # prompts close to 128 tokens leave little room for an answer — confirm
    # against the generate() implementation in use.
    out = model.generate(
        input_ids=input_ids,
        max_length=128,
        eos_token_id=tokenizer.eos_token_id
    )

    # The returned sequence starts with the prompt ids. Drop them by position
    # instead of string-replacing the prompt text: a replace silently fails
    # (and leaks the whole prompt into the answer) whenever decoding does not
    # reproduce the prompt byte-for-byte.
    generated_ids = out[0][num_tokens:]

    # Skip special tokens so a generated "<|endoftext|>" does not end up glued
    # to the answer and break the exact-match comparison downstream.
    cleaned = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # The model keeps generating further Q/A pairs; keep only the first answer.
    answer = cleaned.split("\n\n")[0].strip()
    return answer, num_tokens

def write_results(results, output_file):
    """Write evaluation records to ``output_file`` as JSON Lines.

    Args:
        results: list of per-question dicts. Only the columns listed below are
            written, in that order.
        output_file: destination path; rewritten in full on every call.
    """
    columns = ["idx", "question", "answer", "guess", "is_correct", "time", "num_tokens", "tokens_per_sec"]
    # Fix the schema at construction time. Selecting the columns afterwards
    # with df[[...]] raises KeyError when ``results`` is empty, because an
    # empty frame has no columns at all.
    df = pd.DataFrame(results, columns=columns)

    print(f"Writing {output_file}")
    df.to_json(output_file, orient="records", lines=True)

# Evaluate the Mamba 130M checkpoint on the SQuAD validation subset.
model_name = "state-spaces/mamba-130m"
dataset = "/content/squad_val_1k.jsonl"
output_file = "/content/mamba_130M_result_squad_val_1k.jsonl"

# run_mamba() reads this ``tokenizer`` as a global.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
tokenizer.eos_token = "<|endoftext|>"
tokenizer.pad_token = tokenizer.eos_token

# Keep the checkpoint id and the loaded model under separate names so
# ``model`` never holds a plain string.
model = MambaLMHeadModel.from_pretrained(model_name, device="cuda", dtype=torch.float16)

mamba_130M_results = []

# Read the whole dataset first and close the file before the long GPU loop.
# Blank lines (e.g. a trailing newline) are skipped rather than crashing
# json.loads.
with open(dataset) as f:
    all_data = [json.loads(line) for line in tqdm(f) if line.strip()]

total_qs = len(all_data)
for i, data in enumerate(all_data):
    start_time = time.time()

    question = data["prompt"]
    answer = data["response"]
    guess, num_tokens = run_mamba(model, question)
    end_time = time.time()

    # Case-insensitive exact match after trimming surrounding whitespace.
    is_correct = (answer.strip().lower() == guess.strip().lower())
    print(f"Question {i}/{total_qs}")
    print(f"num tokens: {num_tokens}")
    print(f"Q: {question}")
    print(f"A: {answer}")
    print(f"?: {guess}")
    print("✅" if is_correct else "❌")
    print("="*80)
    sys.stdout.flush()

    num_seconds = end_time - start_time
    # num_tokens is the prompt length returned by run_mamba, so this figure is
    # prompt tokens per wall-clock second, not generation throughput.
    tkps = num_tokens / num_seconds
    mamba_130M_results.append({
        "idx": i,
        "question": question,
        "answer": answer,
        "guess": guess,
        "is_correct": is_correct,
        "time": num_seconds,
        "num_tokens": num_tokens,
        "tokens_per_sec": tkps
    })

    # Checkpoint every 20 questions so an interrupted run keeps partial results.
    if len(mamba_130M_results) % 20 == 0:
        write_results(mamba_130M_results, output_file)

write_results(mamba_130M_results, output_file)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
1000it [00:00, 38145.29it/s]


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
num tokens: 86
Q: Who went to Wittenberg to hear Luther speak?
A: Students
?: Luther
❌
Question 293/1000
num tokens: 86
Q: What persons were not allowed to settle in New France?
A: non-Catholics
?: The French were not allowed to settle in New France.
❌
Question 294/1000
num tokens: 93
Q: What does the water flow of the Rhine merge with after flowing through Merwede?
A: Meuse
?: The water flows through Merwede, then flows through Rhine, then flows through Merwede, then flows through Rhine, then flows through Merwede, then
❌
Question 295/1000
num tokens: 91
Q: What was Tesla on his way to do when he was struck by the cab?
A: feed the pigeons
?: He was struck by the cab.
❌
Question 296/1000
num tokens: 94
Q: What lies between L and P that prevents a definitive determination of the relationship between L and P?
A: complexity classes
?: The distance between L and P is the distance between the two points of intersection of the 

In [None]:
# Summarize accuracy and mean latency for the Mamba 130M run.
total_questions = len(mamba_130M_results)
total_correct = sum(result["is_correct"] for result in mamba_130M_results)
total_time = sum(result["time"] for result in mamba_130M_results)

# Guard the divisions so an empty (e.g. interrupted) run reports zeros instead
# of raising ZeroDivisionError.
accuracy_percentage = (total_correct / total_questions) * 100 if total_questions else 0.0
average_speed = total_time / total_questions if total_questions else 0.0

print(f"Total correct predictions: {total_correct}")
print(f"Total questions evaluated: {total_questions}")
print(f"Accuracy: {accuracy_percentage:.2f}%")
print(f"Average inference speed per question: {average_speed:.2f} seconds")

Total correct predictions: 5
Total questions evaluated: 1000
Accuracy: 0.50%
Average inference speed per question: 1.23 seconds


# Mamba 370M Model

In [None]:
def run_mamba(model, question):
    """Ask ``model`` a trivia question using a fixed 3-shot "Q:/A:" prompt.

    Relies on the notebook-level ``tokenizer`` global and on a CUDA device
    (the prompt ids are moved to the GPU before generation).

    NOTE(review): this function is declared identically in every model section
    of the notebook; a single definition would be enough.

    Args:
        model: object exposing ``generate(input_ids=..., max_length=...,
            eos_token_id=...)`` that returns a batch of token-id sequences
            starting with the prompt ids.
        question: question text appended to the few-shot prompt.

    Returns:
        tuple ``(answer, num_tokens)``: the generated text up to the first
        blank line (stripped), and the number of *prompt* tokens.
    """
    n_shot_prompting = [
        {
            "question": "What is the capital of France?",
            "answer": "Paris"
        },
        {
            "question": "Who invented the segway?",
            "answer": "Dean Kamen"
        },
        {
            "question": "What is the fastest animal?",
            "answer": "Cheetah"
        }
    ]

    header = "You are a Trivia QA bot.\nAnswer the following question succinctly and accurately."
    shots = "\n\n".join(f"Q: {p['question']}\nA: {p['answer']}" for p in n_shot_prompting)
    text = f"{header}\n\n{shots}\n\nQ: {question}\nA:"

    input_ids = torch.LongTensor([tokenizer.encode(text)]).cuda()
    num_tokens = input_ids.shape[1]

    # NOTE(review): max_length looks like a cap on prompt + continuation, so
    # prompts close to 128 tokens leave little room for an answer — confirm
    # against the generate() implementation in use.
    out = model.generate(
        input_ids=input_ids,
        max_length=128,
        eos_token_id=tokenizer.eos_token_id
    )

    # The returned sequence starts with the prompt ids. Drop them by position
    # instead of string-replacing the prompt text: a replace silently fails
    # (and leaks the whole prompt into the answer) whenever decoding does not
    # reproduce the prompt byte-for-byte.
    generated_ids = out[0][num_tokens:]

    # Skip special tokens so a generated "<|endoftext|>" does not end up glued
    # to the answer and break the exact-match comparison downstream.
    cleaned = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # The model keeps generating further Q/A pairs; keep only the first answer.
    answer = cleaned.split("\n\n")[0].strip()
    return answer, num_tokens

def write_results(results, output_file):
    """Write evaluation records to ``output_file`` as JSON Lines.

    Args:
        results: list of per-question dicts. Only the columns listed below are
            written, in that order.
        output_file: destination path; rewritten in full on every call.
    """
    columns = ["idx", "question", "answer", "guess", "is_correct", "time", "num_tokens", "tokens_per_sec"]
    # Fix the schema at construction time. Selecting the columns afterwards
    # with df[[...]] raises KeyError when ``results`` is empty, because an
    # empty frame has no columns at all.
    df = pd.DataFrame(results, columns=columns)

    print(f"Writing {output_file}")
    df.to_json(output_file, orient="records", lines=True)

# Evaluate the Mamba 370M checkpoint on the SQuAD validation subset.
model_name = "state-spaces/mamba-370m"
dataset = "/content/squad_val_1k.jsonl"
output_file = "/content/mamba_370M_result_squad_val_1k.jsonl"

# run_mamba() reads this ``tokenizer`` as a global.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
tokenizer.eos_token = "<|endoftext|>"
tokenizer.pad_token = tokenizer.eos_token

# Keep the checkpoint id and the loaded model under separate names so
# ``model`` never holds a plain string.
model = MambaLMHeadModel.from_pretrained(model_name, device="cuda", dtype=torch.float16)

mamba_370M_results = []

# Read the whole dataset first and close the file before the long GPU loop.
# Blank lines (e.g. a trailing newline) are skipped rather than crashing
# json.loads.
with open(dataset) as f:
    all_data = [json.loads(line) for line in tqdm(f) if line.strip()]

total_qs = len(all_data)
for i, data in enumerate(all_data):
    start_time = time.time()

    question = data["prompt"]
    answer = data["response"]
    guess, num_tokens = run_mamba(model, question)
    end_time = time.time()

    # Case-insensitive exact match after trimming surrounding whitespace.
    is_correct = (answer.strip().lower() == guess.strip().lower())
    print(f"Question {i}/{total_qs}")
    print(f"num tokens: {num_tokens}")
    print(f"Q: {question}")
    print(f"A: {answer}")
    print(f"?: {guess}")
    print("✅" if is_correct else "❌")
    print("="*80)
    sys.stdout.flush()

    num_seconds = end_time - start_time
    # num_tokens is the prompt length returned by run_mamba, so this figure is
    # prompt tokens per wall-clock second, not generation throughput.
    tkps = num_tokens / num_seconds
    mamba_370M_results.append({
        "idx": i,
        "question": question,
        "answer": answer,
        "guess": guess,
        "is_correct": is_correct,
        "time": num_seconds,
        "num_tokens": num_tokens,
        "tokens_per_sec": tkps
    })

    # Checkpoint every 20 questions so an interrupted run keeps partial results.
    if len(mamba_370M_results) % 20 == 0:
        write_results(mamba_370M_results, output_file)

write_results(mamba_370M_results, output_file)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.49G [00:00<?, ?B/s]

1000it [00:00, 80143.38it/s]


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Question 291/1000
num tokens: 89
Q: What has had a negative impact on the labor markets in the US?
A: decline of organized labor
?: The housing bubble
❌
Question 292/1000
num tokens: 86
Q: Who went to Wittenberg to hear Luther speak?
A: Students
?: Martin Luther
❌
Question 293/1000
num tokens: 86
Q: What persons were not allowed to settle in New France?
A: non-Catholics
?: The French, the Indians, and the English
❌
Question 294/1000
num tokens: 93
Q: What does the water flow of the Rhine merge with after flowing through Merwede?
A: Meuse
?: The Rhine
❌
Question 295/1000
num tokens: 91
Q: What was Tesla on his way to do when he was struck by the cab?
A: feed the pigeons
?: He was driving to the airport to pick up his wife.
❌
Question 296/1000
num tokens: 94
Q: What lies between L and P that prevents a definitive determination of the relationship between L and P?
A: complexity classes
?: The moon
❌
Question 297/1000
num tok

In [None]:
# Summarize accuracy and mean latency for the Mamba 370M run.
total_questions = len(mamba_370M_results)
total_correct = sum(result["is_correct"] for result in mamba_370M_results)
total_time = sum(result["time"] for result in mamba_370M_results)

# Guard the divisions so an empty (e.g. interrupted) run reports zeros instead
# of raising ZeroDivisionError.
accuracy_percentage = (total_correct / total_questions) * 100 if total_questions else 0.0
average_speed = total_time / total_questions if total_questions else 0.0

print(f"Total correct predictions: {total_correct}")
print(f"Total questions evaluated: {total_questions}")
print(f"Accuracy: {accuracy_percentage:.2f}%")
print(f"Average inference speed per question: {average_speed:.2f} seconds")

Total correct predictions: 19
Total questions evaluated: 1000
Accuracy: 1.90%
Average inference speed per question: 2.21 seconds


# Mamba 790M Model

In [None]:
def run_mamba(model, question):
    """Ask ``model`` a trivia question using a fixed 3-shot "Q:/A:" prompt.

    Relies on the notebook-level ``tokenizer`` global and on a CUDA device
    (the prompt ids are moved to the GPU before generation).

    NOTE(review): this function is declared identically in every model section
    of the notebook; a single definition would be enough.

    Args:
        model: object exposing ``generate(input_ids=..., max_length=...,
            eos_token_id=...)`` that returns a batch of token-id sequences
            starting with the prompt ids.
        question: question text appended to the few-shot prompt.

    Returns:
        tuple ``(answer, num_tokens)``: the generated text up to the first
        blank line (stripped), and the number of *prompt* tokens.
    """
    n_shot_prompting = [
        {
            "question": "What is the capital of France?",
            "answer": "Paris"
        },
        {
            "question": "Who invented the segway?",
            "answer": "Dean Kamen"
        },
        {
            "question": "What is the fastest animal?",
            "answer": "Cheetah"
        }
    ]

    header = "You are a Trivia QA bot.\nAnswer the following question succinctly and accurately."
    shots = "\n\n".join(f"Q: {p['question']}\nA: {p['answer']}" for p in n_shot_prompting)
    text = f"{header}\n\n{shots}\n\nQ: {question}\nA:"

    input_ids = torch.LongTensor([tokenizer.encode(text)]).cuda()
    num_tokens = input_ids.shape[1]

    # NOTE(review): max_length looks like a cap on prompt + continuation, so
    # prompts close to 128 tokens leave little room for an answer — confirm
    # against the generate() implementation in use.
    out = model.generate(
        input_ids=input_ids,
        max_length=128,
        eos_token_id=tokenizer.eos_token_id
    )

    # The returned sequence starts with the prompt ids. Drop them by position
    # instead of string-replacing the prompt text: a replace silently fails
    # (and leaks the whole prompt into the answer) whenever decoding does not
    # reproduce the prompt byte-for-byte.
    generated_ids = out[0][num_tokens:]

    # Skip special tokens so a generated "<|endoftext|>" does not end up glued
    # to the answer and break the exact-match comparison downstream.
    cleaned = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # The model keeps generating further Q/A pairs; keep only the first answer.
    answer = cleaned.split("\n\n")[0].strip()
    return answer, num_tokens

def write_results(results, output_file):
    """Write evaluation records to ``output_file`` as JSON Lines.

    Args:
        results: list of per-question dicts. Only the columns listed below are
            written, in that order.
        output_file: destination path; rewritten in full on every call.
    """
    columns = ["idx", "question", "answer", "guess", "is_correct", "time", "num_tokens", "tokens_per_sec"]
    # Fix the schema at construction time. Selecting the columns afterwards
    # with df[[...]] raises KeyError when ``results`` is empty, because an
    # empty frame has no columns at all.
    df = pd.DataFrame(results, columns=columns)

    print(f"Writing {output_file}")
    df.to_json(output_file, orient="records", lines=True)

# Evaluate the Mamba 790M checkpoint on the SQuAD validation subset.
model_name = "state-spaces/mamba-790m"
dataset = "/content/squad_val_1k.jsonl"
output_file = "/content/mamba_790M_result_squad_val_1k.jsonl"

# run_mamba() reads this ``tokenizer`` as a global.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
tokenizer.eos_token = "<|endoftext|>"
tokenizer.pad_token = tokenizer.eos_token

# Keep the checkpoint id and the loaded model under separate names so
# ``model`` never holds a plain string.
model = MambaLMHeadModel.from_pretrained(model_name, device="cuda", dtype=torch.float16)

mamba_790M_results = []

# Read the whole dataset first and close the file before the long GPU loop.
# Blank lines (e.g. a trailing newline) are skipped rather than crashing
# json.loads.
with open(dataset) as f:
    all_data = [json.loads(line) for line in tqdm(f) if line.strip()]

total_qs = len(all_data)
for i, data in enumerate(all_data):
    start_time = time.time()

    question = data["prompt"]
    answer = data["response"]
    guess, num_tokens = run_mamba(model, question)
    end_time = time.time()

    # Case-insensitive exact match after trimming surrounding whitespace.
    is_correct = (answer.strip().lower() == guess.strip().lower())
    print(f"Question {i}/{total_qs}")
    print(f"num tokens: {num_tokens}")
    print(f"Q: {question}")
    print(f"A: {answer}")
    print(f"?: {guess}")
    print("✅" if is_correct else "❌")
    print("="*80)
    sys.stdout.flush()

    num_seconds = end_time - start_time
    # num_tokens is the prompt length returned by run_mamba, so this figure is
    # prompt tokens per wall-clock second, not generation throughput.
    tkps = num_tokens / num_seconds
    mamba_790M_results.append({
        "idx": i,
        "question": question,
        "answer": answer,
        "guess": guess,
        "is_correct": is_correct,
        "time": num_seconds,
        "num_tokens": num_tokens,
        "tokens_per_sec": tkps
    })

    # Checkpoint every 20 questions so an interrupted run keeps partial results.
    if len(mamba_790M_results) % 20 == 0:
        write_results(mamba_790M_results, output_file)

write_results(mamba_790M_results, output_file)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.17G [00:00<?, ?B/s]

1000it [00:00, 71937.30it/s]


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
A: decline of organized labor
?: The Great Recession
❌
Question 292/1000
num tokens: 86
Q: Who went to Wittenberg to hear Luther speak?
A: Students
?: Martin Luther
❌
Question 293/1000
num tokens: 86
Q: What persons were not allowed to settle in New France?
A: non-Catholics
?: The French, the Indians, and the English
❌
Question 294/1000
num tokens: 93
Q: What does the water flow of the Rhine merge with after flowing through Merwede?
A: Meuse
?: The Rhine
❌
Question 295/1000
num tokens: 91
Q: What was Tesla on his way to do when he was struck by the cab?
A: feed the pigeons
?: He was going to the airport
❌
Question 296/1000
num tokens: 94
Q: What lies between L and P that prevents a definitive determination of the relationship between L and P?
A: complexity classes
?: The L-P-L-P-L-P-L-P-L-P-L-P-L-P-L-P-L
❌
Question 297/1000
num tokens: 87
Q: In what decade were injectors widely used in steam engines?
A: 1850s
?: 18th cent

In [None]:
# Summarize accuracy and mean latency for the Mamba 790M run.
total_questions = len(mamba_790M_results)
total_correct = sum(result["is_correct"] for result in mamba_790M_results)
total_time = sum(result["time"] for result in mamba_790M_results)

# Guard the divisions so an empty (e.g. interrupted) run reports zeros instead
# of raising ZeroDivisionError.
accuracy_percentage = (total_correct / total_questions) * 100 if total_questions else 0.0
average_speed = total_time / total_questions if total_questions else 0.0

print(f"Total correct predictions: {total_correct}")
print(f"Total questions evaluated: {total_questions}")
print(f"Accuracy: {accuracy_percentage:.2f}%")
print(f"Average inference speed per question: {average_speed:.2f} seconds")

Total correct predictions: 34
Total questions evaluated: 1000
Accuracy: 3.40%
Average inference speed per question: 2.21 seconds


# Mamba 1.4B Model

In [None]:
def run_mamba(model, question):
    """Ask ``model`` a trivia question using a fixed 3-shot "Q:/A:" prompt.

    Relies on the notebook-level ``tokenizer`` global and on a CUDA device
    (the prompt ids are moved to the GPU before generation).

    NOTE(review): this function is declared identically in every model section
    of the notebook; a single definition would be enough.

    Args:
        model: object exposing ``generate(input_ids=..., max_length=...,
            eos_token_id=...)`` that returns a batch of token-id sequences
            starting with the prompt ids.
        question: question text appended to the few-shot prompt.

    Returns:
        tuple ``(answer, num_tokens)``: the generated text up to the first
        blank line (stripped), and the number of *prompt* tokens.
    """
    n_shot_prompting = [
        {
            "question": "What is the capital of France?",
            "answer": "Paris"
        },
        {
            "question": "Who invented the segway?",
            "answer": "Dean Kamen"
        },
        {
            "question": "What is the fastest animal?",
            "answer": "Cheetah"
        }
    ]

    header = "You are a Trivia QA bot.\nAnswer the following question succinctly and accurately."
    shots = "\n\n".join(f"Q: {p['question']}\nA: {p['answer']}" for p in n_shot_prompting)
    text = f"{header}\n\n{shots}\n\nQ: {question}\nA:"

    input_ids = torch.LongTensor([tokenizer.encode(text)]).cuda()
    num_tokens = input_ids.shape[1]

    # NOTE(review): max_length looks like a cap on prompt + continuation, so
    # prompts close to 128 tokens leave little room for an answer — confirm
    # against the generate() implementation in use.
    out = model.generate(
        input_ids=input_ids,
        max_length=128,
        eos_token_id=tokenizer.eos_token_id
    )

    # The returned sequence starts with the prompt ids. Drop them by position
    # instead of string-replacing the prompt text: a replace silently fails
    # (and leaks the whole prompt into the answer) whenever decoding does not
    # reproduce the prompt byte-for-byte.
    generated_ids = out[0][num_tokens:]

    # Skip special tokens so a generated "<|endoftext|>" does not end up glued
    # to the answer and break the exact-match comparison downstream.
    cleaned = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # The model keeps generating further Q/A pairs; keep only the first answer.
    answer = cleaned.split("\n\n")[0].strip()
    return answer, num_tokens

def write_results(results, output_file):
    """Write evaluation records to ``output_file`` as JSON Lines.

    Args:
        results: list of per-question dicts. Only the columns listed below are
            written, in that order.
        output_file: destination path; rewritten in full on every call.
    """
    columns = ["idx", "question", "answer", "guess", "is_correct", "time", "num_tokens", "tokens_per_sec"]
    # Fix the schema at construction time. Selecting the columns afterwards
    # with df[[...]] raises KeyError when ``results`` is empty, because an
    # empty frame has no columns at all.
    df = pd.DataFrame(results, columns=columns)

    print(f"Writing {output_file}")
    df.to_json(output_file, orient="records", lines=True)

# Evaluate the Mamba 1.4B checkpoint on the SQuAD validation subset.
model_name = "state-spaces/mamba-1.4b"
dataset = "/content/squad_val_1k.jsonl"
output_file = "/content/mamba_1.4B_result_squad_val_1k.jsonl"

# run_mamba() reads this ``tokenizer`` as a global.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
tokenizer.eos_token = "<|endoftext|>"
tokenizer.pad_token = tokenizer.eos_token

# Keep the checkpoint id and the loaded model under separate names so
# ``model`` never holds a plain string.
model = MambaLMHeadModel.from_pretrained(model_name, device="cuda", dtype=torch.float16)

# The "14B" in this name stands for the 1.4B checkpoint (the dot is not valid
# in an identifier); the summary cell reads the same variable.
mamba_14B_results = []

# Read the whole dataset first and close the file before the long GPU loop.
# Blank lines (e.g. a trailing newline) are skipped rather than crashing
# json.loads.
with open(dataset) as f:
    all_data = [json.loads(line) for line in tqdm(f) if line.strip()]

total_qs = len(all_data)
for i, data in enumerate(all_data):
    start_time = time.time()

    question = data["prompt"]
    answer = data["response"]
    guess, num_tokens = run_mamba(model, question)
    end_time = time.time()

    # Case-insensitive exact match after trimming surrounding whitespace.
    is_correct = (answer.strip().lower() == guess.strip().lower())
    print(f"Question {i}/{total_qs}")
    print(f"num tokens: {num_tokens}")
    print(f"Q: {question}")
    print(f"A: {answer}")
    print(f"?: {guess}")
    print("✅" if is_correct else "❌")
    print("="*80)
    sys.stdout.flush()

    num_seconds = end_time - start_time
    # num_tokens is the prompt length returned by run_mamba, so this figure is
    # prompt tokens per wall-clock second, not generation throughput.
    tkps = num_tokens / num_seconds
    mamba_14B_results.append({
        "idx": i,
        "question": question,
        "answer": answer,
        "guess": guess,
        "is_correct": is_correct,
        "time": num_seconds,
        "num_tokens": num_tokens,
        "tokens_per_sec": tkps
    })

    # Checkpoint every 20 questions so an interrupted run keeps partial results.
    if len(mamba_14B_results) % 20 == 0:
        write_results(mamba_14B_results, output_file)

write_results(mamba_14B_results, output_file)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
1000it [00:00, 93903.73it/s]


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Question 291/1000
num tokens: 89
Q: What has had a negative impact on the labor markets in the US?
A: decline of organized labor
?: The Great Depression
❌
Question 292/1000
num tokens: 86
Q: Who went to Wittenberg to hear Luther speak?
A: Students
?: Martin Luther
❌
Question 293/1000
num tokens: 86
Q: What persons were not allowed to settle in New France?
A: non-Catholics
?: The French
❌
Question 294/1000
num tokens: 93
Q: What does the water flow of the Rhine merge with after flowing through Merwede?
A: Meuse
?: The Rhine
❌
Question 295/1000
num tokens: 91
Q: What was Tesla on his way to do when he was struck by the cab?
A: feed the pigeons
?: He was going to buy a new car
❌
Question 296/1000
num tokens: 94
Q: What lies between L and P that prevents a definitive determination of the relationship between L and P?
A: complexity classes
?: The letter L
❌
Question 297/1000
num tokens: 87
Q: In what decade were injectors wide

In [None]:
# Summarize accuracy and mean latency for the Mamba 1.4B run
# (results are stored in ``mamba_14B_results``).
total_questions = len(mamba_14B_results)
total_correct = sum(result["is_correct"] for result in mamba_14B_results)
total_time = sum(result["time"] for result in mamba_14B_results)

# Guard the divisions so an empty (e.g. interrupted) run reports zeros instead
# of raising ZeroDivisionError.
accuracy_percentage = (total_correct / total_questions) * 100 if total_questions else 0.0
average_speed = total_time / total_questions if total_questions else 0.0

print(f"Total correct predictions: {total_correct}")
print(f"Total questions evaluated: {total_questions}")
print(f"Accuracy: {accuracy_percentage:.2f}%")
print(f"Average inference speed per question: {average_speed:.2f} seconds")

Total correct predictions: 44
Total questions evaluated: 1000
Accuracy: 4.40%
Average inference speed per question: 2.09 seconds


# Mamba 2.8B Model

In [None]:
def run_mamba(model, question):
    """Ask ``model`` a trivia question using a fixed 3-shot "Q:/A:" prompt.

    Relies on the notebook-level ``tokenizer`` global and on a CUDA device
    (the prompt ids are moved to the GPU before generation).

    NOTE(review): this function is declared identically in every model section
    of the notebook; a single definition would be enough.

    Args:
        model: object exposing ``generate(input_ids=..., max_length=...,
            eos_token_id=...)`` that returns a batch of token-id sequences
            starting with the prompt ids.
        question: question text appended to the few-shot prompt.

    Returns:
        tuple ``(answer, num_tokens)``: the generated text up to the first
        blank line (stripped), and the number of *prompt* tokens.
    """
    n_shot_prompting = [
        {
            "question": "What is the capital of France?",
            "answer": "Paris"
        },
        {
            "question": "Who invented the segway?",
            "answer": "Dean Kamen"
        },
        {
            "question": "What is the fastest animal?",
            "answer": "Cheetah"
        }
    ]

    header = "You are a Trivia QA bot.\nAnswer the following question succinctly and accurately."
    shots = "\n\n".join(f"Q: {p['question']}\nA: {p['answer']}" for p in n_shot_prompting)
    text = f"{header}\n\n{shots}\n\nQ: {question}\nA:"

    input_ids = torch.LongTensor([tokenizer.encode(text)]).cuda()
    num_tokens = input_ids.shape[1]

    # NOTE(review): max_length looks like a cap on prompt + continuation, so
    # prompts close to 128 tokens leave little room for an answer — confirm
    # against the generate() implementation in use.
    out = model.generate(
        input_ids=input_ids,
        max_length=128,
        eos_token_id=tokenizer.eos_token_id
    )

    # The returned sequence starts with the prompt ids. Drop them by position
    # instead of string-replacing the prompt text: a replace silently fails
    # (and leaks the whole prompt into the answer) whenever decoding does not
    # reproduce the prompt byte-for-byte.
    generated_ids = out[0][num_tokens:]

    # Skip special tokens so a generated "<|endoftext|>" does not end up glued
    # to the answer and break the exact-match comparison downstream.
    cleaned = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # The model keeps generating further Q/A pairs; keep only the first answer.
    answer = cleaned.split("\n\n")[0].strip()
    return answer, num_tokens

def write_results(results, output_file):
    """Write evaluation records to ``output_file`` as JSON Lines.

    Args:
        results: list of per-question dicts. Only the columns listed below are
            written, in that order.
        output_file: destination path; rewritten in full on every call.
    """
    columns = ["idx", "question", "answer", "guess", "is_correct", "time", "num_tokens", "tokens_per_sec"]
    # Fix the schema at construction time. Selecting the columns afterwards
    # with df[[...]] raises KeyError when ``results`` is empty, because an
    # empty frame has no columns at all.
    df = pd.DataFrame(results, columns=columns)

    print(f"Writing {output_file}")
    df.to_json(output_file, orient="records", lines=True)

# Evaluate the Mamba 2.8B checkpoint on the SQuAD validation subset.
model_name = "state-spaces/mamba-2.8b"
dataset = "/content/squad_val_1k.jsonl"
output_file = "/content/mamba_2.8B_result_squad_val_1k.jsonl"

# run_mamba() reads this ``tokenizer`` as a global.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
tokenizer.eos_token = "<|endoftext|>"
tokenizer.pad_token = tokenizer.eos_token

# Keep the checkpoint id and the loaded model under separate names so
# ``model`` never holds a plain string.
model = MambaLMHeadModel.from_pretrained(model_name, device="cuda", dtype=torch.float16)

# The "28B" in this name stands for the 2.8B checkpoint (the dot is not valid
# in an identifier); the summary cell reads the same variable.
mamba_28B_results = []

# Read the whole dataset first and close the file before the long GPU loop.
# Blank lines (e.g. a trailing newline) are skipped rather than crashing
# json.loads.
with open(dataset) as f:
    all_data = [json.loads(line) for line in tqdm(f) if line.strip()]

total_qs = len(all_data)
for i, data in enumerate(all_data):
    start_time = time.time()

    question = data["prompt"]
    answer = data["response"]
    guess, num_tokens = run_mamba(model, question)
    end_time = time.time()

    # Case-insensitive exact match after trimming surrounding whitespace.
    is_correct = (answer.strip().lower() == guess.strip().lower())
    print(f"Question {i}/{total_qs}")
    print(f"num tokens: {num_tokens}")
    print(f"Q: {question}")
    print(f"A: {answer}")
    print(f"?: {guess}")
    print("✅" if is_correct else "❌")
    print("="*80)
    sys.stdout.flush()

    num_seconds = end_time - start_time
    # num_tokens is the prompt length returned by run_mamba, so this figure is
    # prompt tokens per wall-clock second, not generation throughput.
    tkps = num_tokens / num_seconds
    mamba_28B_results.append({
        "idx": i,
        "question": question,
        "answer": answer,
        "guess": guess,
        "is_correct": is_correct,
        "time": num_seconds,
        "num_tokens": num_tokens,
        "tokens_per_sec": tkps
    })

    # Checkpoint every 20 questions so an interrupted run keeps partial results.
    if len(mamba_28B_results) % 20 == 0:
        write_results(mamba_28B_results, output_file)

write_results(mamba_28B_results, output_file)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/457k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/11.1G [00:00<?, ?B/s]

1000it [00:00, 36695.57it/s]


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
?: The Great Recession
❌
Question 292/1000
num tokens: 86
Q: Who went to Wittenberg to hear Luther speak?
A: Students
?: Martin Luther
❌
Question 293/1000
num tokens: 86
Q: What persons were not allowed to settle in New France?
A: non-Catholics
?: The Acadians
❌
Question 294/1000
num tokens: 93
Q: What does the water flow of the Rhine merge with after flowing through Merwede?
A: Meuse
?: Rhine
❌
Question 295/1000
num tokens: 91
Q: What was Tesla on his way to do when he was struck by the cab?
A: feed the pigeons
?: He was on his way to the White House to meet with the President.
❌
Question 296/1000
num tokens: 94
Q: What lies between L and P that prevents a definitive determination of the relationship between L and P?
A: complexity classes
?: L
❌
Question 297/1000
num tokens: 87
Q: In what decade were injectors widely used in steam engines?
A: 1850s
?: 1840s
❌
Question 298/1000
num tokens: 90
Q: What kind of non-peer-revi

In [None]:
# Summarize accuracy and mean latency for the Mamba 2.8B run
# (results are stored in ``mamba_28B_results``).
total_questions = len(mamba_28B_results)
total_correct = sum(result["is_correct"] for result in mamba_28B_results)
total_time = sum(result["time"] for result in mamba_28B_results)

# Guard the divisions so an empty (e.g. interrupted) run reports zeros instead
# of raising ZeroDivisionError.
accuracy_percentage = (total_correct / total_questions) * 100 if total_questions else 0.0
average_speed = total_time / total_questions if total_questions else 0.0

print(f"Total correct predictions: {total_correct}")
print(f"Total questions evaluated: {total_questions}")
print(f"Accuracy: {accuracy_percentage:.2f}%")
print(f"Average inference speed per question: {average_speed:.2f} seconds")

Total correct predictions: 54
Total questions evaluated: 1000
Accuracy: 5.40%
Average inference speed per question: 2.75 seconds
