In [None]:
!pip install transformers datasets torch ipywidgets sentence_transformers matplotlib nltk bitsandbytes accelerate

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
print(torch.cuda.is_available())

In [None]:
import pickle

In [None]:
!nvidia-smi

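**Note:** a Hugging Face access token was previously pasted here in plain text and has been redacted. Revoke that token and supply a fresh one interactively through `login()` below (or via the `HF_TOKEN` environment variable) instead of storing it in the notebook.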

In [None]:
from huggingface_hub import login

login()

In [None]:
import time
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

#### Models

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig

Llama7b

In [None]:
# Llama-2 7B chat: load the tokenizer, then the float16 model, then move it to the GPU.
model_name = 'meta-llama/Llama-2-7b-chat-hf'

llama7b_tokenizer = AutoTokenizer.from_pretrained(model_name)
llama7b = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
)
llama7b = llama7b.to("cuda")

TinyLlama

In [None]:
# TinyLlama 1.1B chat: load the tokenizer, then the float16 model, then move it to the GPU.
model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'

tinyllama_tokenizer = AutoTokenizer.from_pretrained(model_name)
tinyllama = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
)
tinyllama = tinyllama.to("cuda")

Llama13b

In [None]:
llama13_config = BitsAndBytesConfig(load_in_4bit=True,
                                    bnb_4bit_compute_dtype=torch.float16)

In [None]:
# Llama-2 13B chat, loaded with the 4-bit `llama13_config` quantization settings.
# No explicit .to("cuda") here: placement is delegated to device_map='auto'.
model_name = 'meta-llama/Llama-2-13b-chat-hf'

llama13b_tokenizer = AutoTokenizer.from_pretrained(model_name)
llama13b = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=llama13_config,
    device_map='auto',
)

### Basic Model Loading + Inference

#### Llama 7b

In [None]:
# Smoke test: time one short free-form generation with Llama-2 7B
# (tokenization and decoding are included in the measured time).
start_time = time.time()

input_text = "Once upon a time in a land far, far away"
# Place the encoded prompt on whatever device the model weights live on.
inputs = llama7b_tokenizer(input_text, return_tensors="pt").to(llama7b.device)

# Unpack the full encoding so attention_mask is passed along with input_ids.
output = llama7b.generate(**inputs, max_length=100)

output_text = llama7b_tokenizer.decode(output[0], skip_special_tokens=True)
print(output_text)

execution_time = time.time() - start_time
print()
print(f"Execution time: {execution_time} seconds")

#### TinyLlama

In [None]:
# Smoke test: time one short free-form generation with TinyLlama
# (tokenization and decoding are included in the measured time).
start_time = time.time()

input_text = "Once upon a time in a land far, far away"
# Place the encoded prompt on whatever device the model weights live on.
inputs = tinyllama_tokenizer(input_text, return_tensors="pt").to(tinyllama.device)

# Unpack the full encoding so attention_mask is passed along with input_ids.
output = tinyllama.generate(**inputs, max_length=100)

output_text = tinyllama_tokenizer.decode(output[0], skip_special_tokens=True)
print(output_text)

execution_time = time.time() - start_time
print()
print(f"Execution time: {execution_time} seconds")

#### Llama 13b

In [None]:
# Smoke test: time one short free-form generation with the quantized Llama-2 13B
# (tokenization and decoding are included in the measured time).
start_time = time.time()

input_text = "Once upon a time in a land far, far away"
# The 13B model is placed by device_map='auto', so ask the model where its
# weights are instead of assuming "cuda".
inputs = llama13b_tokenizer(input_text, return_tensors="pt").to(llama13b.device)

# Unpack the full encoding so attention_mask is passed along with input_ids.
output = llama13b.generate(**inputs, max_length=100)

output_text = llama13b_tokenizer.decode(output[0], skip_special_tokens=True)
print(output_text)

execution_time = time.time() - start_time
print()
print(f"Execution time: {execution_time} seconds")

In [None]:
torch.cuda.empty_cache()

### **WMT 2014 (Machine Translation)** 
is a collection of datasets used in shared tasks of the Ninth Workshop on Statistical Machine Translation

In [None]:
from datasets import load_dataset

wmt14_dataset = load_dataset('wmt14', 'de-en', split='test')

#### Example inference

In [None]:
# Build a translation prompt from the German side of the first test pair
# (the 'en' side is the reference translation, not the text to translate).
input_text = wmt14_dataset[0]['translation']['de']
input_prompt = f"Translate to English: {input_text}"

# Encode the prompt and place it on the same device as the model.
inputs = llama7b_tokenizer(input_prompt, return_tensors="pt").to(llama7b.device)

In [None]:
# Move the encoded prompt to the model's device before generating, and pass the
# attention mask with it. max_new_tokens bounds only the generated continuation,
# so a long prompt cannot use up the whole length budget (matches the loops below).
output = llama7b.generate(**inputs.to(llama7b.device), max_new_tokens=50)

output_text = llama7b_tokenizer.decode(output[0], skip_special_tokens=True)
print(output_text)

#### Llama 7b

In [None]:
# Translate the first few WMT14 test sentences with Llama-2 7B and time the loop.
num_examples = 5

# Start the timer in this cell so the reported time covers only this loop.
start_time = time.time()
for i in range(num_examples):
    input_text = wmt14_dataset[i]['translation']['de']
    input_prompt = f"Translate to English: {input_text}"

    # The prompt is truncated to at most 50 tokens and placed on the model's device.
    inputs = llama7b_tokenizer(
        input_prompt, return_tensors="pt", truncation=True, max_length=50
    ).to(llama7b.device)
    # Unpack the encoding so attention_mask accompanies input_ids.
    output = llama7b.generate(**inputs, max_new_tokens=50)
    output_text = llama7b_tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"Input: {input_prompt}")
    print(f"Output: {output_text}")
    print()

execution_time = time.time() - start_time
print()
print(f"Execution time: {execution_time} seconds")

In [None]:
torch.cuda.empty_cache()

#### Tiny Llama

In [None]:
# Translate the first few WMT14 test sentences with TinyLlama and time the loop.
num_examples = 5

# Start the timer in this cell so the reported time covers only this loop.
start_time = time.time()
for i in range(num_examples):
    input_text = wmt14_dataset[i]['translation']['de']
    input_prompt = f"Translate to English: {input_text}"

    # The prompt is truncated to at most 50 tokens and placed on the model's device.
    inputs = tinyllama_tokenizer(
        input_prompt, return_tensors="pt", truncation=True, max_length=50
    ).to(tinyllama.device)
    # Unpack the encoding so attention_mask accompanies input_ids.
    output = tinyllama.generate(**inputs, max_new_tokens=50)
    output_text = tinyllama_tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"Input: {input_prompt}")
    print(f"Output: {output_text}")
    print()

execution_time = time.time() - start_time
print()
print(f"Execution time: {execution_time} seconds")

In [None]:
torch.cuda.empty_cache()

#### Llama 13b

In [None]:
# Translate the first few WMT14 test sentences with Llama-2 13B and time the loop.
num_examples = 5

# Start the timer in this cell so the reported time covers only this loop.
start_time = time.time()
for i in range(num_examples):
    input_text = wmt14_dataset[i]['translation']['de']
    input_prompt = f"Translate to English: {input_text}"

    # The prompt is truncated to at most 50 tokens and placed on the model's device.
    inputs = llama13b_tokenizer(
        input_prompt, return_tensors="pt", truncation=True, max_length=50
    ).to(llama13b.device)
    # Unpack the encoding so attention_mask accompanies input_ids.
    output = llama13b.generate(**inputs, max_new_tokens=50)
    output_text = llama13b_tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"Input: {input_prompt}")
    print(f"Output: {output_text}")
    print()

execution_time = time.time() - start_time
print()
print(f"Execution time: {execution_time} seconds")

In [None]:
torch.cuda.empty_cache()

#### Functionalize Inference

In [None]:
# Histogram of German source lengths, measured in Llama-2 13B tokenizer tokens.
# The final bucket catches everything above 150 tokens so that the counts
# always add up to the size of the dataset.
token_ranges = {
    '0-50': 0,
    '51-100': 0,
    '101-150': 0,
    '151+': 0
}

max_tokens = -1
max_tokens_idx = -1  # defined up front so the summary below never hits an unbound name

for idx, data in enumerate(wmt14_dataset):
    input_text = data['translation']['de']
    # Only the token count is needed, so plain Python lists are enough here.
    num_tokens = len(llama13b_tokenizer(input_text)['input_ids'])

    if num_tokens > max_tokens:
        max_tokens = num_tokens
        max_tokens_idx = idx

    if num_tokens <= 50:
        token_ranges['0-50'] += 1
    elif num_tokens <= 100:
        token_ranges['51-100'] += 1
    elif num_tokens <= 150:
        token_ranges['101-150'] += 1
    else:
        token_ranges['151+'] += 1

print("Number of data points in different token ranges:")
for key, value in token_ranges.items():
    print(f"{key}: {value}")

print(f"\nData point with the most tokens is at index: {max_tokens_idx}")
print(f"Number of tokens: {max_tokens}")
print(f"Input text: {wmt14_dataset[max_tokens_idx]['translation']['de']}")

In [None]:
def generate_output(model, tokenizer, dataset, current_idx):
    """Translate one German sentence to English and keep only its first sentence.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        dataset: Either the German source sentence itself (a ``str``, which is
            what the inference loops in this notebook pass) or an indexable
            dataset whose rows look like ``{'translation': {'de': ...}}``.
        current_idx: Row index; only used when ``dataset`` is not a string.

    Returns:
        A one-element list containing the cleaned translation.
    """
    # Honour the `dataset` argument instead of reading a notebook global.
    if isinstance(dataset, str):
        input_text = dataset
    else:
        input_text = dataset[current_idx]['translation']['de']
    input_prompt = "Translate the sentence from German to English: \n\n" + input_text + "\n\n Write the translation here: "

    # Send the encoded prompt to wherever the model's weights live.
    inputs = tokenizer(input_prompt, return_tensors="pt", truncation=True).to(model.device)
    # Unpack the encoding so attention_mask is passed along with input_ids.
    output = model.generate(**inputs)
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # The decoded text still contains the prompt; keep what follows the answer marker.
    answer_prefix = "Write the translation here: "
    if answer_prefix in output_text:
        cleaned_output = output_text.split(answer_prefix)[-1].strip()
    else:
        cleaned_output = output_text.strip()

    # Cut at the first full stop, if there is one.
    if '.' in cleaned_output:
        first_sentence = cleaned_output.split('.')[0] + '.'
    else:
        first_sentence = cleaned_output

    return [first_sentence]

## Inference 

In [None]:
input_texts = []
outputs_7b = []
outputs_tiny = []
outputs_13b = []

In [None]:
# NOTE(review): the loop starts at 396, which looks like the resume point of an
# interrupted run. With a freshly reset `outputs_7b`, list positions will not
# line up with dataset indices; confirm before relying on that alignment.
for current_idx in range(396, 1000):
    # The helper's third parameter is the dataset; the row is selected by index.
    output_7b = generate_output(llama7b, llama7b_tokenizer, wmt14_dataset, current_idx)

    outputs_7b.append(output_7b)

    print(f"Llama-7b | CURRENT IDX: {current_idx} | Length: {len(outputs_7b)}")
    # with open('input_output_pairs_wmt14_7b', 'wb') as f:
    #     pickle.dump(outputs_7b, f)

In [None]:
with open('input_output_pairs_wmt14_7b', 'rb') as f:
    outputs_7b = pickle.load(f)

print(len(outputs_7b))
print(outputs_7b[998:1000])

In [None]:
# Translate the first 1000 WMT14 test sentences with TinyLlama.
for current_idx in range(0, 1000):
    # The helper's third parameter is the dataset; the row is selected by index.
    output_tiny = generate_output(tinyllama, tinyllama_tokenizer, wmt14_dataset, current_idx)

    outputs_tiny.append(output_tiny)

    print(f"TinyLlama | CURRENT IDX: {current_idx} | Length: {len(outputs_tiny)}")
    # with open('input_output_pairs_wmt14_tiny', 'wb') as f:
    #     pickle.dump(outputs_tiny, f)

In [None]:
with open('input_output_pairs_wmt14_tiny', 'rb') as f:
    outputs_tiny = pickle.load(f)

print(len(outputs_tiny))
print(outputs_tiny[:10])

In [None]:
# NOTE(review): the loop starts at 980, which looks like the resume point of an
# interrupted run. With a freshly reset `outputs_13b`, list positions will not
# line up with dataset indices; confirm before relying on that alignment.
for current_idx in range(980, 1000):
    # The helper's third parameter is the dataset; the row is selected by index.
    output_13b = generate_output(llama13b, llama13b_tokenizer, wmt14_dataset, current_idx)

    outputs_13b.append(output_13b)

    print(f"Llama13b | CURRENT IDX: {current_idx} | Length: {len(outputs_13b)}")
    # with open('input_output_pairs_wmt14_13b', 'wb') as f:
    #     pickle.dump(outputs_13b, f)

In [None]:
with open('input_output_pairs_wmt14_13b', 'rb') as f:
    outputs_13b = pickle.load(f)

print(len(outputs_13b))
print(outputs_13b[999])

In [None]:
input_output_pairs = []

In [None]:
wmt14_dataset[500]['translation']['de']

In [None]:
# Pair every German source sentence with the three model outputs that share its
# index; the number of records is driven by the length of outputs_7b.
for idx, translation_7b in enumerate(outputs_7b):
    input_output_pairs.append({
        'input': wmt14_dataset[idx]['translation']['de'],
        'output_7b': translation_7b,
        'output_tiny': outputs_tiny[idx],
        'output_13b': outputs_13b[idx],
    })

# with open('input_output_pairs_wmt14.pkl', 'wb') as f:
#     pickle.dump(input_output_pairs, f)

In [None]:
with open('input_output_pairs_wmt14.pkl', 'rb') as f:
    input_output_pairs = pickle.load(f)

len(input_output_pairs)

In [None]:
print(input_output_pairs[:100])

### **CNN_Dailymail (Summarization)**
is an English-language dataset containing just over 300k unique news articles as written by journalists at CNN and the Daily Mail

In [None]:
from datasets import load_dataset

cnn_dailymail_dataset = load_dataset('abisee/cnn_dailymail', '2.0.0', split='test')

In [None]:
input_text = cnn_dailymail_dataset[100]['article'] 
input_prompt = "Summarize the following text in under 50 words: \n\n" + input_text + "\n\n Write the summary here: "

inputs = tinyllama_tokenizer(input_prompt, return_tensors="pt", truncation=True).to("cuda")

In [None]:
output = tinyllama.generate(inputs['input_ids'], max_new_tokens=100)

output_text = tinyllama_tokenizer.decode(output[0], skip_special_tokens=True)

print(f"Input: {input_prompt}")
print(f"Output: {output_text}")

In [None]:
summary_prefix = "Write the summary here: "
if summary_prefix in output_text:
    cleaned_output = output_text.split(summary_prefix)[-1].strip()
else:
    cleaned_output = output_text.strip()

print(cleaned_output)

In [None]:
# Histogram of article lengths (in Llama-2 7B tokenizer tokens) using buckets of
# width 100 up to 1000, plus one catch-all bucket for anything longer.
bucket_labels = (
    ['0-100']
    + [f'{lower + 1}-{lower + 100}' for lower in range(100, 1000, 100)]
    + ['1001+']
)
token_ranges = dict.fromkeys(bucket_labels, 0)

max_tokens = -1
max_tokens_idx = -1

for idx, data in enumerate(cnn_dailymail_dataset):
    input_text = data['article']
    tokens = llama7b_tokenizer(input_text, return_tensors="pt")
    num_tokens = len(tokens['input_ids'][0])

    # Remember the longest article seen so far.
    if num_tokens > max_tokens:
        max_tokens, max_tokens_idx = num_tokens, idx

    # Derive the bucket label arithmetically instead of walking an if/elif ladder.
    if num_tokens > 1000:
        label = '1001+'
    elif num_tokens <= 100:
        label = '0-100'
    else:
        lower = ((num_tokens - 1) // 100) * 100
        label = f'{lower + 1}-{lower + 100}'
    token_ranges[label] += 1

print("Number of data points in different token ranges:")
for label, count in token_ranges.items():
    print(f"{label}: {count}")

print(f"\nData point with the most tokens is at index: {max_tokens_idx}")
print(f"Number of tokens: {max_tokens}")
print(f"Input text: {cnn_dailymail_dataset[max_tokens_idx]['article']}")

In [None]:
def generate_output(model, tokenizer, dataset, current_idx):
    """Summarize one news article with ``model`` and return the cleaned summary.

    Note: this redefines the ``generate_output`` declared earlier in the
    notebook for translation; from this cell on the name refers to the
    summarization variant.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        dataset: Either the article text itself (a ``str``, which is what the
            inference loops in this notebook pass) or an indexable dataset
            whose rows look like ``{'article': ...}``.
        current_idx: Row index; only used when ``dataset`` is not a string.

    Returns:
        A one-element list containing the cleaned summary.
    """
    # Honour the `dataset` argument instead of reading a notebook global.
    if isinstance(dataset, str):
        input_text = dataset
    else:
        input_text = dataset[current_idx]['article']
    input_prompt = "Summarize the following text in under 50 words: \n\n" + input_text + "\n\n Write the summary here: "

    # Send the encoded prompt to wherever the model's weights live.
    inputs = tokenizer(input_prompt, return_tensors="pt", truncation=True).to(model.device)
    # Unpack the encoding so attention_mask is passed along with input_ids;
    # at most 100 new tokens are generated for the summary.
    output = model.generate(**inputs, max_new_tokens=100)
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # The decoded text still contains the prompt; keep what follows the summary marker.
    summary_prefix = "Write the summary here: "
    if summary_prefix in output_text:
        cleaned_output = output_text.split(summary_prefix)[-1].strip()
    else:
        cleaned_output = output_text.strip()

    return [cleaned_output]

In [None]:
input_texts = []
outputs_7b = []
outputs_tiny = []
outputs_13b = []

In [None]:
# for current_idx in range(0, 1000):
#     input_text = cnn_dailymail_dataset[current_idx]['article']
#     output_7b = generate_output(llama7b, llama7b_tokenizer, input_text, current_idx)

#     outputs_7b.append(output_7b)
    
#     print(f"Llama-7b | CURRENT IDX: {current_idx} | Length: {len(outputs_7b)}")
#     with open('input_output_pairs_cnn_dailymail_7b', 'wb') as f:
#         pickle.dump(outputs_7b, f)

In [None]:
with open('input_output_pairs_cnn_dailymail_7b', 'rb') as f:
    outputs_7b = pickle.load(f)

print(len(outputs_7b))
# print(outputs_7b)

In [None]:
# for current_idx in range(0, 1000):
#     input_text = cnn_dailymail_dataset[current_idx]['article']
#     output_tiny = generate_output(tinyllama, tinyllama_tokenizer, input_text, current_idx)
    
#     outputs_tiny.append(output_tiny)
    
#     print(f"TinyLlama | CURRENT IDX: {current_idx} | Length: {len(outputs_tiny)}")
#     with open('input_output_pairs_cnn_dailymail_tinyllama', 'wb') as f:
#         pickle.dump(outputs_tiny, f)

In [None]:
with open('input_output_pairs_cnn_dailymail_tinyllama', 'rb') as f:
    outputs_tiny = pickle.load(f)

print(len(outputs_tiny))
# print(outputs_tiny)

In [None]:
# for current_idx in range(604, 1000):
#     input_text = cnn_dailymail_dataset[current_idx]['article']
#     output_13b = generate_output(llama13b, llama13b_tokenizer, input_text, current_idx)
    
#     outputs_13b.append(output_13b)
    
#     print(f"Llama-13b | CURRENT IDX: {current_idx} | Length: {len(outputs_13b)}")
#     with open('input_output_pairs_cnn_dailymail_13b.pkl', 'wb') as f:
#         pickle.dump(outputs_13b, f)

In [None]:
with open('input_output_pairs_cnn_dailymail_13b.pkl', 'rb') as f:
    outputs_13b = pickle.load(f)

print(len(outputs_13b))
# print(outputs_13b)

In [None]:
!nvidia-smi

In [None]:
input_output_pairs = []

In [None]:
# for idx in range(len(cnn_dailymail_dataset)):
#     outputs = {
#         'input': cnn_dailymail_dataset[idx]["article"],
#         'output_7b': outputs_7b[idx],
#         'output_tiny': outputs_tiny[idx],
#         'output_13b': outputs_13b[idx]
#     }
    
#     input_output_pairs.append(outputs)
    
#     print("---------------------------------------------------------------------------")
#     print(f"CURRENT IDX: {idx}")
#     print(f"Length: {len(input_output_pairs)}")
#     # print(f"Current Dataset: {input_output_pairs[-1]}")
#     with open('input_output_pairs_cnn_dailymail.pkl', 'wb') as f:
#         pickle.dump(input_output_pairs, f)
#     print("---------------------------------------------------------------------------")

### **GSM8K (Math)**
is a dataset of 8.5K high quality linguistically diverse grade school math word problems. The dataset was created to support the task of question answering on basic mathematical problems that require multi-step reasoning.

In [None]:
from datasets import load_dataset

gsm8k_dataset = load_dataset('openai/gsm8k', 'main', split='train')

In [None]:
input_text = gsm8k_dataset[0]['question'] 
input_prompt = "Answer the following math question: \n\n" + input_text + "\n\n Lets think step by step: "

inputs = tinyllama_tokenizer(input_prompt, return_tensors="pt").to("cuda")

In [None]:
output = tinyllama.generate(inputs['input_ids'])

output_text = tinyllama_tokenizer.decode(output[0], skip_special_tokens=True)

print(f"Input: {input_prompt}")
print(f"Output: {output_text}")

In [None]:
answer_prefix = "Lets think step by step: "
if answer_prefix in output_text:
    cleaned_output = output_text.split(answer_prefix)[-1].strip()
else:
    cleaned_output = output_text.strip()

print(cleaned_output)

In [None]:
# Histogram of question lengths, measured in Llama-2 7B tokenizer tokens.
# The final bucket catches everything above 250 tokens so that the counts
# always add up to the size of the dataset.
token_ranges = {
    '0-50': 0,
    '51-100': 0,
    '101-150': 0,
    '151-200': 0,
    '201-250': 0,
    '251+': 0
}

max_tokens = -1
max_tokens_idx = -1  # defined up front so the summary below never hits an unbound name

for idx, data in enumerate(gsm8k_dataset):
    input_text = data['question']
    # Only the token count is needed, so plain Python lists are enough here.
    num_tokens = len(llama7b_tokenizer(input_text)['input_ids'])

    if num_tokens > max_tokens:
        max_tokens = num_tokens
        max_tokens_idx = idx

    if num_tokens <= 50:
        token_ranges['0-50'] += 1
    elif num_tokens <= 100:
        token_ranges['51-100'] += 1
    elif num_tokens <= 150:
        token_ranges['101-150'] += 1
    elif num_tokens <= 200:
        token_ranges['151-200'] += 1
    elif num_tokens <= 250:
        token_ranges['201-250'] += 1
    else:
        token_ranges['251+'] += 1

print("Number of data points in different token ranges:")
for key, value in token_ranges.items():
    print(f"{key}: {value}")

print(f"\nData point with the most tokens is at index: {max_tokens_idx}")
print(f"Number of tokens: {max_tokens}")
print(f"Input text: {gsm8k_dataset[max_tokens_idx]['question']}")

In [None]:
def generate_output(model, tokenizer, dataset, current_idx):
    """Answer one math word problem with ``model`` and return the cleaned answer.

    Note: this redefines the ``generate_output`` declared earlier in the
    notebook; from this cell on the name refers to the math-question variant.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        dataset: Either the question text itself (a ``str``, which is what the
            inference loops in this notebook pass) or an indexable dataset
            whose rows look like ``{'question': ...}``.
        current_idx: Row index; only used when ``dataset`` is not a string.

    Returns:
        A one-element list containing the cleaned model answer.
    """
    # Honour the `dataset` argument instead of reading a notebook global.
    if isinstance(dataset, str):
        input_text = dataset
    else:
        input_text = dataset[current_idx]['question']
    input_prompt = "Answer the following math question: \n\n" + input_text + "\n\n Lets think step by step: "

    # Send the encoded prompt to wherever the model's weights live.
    inputs = tokenizer(input_prompt, return_tensors="pt", truncation=True).to(model.device)
    # Unpack the encoding so attention_mask is passed along with input_ids.
    output = model.generate(**inputs)
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # The decoded text still contains the prompt; keep what follows the answer marker.
    answer_prefix = "Lets think step by step: "
    if answer_prefix in output_text:
        cleaned_output = output_text.split(answer_prefix)[-1].strip()
    else:
        cleaned_output = output_text.strip()

    return [cleaned_output]

In [None]:
input_texts = []
# NOTE(review): with the resets below commented out, the GSM8K loops that follow
# append to whatever outputs_7b / outputs_tiny / outputs_13b already hold in the
# kernel (e.g. results loaded for an earlier dataset) — confirm this is intended.
# outputs_7b = []
# outputs_tiny = []
# outputs_13b = []

In [None]:
# NOTE(review): the loop starts at 948, which looks like the resume point of an
# interrupted run; results are appended to whatever `outputs_7b` already holds.
for current_idx in range(948, 1000):
    # The helper's third parameter is the dataset; the row is selected by index.
    output_7b = generate_output(llama7b, llama7b_tokenizer, gsm8k_dataset, current_idx)

    outputs_7b.append(output_7b)

    print(f"Llama-7b | CURRENT IDX: {current_idx} | Length: {len(outputs_7b)}")
    # with open('input_output_pairs_gsm8k_7b', 'wb') as f:
    #     pickle.dump(outputs_7b, f)

In [None]:
# NOTE(review): the loop starts at 957, which looks like the resume point of an
# interrupted run; results are appended to whatever `outputs_tiny` already holds.
for current_idx in range(957, 1000):
    # The helper's third parameter is the dataset; the row is selected by index.
    output_tiny = generate_output(tinyllama, tinyllama_tokenizer, gsm8k_dataset, current_idx)

    outputs_tiny.append(output_tiny)

    print(f"TinyLlama | CURRENT IDX: {current_idx} | Length: {len(outputs_tiny)}")
    # with open('input_output_pairs_gsm8k_tiny', 'wb') as f:
    #     pickle.dump(outputs_tiny, f)

In [None]:
with open('input_output_pairs_gsm8k_tiny', 'rb') as f:
    outputs_tiny = pickle.load(f)

print(len(outputs_tiny))
print(outputs_tiny[999])

In [None]:
# NOTE(review): the loop starts at 899, which looks like the resume point of an
# interrupted run; results are appended to whatever `outputs_13b` already holds.
for current_idx in range(899, 1000):
    # The helper's third parameter is the dataset; the row is selected by index.
    output_13b = generate_output(llama13b, llama13b_tokenizer, gsm8k_dataset, current_idx)

    outputs_13b.append(output_13b)

    print(f"Llama13b | CURRENT IDX: {current_idx} | Length: {len(outputs_13b)}")
    # with open('input_output_pairs_gsm8k_13b', 'wb') as f:
    #     pickle.dump(outputs_13b, f)

In [None]:
with open('input_output_pairs_gsm8k_13b', 'rb') as f:
    outputs_13b = pickle.load(f)

print(len(outputs_13b))
print(outputs_13b[831])

In [None]:
print(len(outputs_7b))
print(len(outputs_tiny))
print(len(outputs_13b))

Replace the newline characters (`\n`) with spaces in the three output lists

In [None]:
def _flatten_newlines(output_lists):
    """Return a copy of a list of string lists with every newline turned into a space."""
    flattened = []
    for output_list in output_lists:
        flattened.append([' '.join(text.split('\n')) for text in output_list])
    return flattened


# Apply the same cleanup to the results of all three models.
outputs_7b = _flatten_newlines(outputs_7b)
outputs_tiny = _flatten_newlines(outputs_tiny)
outputs_13b = _flatten_newlines(outputs_13b)

# with open('input_output_pairs_gsm8k_7b', 'wb') as f:
#     pickle.dump(outputs_7b, f)
# with open('input_output_pairs_gsm8k_tiny', 'wb') as f:
#     pickle.dump(outputs_tiny, f)
# with open('input_output_pairs_gsm8k_13b', 'wb') as f:
#     pickle.dump(outputs_13b, f)

In [None]:
input_output_pairs = []

In [None]:
# Pair every GSM8K question with the three model outputs that share its index;
# the number of records is driven by the length of outputs_7b.
for idx, answer_7b in enumerate(outputs_7b):
    input_output_pairs.append({
        'input': gsm8k_dataset[idx]["question"],
        'output_7b': answer_7b,
        'output_tiny': outputs_tiny[idx],
        'output_13b': outputs_13b[idx],
    })

# with open('input_output_pairs_gsm8k.pkl', 'wb') as f:
#     pickle.dump(input_output_pairs, f)

In [None]:
len(input_output_pairs)