In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, Trainer, TrainingArguments, logging
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.model_selection import train_test_split
from peft import LoraConfig, get_peft_model
from huggingface_hub import notebook_login
import torch.nn as nn
from tools import *
import torch
import os

In [2]:
device = "cuda"
checkpoint = "mistralai/Mistral-7B-v0.1"

# Start from the checkpoint's stock configuration and override two entries:
# the sliding-window size and the rope-scaling settings (type "yarn", factor 2,
# original_max_position_embeddings 8192).
# NOTE(review): the "yarn" / "finetuned" rope_scaling keys are presumably consumed
# by code brought in through `from tools import *` -- confirm.
config = AutoConfig.from_pretrained(checkpoint)
config.update({
    'sliding_window': 8_192,
    'rope_scaling': {
        "type": "yarn",
        "factor": 2,
        "original_max_position_embeddings": 8192,
        "finetuned": True,
    },
})

# Slow (non-fast) tokenizer from the 'main' revision of the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    use_fast=False,
    revision='main',
)

# Base model built with the modified config; the keyword arguments request
# float16 weights, automatic device placement and FlashAttention-2.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    config=config,
    revision='main',
    torch_dtype=torch.float16,
    device_map='auto',
    low_cpu_mem_usage=True,
    use_flash_attention_2=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Values forwarded to LoraConfig below.
lora_r_default = 8            # passed as `r`
lora_alpha_default = 32       # passed as `lora_alpha`
lora_dropout_default = 0.05   # passed as `lora_dropout`

# Adapters are attached to the query, key and value projections only;
# bias terms are left out (bias="none").
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj"],
    r=lora_r_default,
    lora_alpha=lora_alpha_default,
    lora_dropout=lora_dropout_default,
    bias="none",
)

# enable_input_require_grads() is called on the base model before it is wrapped.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this cell
# would wrap an already-wrapped model -- reload the base model first.
model.enable_input_require_grads()
model = get_peft_model(model, lora_config)

In [17]:
# Checkpoint file handed to load_weights() further down in this cell.
path = './model_weights/Mistral-7B-v0.1-context_extension-stage1/checkpoint_400.pt'

def load_weights(model, path):
    """Load saved parameter tensors from a checkpoint file into ``model``.

    The file at ``path`` is read with ``torch.load`` and must contain a
    ``'model_state_dict'`` mapping of parameter name -> tensor. Every entry
    whose name equals one of ``model.named_parameters()`` replaces that
    parameter's data. Entries without a matching parameter are skipped, so a
    checkpoint holding only a subset of the weights can be loaded.

    The loaded tensor keeps the dtype stored in the checkpoint but is moved to
    the device the matching parameter already lives on.

    Args:
        model: ``torch.nn.Module`` whose parameters are overwritten in place.
        path: Location of the checkpoint file.

    Returns:
        None. A summary containing the number of loaded elements is printed;
        skipped keys and shape mismatches are reported on extra lines.

    Raises:
        KeyError: If the checkpoint has no ``'model_state_dict'`` entry.
    """
    # NOTE(security): depending on the torch version / `weights_only` setting,
    # torch.load can unpickle arbitrary objects -- only load trusted checkpoints.
    # Deserialising onto the CPU avoids depending on the devices the checkpoint
    # was saved from; each tensor is moved to its parameter's device below.
    saved_weights = torch.load(path, map_location='cpu')

    # One name -> parameter index instead of rescanning named_parameters()
    # for every checkpoint key.
    params_by_name = dict(model.named_parameters())

    param_count = 0
    unmatched_keys = []
    shape_mismatches = []
    for key, val in saved_weights['model_state_dict'].items():
        param = params_by_name.get(key)
        if param is None:
            unmatched_keys.append(key)
            continue

        if param.shape != val.shape:
            # Still loaded (as before), but no longer silently.
            shape_mismatches.append(key)

        tensor = val.data
        if param.device.type != 'meta':
            # Moving to a 'meta' device would discard the values, so only
            # relocate when the parameter holds real storage.
            tensor = tensor.to(param.device)
        param.data = tensor
        param_count += val.numel()

    if unmatched_keys:
        print(f'Skipped {len(unmatched_keys):,} checkpoint entries with no matching '
              f'model parameter (first: {unmatched_keys[0]!r}).')
    if shape_mismatches:
        print(f'Warning: {len(shape_mismatches):,} loaded entries differ in shape from '
              f'the parameter they replaced (first: {shape_mismatches[0]!r}).')

    print(f'{param_count:,} parameters were loaded successfully.')

# Overwrite the parameters of `model` whose names appear in the checkpoint at `path`.
load_weights(model, path)

In [33]:
# Convert every parameter tensor of the model to float16.
for weight in model.parameters():
    weight.data = weight.data.to(torch.float16)

In [51]:
# Put the model in evaluation mode before sampling from it.
model.eval()

prompt = 'GPT-4 is '
inputs = tokenizer(prompt, return_tensors='pt').to(device)
prompt_length = inputs['input_ids'].shape[1]

# Generation only; gradient tracking is disabled for the forward passes.
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=200)

# Drop the prompt tokens so that only the newly generated continuation is shown.
continuation_ids = outputs[0, prompt_length:]
print(tokenizer.decode(continuation_ids))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


100 times more powerful than GPT-3, and it’s already being used to create AI-generated art, music, and text. But what does this mean for the future of AI? In this blog post, we’ll explore the potential implications of GPT-4 and how it could change the way we interact with technology.

## What is GPT-4?

GPT-4 is a new type of artificial intelligence (AI) that is being developed by OpenAI. It is a successor to GPT-3, which was released in 2018. GPT-4 is designed to be more powerful and more versatile than GPT-3. It is capable of understanding and generating natural language text, as well as images and other types of data.

GPT-4 is still in development, but it is already being used to create AI-generated art, music, and text. In the future,
