In [1]:
import torch
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer
import json
import mlflow
import os

In [2]:
# Load the LMSYS-Chat-1M conversations from the Hugging Face Hub as a DatasetDict.
dataset = load_dataset('lmsys/lmsys-chat-1m')

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['conversation_id', 'model', 'conversation', 'turn', 'language', 'openai_moderation', 'redacted'],
        num_rows: 1000000
    })
})

In [10]:
dataset['train'][1245]

{'conversation_id': '92f7aa082a94463a99f41d4f9dcf5a37',
 'model': 'koala-13b',
 'conversation': [{'content': 'hello how are you', 'role': 'user'},
  {'content': "Hello! I am an AI language model, so I don't have feelings, but I'm here to assist you with any questions or tasks you might have. How can I help you today?",
   'role': 'assistant'},
  {'content': 'ok, thank you', 'role': 'user'},
  {'content': "u're welcome! Is there anything else I can assist you with?",
   'role': 'assistant'},
  {'content': 'why do you use u but not you?', 'role': 'user'},
  {'content': 'the contraction "u" as a way to simplify my language and make it easier for you to understand. It\'s a common convention used in informal writing and texting.',
   'role': 'assistant'},
  {'content': 'you need to be formal when you talk with me', 'role': 'user'},
  {'content': 'ize if my previous response was not formal enough. I will make sure to be more formal in my future interactions with you.',
   'role': 'assistant'

In [5]:
# Keep only the conversations whose `language` field is 'English';
# every other language is dropped. Runs on 4 worker processes.
dataset = dataset.filter(
    lambda row: row['language'] == 'English',
    num_proc=4,
)

Filter (num_proc=4):   0%|          | 0/1000000 [00:00<?, ? examples/s]

In [6]:
# Names of the raw columns (used later to strip them after tokenization).
# `column_names` reads the schema directly, so no row has to be materialized
# and this also works if the filtered split happens to be empty.
dict_cols = dataset['train'].column_names

In [7]:
dict_cols

['conversation_id',
 'model',
 'conversation',
 'turn',
 'language',
 'openai_moderation',
 'redacted']

## Format conversations according to ChatML

In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import hf_olmo

In [9]:
# Hub checkpoint id shared by the tokenizer and the model loaded below.
model_ckpt = "allenai/OLMo-1B"

In [10]:
# Tokenizer matching `model_ckpt`; it is extended with ChatML markers further down.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [11]:
print(tokenizer.chat_template)

None


In [12]:
# Show the fallback template used when `tokenizer.chat_template` is None.
# NOTE(review): the warning printed below asks for `tokenizer.chat_template`
# to be set explicitly instead of relying on this fallback.
print(tokenizer.default_chat_template)


No chat template is defined for this tokenizer - using a default chat template that implements the ChatML format (without BOS/EOS tokens!). If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.



{% for message in messages %}{{'<|im_start|>' + message['role'] + '
' + message['content'] + '<|im_end|>' + '
'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
' }}{% endif %}


In [13]:
# Preview: render the first conversation as plain text (no tokenization,
# no trailing assistant prompt) to check the ChatML layout.
tokenizer.apply_chat_template(dataset['train'][0]['conversation'], tokenize=False, add_generation_prompt=False)

"<|im_start|>user\nhow can identity protection services help protect me against identity theft<|im_end|>\n<|im_start|>assistant\nIdentity protection services can help protect you against identity theft in several ways:\n\n1. Monitoring: Many identity protection services monitor your credit reports, public records, and other sources for signs of identity theft. If they detect any suspicious activity, they will alert you so you can take action.\n2. Credit freeze: Some identity protection services can help you freeze your credit, which makes it more difficult for thieves to open new accounts in your name.\n3. Identity theft insurance: Some identity protection services offer insurance that can help you recover financially if you become a victim of identity theft.\n4. Assistance: Many identity protection services offer assistance if you become a victim of identity theft. They can help you file a police report, contact credit bureaus, and other steps to help you restore your identity.\n\nOve

## Adding the ChatML special tokens to the OLMo tokenizer

In [14]:
# ChatML turn delimiters emitted by the chat template; registered as special tokens below.
special_tokens = ["<|im_start|>", "<|im_end|>"]

In [15]:
# The tokenizer ships without a chat template, so `apply_chat_template` would
# silently fall back to a library default (and warn on every call). Pin the
# same ChatML template explicitly so formatting does not depend on the
# installed transformers version and travels with the tokenizer when saved.
if tokenizer.chat_template is None:
    tokenizer.chat_template = (
        "{% for message in messages %}"
        "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
        "{% endfor %}"
        "{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
    )

# Register the ChatML delimiters as special tokens so each is tokenized as a
# single unit; the return value (shown as cell output) is the number of tokens added.
tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})

2

In [16]:
def format_chat(ex):
    """Render one conversation as ChatML text and tokenize it to a fixed length.

    Args:
        ex: A dataset row; only its 'conversation' entry (a list of
            role/content messages) is read.

    Returns:
        The tokenizer output for the rendered chat followed by the EOS token,
        padded or truncated to exactly 1024 tokens.

    Notes:
        No extra special tokens are inserted by the tokenizer call itself; the
        EOS token is appended to the text by hand.
        NOTE(review): for chats longer than 1024 tokens truncation presumably
        cuts the tail, including the appended EOS - confirm this is acceptable.
    """
    chat_text = tokenizer.apply_chat_template(
        ex['conversation'],
        tokenize=False,
        add_generation_prompt=False,
    )
    chat_text += tokenizer.eos_token

    return tokenizer(
        chat_text,
        truncation=True,
        max_length=1024,
        padding="max_length",
        add_special_tokens=False,
    )

In [17]:
# Tokenize every conversation and drop the raw columns in the same pass.
# Passing `remove_columns` to map() avoids first writing the original columns
# into the mapped dataset and then stripping them in a second step; the
# resulting features are the tokenizer outputs only.
lmsys_tokenized = dataset.map(
    format_chat,
    remove_columns=dict_cols,
    num_proc=16,
)

Map (num_proc=16):   0%|          | 0/777453 [00:00<?, ? examples/s]

In [18]:
lmsys_tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 777453
    })
})

## Loading the model

In [42]:
# Load the base checkpoint in bfloat16 and let accelerate place it on the
# available devices. `trust_remote_code=True` executes model code shipped with
# the checkpoint, so only use it with checkpoints you trust.
model = AutoModelForCausalLM.from_pretrained(
    model_ckpt,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
)

# The tokenizer was extended with the ChatML markers after it was loaded. If
# the embedding matrix has no spare rows for the new ids, grow it; otherwise
# the new token ids would index past the end of the embedding table.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Some weights of OLMoForCausalLM were not initialized from the model checkpoint at allenai/OLMo-1B and are newly initialized: ['model.transformer.ff_out.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Causal-LM collator: `mlm=False` disables masked-language-model masking.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [20]:
from transformers import set_seed

# Fix the global RNG seed before splitting so the subsets are repeatable.
set_seed(123)

# Carve fixed-size train / test subsets out of the tokenized corpus
# (60,000 training rows, 10,000 evaluation rows).
olmo_tokenized_split = lmsys_tokenized["train"].train_test_split(
    test_size=10000,
    train_size=60000,
)

In [22]:
# `!export ...` runs in a throw-away subshell, so the variable never reached
# the kernel process. Set it on os.environ instead, and use the experiment
# this notebook actually creates rather than the copy-pasted
# "fraud-detection" name.
os.environ["MLFLOW_EXPERIMENT_NAME"] = "olmo1b_chat"



In [23]:
# One-off setup: create the `olmo1b_chat` experiment through the MLflow CLI.
# NOTE(review): re-running this cell presumably fails once the experiment
# exists, and the CLI may target a different tracking store than the
# `mlflow_tracking_path` configured later in this notebook - confirm.
!mlflow experiments create --experiment-name olmo1b_chat

Created experiment 'olmo1b_chat' with id 701987021258002861


In [None]:
#Created experiment 'olmo1b_chat' with id 701987021258002861

In [29]:
# Directory used both as the MLflow tracking location and as the parent of the
# Trainer output/log folders.
# NOTE(review): the folder name mentions "orcaslim" although this notebook
# trains on lmsys-chat-1m - confirm the name is intended.
mlflow_tracking_path = os.path.join('../mlflow_results', 'olmo_orcaslim_instruct')

In [32]:
# Create the tracking directory together with any missing parents
# (`os.mkdir` fails when `../mlflow_results` does not exist yet).
# `exist_ok=True` makes the cell safe to re-run and avoids the
# check-then-create race of the explicit existence test.
os.makedirs(mlflow_tracking_path, exist_ok=True)

In [33]:
# Trainer artifacts live next to the MLflow data:
# OUTPUT_DIR is passed as `output_dir`, LOG_DIR as `logging_dir`.
OUTPUT_DIR = os.path.join(mlflow_tracking_path, 'output')
LOG_DIR = os.path.join(mlflow_tracking_path, "logs")

In [39]:
training_args = TrainingArguments(
    # --- where things are written ---
    output_dir=OUTPUT_DIR,
    logging_dir=LOG_DIR,
    # --- schedule ---
    num_train_epochs=5,
    learning_rate=1e-5,
    lr_scheduler_type="linear",
    warmup_steps=1,
    weight_decay=0.01,
    # --- batching: 32 per device x 8 accumulation steps per optimizer update ---
    per_device_train_batch_size=32,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    auto_find_batch_size=True,
    # --- precision / memory ---
    bf16=True,
    gradient_checkpointing=False,
    # --- logging, evaluation, checkpointing cadence ---
    logging_steps=5,  # Log every 5 steps
    evaluation_strategy="epoch",
    save_steps=1000,
)

In [43]:
# Wire the model, hyper-parameters, data splits and collator into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=olmo_tokenized_split["train"],
    eval_dataset=olmo_tokenized_split["test"],
)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [44]:
# Id reported by `mlflow experiments create` for the `olmo1b_chat` experiment.
# It is an experiment *id*, not an experiment name.
experiment_id = "701987021258002861"

In [None]:
# Point MLflow at the local tracking directory for this project.
mlflow.set_tracking_uri(mlflow_tracking_path)

# The first positional parameter of `set_experiment` is the experiment *name*.
# Passing the numeric id there created/selected an experiment literally named
# "701987021258002861". Select the experiment by its real name instead;
# MLflow creates it in this tracking store if it does not exist yet.
mlflow.set_experiment(experiment_name="olmo1b_chat")

with mlflow.start_run(log_system_metrics=True):
    # Record the hyper-parameters up front so they are kept even if the
    # (long) training run is interrupted or crashes.
    mlflow.log_params(training_args.to_dict())
    trainer.evaluate()  # baseline evaluation before any fine-tuning
    trainer.train()
    