In [1]:
%pip install trl accelerate bitsandbytes peft einops langchain wandb -qqq 

Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch
import json
import re
import random
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from peft import prepare_model_for_kbit_training
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments)
from trl import SFTTrainer
import copy
import torch
import torch.nn as nn
from torch.utils.data import dataloader, Dataset

import pandas as pd
from datasets import Dataset as HFDataset
from langchain.prompts import PromptTemplate
import transformers
from datetime import datetime
import numpy as np

from docarray import BaseDoc, DocList
from docarray.typing import NdArray
from docarray.index import InMemoryExactNNIndex



In [2]:
class args:
    model_name = 'mistralai/Mistral-7B-Instruct-v0.2' ### Model Dependent
    quantisation_4_bit = True
    quantisation_8_bit = False
    
    batch_size = 1
    grad_acc_steps = 8
    device = 'cuda'
    
    # tokenizer.pad_token = tokenizer.eos_token # Mostly
    # tokenizer.padding_side = 'right' # Model dependent again


In [3]:
compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    args.model_name,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        # quantization_config=bnb_config if args.quantisation_4_bit else None,# 4-bit quantisation
        # load_in_8bit = True if args.quantisation_8_bit else None, # 8-bit quantisation
        device_map={"": 0}, # Single GPU
        trust_remote_code=True
)

model

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRM

# Data Part

In [4]:
class Tweet(BaseDoc):
    id: int
    content: str
    likes: int
    date: str
    username: str
    media: str
    content_embeds: NdArray
    image_embeds: NdArray
    inferred_company: str

train_list = DocList[Tweet].pull('file://train_list')
tweet_list = DocList[Tweet].pull('file://tweet_list')



In [5]:

doc_index = InMemoryExactNNIndex[Tweet]()
doc_index.index(tweet_list)

train_index = InMemoryExactNNIndex[Tweet]()
train_index.index(train_list)

def get_rel_docs(query):
    retrieved_docs, scores = doc_index.find(query, search_field='image_embeds', limit=3)
    return retrieved_docs


In [6]:
train = train_list.to_dataframe()

In [39]:
x = """helo\nhayii"""

print(x)

helo
hayii


In [7]:

class DataGen:

    prompt = PromptTemplate.from_template("""<s>[INST] 
Using the given tweets as reference construct a tweet which has analogical similarity to those, conditioned on the fact that the post is a {views} post.

Reference tweets: 
{ref}

The tweet is written by the user {username} belong to {company}.

New Tweet: {predict} </s>""")

    def __init__(self, data):
        self.data = HFDataset.from_pandas(data)
        self.tokenizer = tokenizer

    def classify_likes(self, likes):
        if likes > 10000:
            return "Viral"
        elif likes > 1000:
            return "High Engagement"
        elif likes > 100:
            return "Moderate Engagement"
        else:
            return "Low Engagement"

    def get_ref_tweets(self, entry):
        docs = get_rel_docs(np.array(entry['image_embeds']))
        contents = docs.content
        likes = docs.likes
        users = docs.username
        companies = docs.inferred_company

        ref = ""

        for content, like, user, company in zip(contents, likes, users, companies):
            ref+=f"""Tweet: ```{content}```\nUsername:{user}\nCompany:{company}\nLikes:{self.classify_likes(like)}\n\n"""
            
        return ref
        
    def generate_and_tokenize_prompt(self, entry):
        
        ref_tweets = self.get_ref_tweets(entry)
        prompt = self.prompt.format(views=self.classify_likes(entry['likes']), ref=ref_tweets, username=entry['username'], company=entry['inferred_company'], predict=entry['content'])
        return tokenizer(prompt, return_tensors="pt")

    def get_formatted_dataset(self):
        return self.data.map(self.generate_and_tokenize_prompt)
        
      

In [8]:
data = DataGen(train)

In [9]:
dataset = data.get_formatted_dataset()



Map:   0%|          | 0/9618 [00:00<?, ? examples/s]

In [4]:
import joblib
train_dataset = joblib.load("dataset.joblib")
# joblib.dump(dataset, "dataset.joblib")

In [46]:
# data = dict({"prompt":["""
#     <s>[INST] 
# Using the given tweets as reference construct a tweet which has analogical similarity to those, conditioned on the fact that the post is extremely viral.

# Reference tweets: 

# Tweet: ```What a great day to BE part of the #BTSARMY. Shop the deluxe version of <mention>'s new album: <hyperlink> <hyperlink>```
# Username: Target
# Company: Target

# Tweet: ```Your grand ideas will never go off track. ðŸ’¡ #GalaxyNote20 #GalaxyxBTS <mention> ðŸ‘” Learn more: <hyperlink> <hyperlink>```
# Username: Samsung
# Company: Samsung


# The tweet is written by the user spotify belong to spotify.

# New Tweet: ```Happy <mention> release day #ARMY! 
# #LoveYourselfAnswer is here ðŸ’œ 
# <hyperlink> <hyperlink>```
# """ for _ in range(10)]})


# max_length = 512

# def transform(x):
#     result = tokenizer(
#         x["prompt"],
#         truncation=True,
#         max_length=max_length,
#         padding="max_length",
#     )
#     result["labels"] = result["input_ids"].copy()

#     return result

    
# train_dataset=HFDataset.from_dict(data)
# train_dataset = train_dataset.map(transform)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'content', 'likes', 'date', 'username', 'media', 'content_embeds', 'image_embeds', 'inferred_company', 'input_ids', 'attention_mask'],
    num_rows: 9618
})

In [5]:
def convert_to_tensors(entry):

    entry['input_ids'] = entry['input_ids'].squeeze()
    entry['attention_mask'] =entry['attention_mask'].squeeze()
    return entry

train_dataset2=train_dataset.with_format("torch")
train_dataset2 = train_dataset2.map(convert_to_tensors)

Map:   0%|          | 0/9618 [00:00<?, ? examples/s]

In [32]:
train_dataset2['input_ids'][0].shape

torch.Size([356])

In [20]:
# trainer = SFTTrainer(
#     model=model,
#     train_dataset=train_dataset,
#     eval_dataset = test_dataset,
#     peft_config=peft_config,
#     dataset_text_field="text",
#     max_seq_length=1024, # Adjust accordingly
#     tokenizer=tokenizer,
#     args=training_arguments,
#     packing=True,
# )

# for name, module in trainer.model.named_modules():
#     if "norm" in name:
#         module = module.to(torch.float32)

# trainer.train()

# model.save_pretrained("output_dir") # saves lora again

In [6]:

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [7]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [8]:

config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model.config.use_cache = False
model = get_peft_model(model, config)
print_trainable_parameters(model)


trainable params: 85041152 || all params: 7326773248 || trainable%: 1.1606903765339511


In [9]:
model.cuda()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear(
                (base_layer): Linear(in_featu

In [10]:

project = "journal-finetune"
base_model_name = "mistral"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name

# training_args = TrainingArguments(
#         output_dir=output_dir,
#         warmup_steps=1,
#         per_device_train_batch_size=2,
#         gradient_accumulation_steps=1,
#         gradient_checkpointing=True,
#         max_steps=500,
#         learning_rate=2.5e-5, # Want a small lr for finetuning
#         bf16=True,
#         optim="paged_adamw_8bit",
#         logging_steps=25,              # When to start reporting loss
#         logging_dir="./logs",        # Directory for storing logs
#         save_strategy="steps",       # Save the model checkpoint every logging step
#         save_steps=25,                # Save checkpoints every 50 steps
#         evaluation_strategy="steps", # Evaluate the model every logging step
#         eval_steps=25,               # Evaluate and save checkpoints every 50 steps
#         do_eval=True,                # Perform evaluation at the end of training
#         report_to="wandb",           # Comment this out if you don't want to use weights & baises
#         run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"          # Name of the W&B run (optional)
#     )

training_arguments = TrainingArguments(
    output_dir="./results_latest",
    per_device_train_batch_size=args.batch_size,
    gradient_accumulation_steps=args.grad_acc_steps,
    optim='paged_adamw_32bit',
    # save_steps=250,
    fp16=True,
    logging_steps=10,
    save_strategy="steps",       # Save the model checkpoint every logging step
    save_steps=25, 
    learning_rate=2e-4,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    report_to = 'wandb'
)



In [14]:
train_dataset2['input_ids'][0].shape

torch.Size([356])

In [None]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset2,
    # eval_dataset=tokenized_val_dataset,
    args=training_arguments,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

In [55]:
prompt = """
    <s>[INST] 
Using the given tweets as reference construct a tweet which has analogical similarity to those, conditioned on the fact that the post is extremely viral.

Reference tweets: 

Tweet: ```What a great day to BE part of the #BTSARMY. Shop the deluxe version of <mention>'s new album: <hyperlink> <hyperlink>```
Username: Target
Company: Target

Tweet: ```Your grand ideas will never go off track. ðŸ’¡ #GalaxyNote20 #GalaxyxBTS <mention> ðŸ‘” Learn more: <hyperlink> <hyperlink>```
Username: Samsung
Company: Samsung


The tweet is written by the user spotify belong to spotify.

New Tweet:"""

model_input = tokenizer(prompt, return_tensors="pt").to("cuda")

In [57]:
model.eval()
with torch.no_grad():
    tokens = model.generate(**model_input, max_new_tokens=100)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [60]:
print(tokenizer.decode(tokens[0]))

<s> 
    <s> [INST] 
Using the given tweets as reference construct a tweet which has analogical similarity to those, conditioned on the fact that the post is extremely viral.

Reference tweets: 

Tweet: ```What a great day to BE part of the #BTSARMY. Shop the deluxe version of <mention>'s new album: <hyperlink> <hyperlink>```
Username: Target
Company: Target

Tweet: ```Your grand ideas will never go off track. ðŸ’¡ #GalaxyNote20 #GalaxyxBTS <mention> ðŸ‘” Learn more: <hyperlink> <hyperlink>```
Username: Samsung
Company: Samsung


The tweet is written by the user spotify belong to spotify.

New Tweet:</s>
```
ðŸ’¡ðŸ’¡ðŸ’¡

<hyperlink> <hyperlink>

<hyperlink> <hyperlink>

<hyperlink> <hyperlink>

<hyperlink> <hyperlink>

<hyperlink> <hyperlink>

<hyperlink> <hyperlink>

<hyperlink> <hyperlink>

<hyperlink> <hyperlink>

<hyper
