#### Data visualization 

In [None]:
import pandas as pd 
# df = pd.read_csv("small_training.csv") 
df = pd.read_csv("AbstractAItraining.csv") 
df.head() 
file1 = "small_training.csv" 
file2 = "AbstractAItraining.csv" 

: 

#### Important Libraries


In [7]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

###### All needed arguments/ parameters 

In [None]:
# model_name = "NousResearch/llama-2-7b-chat-hf" # use this if you have access to the official LLaMA 2 model "meta-llama/Llama-2-7b-chat-hf", though keep in mind you'll need to pass a Hugging Face key argument
# dataset_name = "small_training.csv"
# new_model = "llama-2-7b-custom"
# lora_r = 64
# lora_alpha = 32
# lora_dropout = 0.05
# use_4bit = True
# bnb_4bit_compute_dtype = "float16"
# bnb_4bit_quant_type = "nf4"
# use_nested_quant = False
# output_dir = "./results"
# num_train_epochs = 1
# fp16 = False
# bf16 = False
# # per_device_train_batch_size = 4
# per_device_eval_batch_size = 4
# gradient_accumulation_steps = 1
# gradient_checkpointing = True
# max_grad_norm = 0.3
# learning_rate = 2e-4
# weight_decay = 0.001
# optim = "paged_adamw_32bit"
# lr_scheduler_type = "constant"
# max_steps = -1
# warmup_ratio = 0.03
# group_by_length = True
# save_steps = 25
# logging_steps = 5
# max_seq_length = None
# packing = False
# device_map = {"": 0}

#### If file dont have text column

In [6]:
# Load datasets from CSV
train_dataset = load_dataset('csv', data_files=file1, split="train")  ### file1 is small one and file2 is bigger one  
valid_dataset = load_dataset('csv', data_files=file1, split="train")

#Preprocess datasets ----> do this only for file1
train_dataset_mapped = train_dataset.map(lambda examples: {'text': [prompt + ' ' + response for prompt, response in zip(examples['prompt'], examples['responses'])]}, batched=True)
valid_dataset_mapped = valid_dataset.map(lambda examples: {'text': [prompt + ' ' + response for prompt, response in zip(examples['prompt'], examples['responses'])]}, batched=True)


#### BNB 

In [9]:
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype= "float16", 
    bnb_4bit_use_double_quant= False,
)

#### Model arguments  

In [10]:
model = AutoModelForCausalLM.from_pretrained(
    "NousResearch/llama-2-7b-chat-hf",  
    # "facebook/galactica-1.3b", 
    quantization_config=bnb_config, 
    load_in_4bit=True, 
    device_map={"": 0} 
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.92s/it]


In [11]:
model.config.use_cache = False ### 
model.config.pretraining_tp = 1 ###

#### Tokenizer

In [12]:
tokenizer = AutoTokenizer.from_pretrained("NousResearch/llama-2-7b-chat-hf",
                                            trust_remote_code=True
                                         )
tokenizer.pad_token = tokenizer.eos_token 
tokenizer.padding_side = "left" 

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


#### PEFT  -----> documentation of peft

In [14]:
peft_config = LoraConfig( 
    lora_alpha=32, 
    lora_dropout=0.05, 
    r=64, 
    bias="none", 
    task_type="CAUSAL_LM", 
)
model = prepare_model_for_kbit_training(model)
#model = PeftModel.from_pretrained(model , adapters_name)
model = get_peft_model(model, peft_config=peft_config)

#### Adapters 

In [None]:
# adapters_name = 'timdettmers/guanaco-7b'
# model.add_adapter(peft_config, 
#                   adapter_name = adapters_name
#                  )

##### Trainable Parameters 

In [15]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
print_trainable_parameters(model) 

#### WandB

In [None]:
# pip  install wandb 
# import wandb 

In [None]:
os.evniron["WANDB_PROJECT"] = "AbstractAI-Galactica" # log to your project 
os.evinron["WANDB_LOG_MODEL"] = "all" # log your models

#### Set training parameters

In [17]:
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs= 4,
    per_device_train_batch_size= 10,
    gradient_accumulation_steps=1,
    optim= "paged_adamw_32bit",
    save_steps= 25,
    logging_steps=5,
    warmup_ratio= 0.03,
    learning_rate=2e-4, 
    weight_decay=  0.001,
    fp16=False,
    bf16=False,
    max_grad_norm= 0.3,
    max_steps= -1,
    group_by_length= True,
    # lr_scheduler_type="constant",
    lr_scheduler_type="cosine",
    # report_to="all",
    report_to="wandb",
    evaluation_strategy="steps",
    eval_steps= 80, 
    push_to_hub = True,  
)


#### Trainer

In [None]:
trainer = SFTTrainer(
    model=model, 
    train_dataset=train_dataset_mapped, # Pass training dataset here   
    eval_dataset=valid_dataset_mapped,  # Pass validation dataset here 
    peft_config=peft_config,
    dataset_text_field="text", 
    max_seq_length=None, 
    tokenizer=tokenizer, 
    args=training_arguments, 
    packing=False, 
) 

In [20]:
trainer.train()
wandb.finish()

Step,Training Loss,Validation Loss
5,2.0262,2.00463
10,1.94,1.898936
15,1.8879,1.830923
20,1.7424,1.784984
25,1.87,1.742993
30,1.7324,1.714349


In [None]:
prompt = f"[INST] Generate abstract for the key points\n1. Chimera Design: The design of Chimera is broken into a core and extensions. The core provides basic services and visualization, while the extensions are responsible for higher level functionality, allowing third-party developers to incorporate new features according to their needs.\n2. Multiscale Extension: The Multiscale extension of Chimera allows users to visualize large-scale molecular assemblies such as viral coats. By providing a scale-based approach, it enhances the understanding of molecular structures and interactions in biological research.\n3. Collaboratory Extension: Offering the ability for researchers based in different locations to share a Chimera session interactively, the Collaboratory extension significantly improves collaboration capacity. Through this shared environment, researchers can conduct simultaneous examinations and share insights in real-time.\n4. Other Extensions: Other extensions such as Multalign Viewer, ViewDock, Movie, and Volume Viewer offer a diverse set of features. They allow the display of multiple sequence alignments, screening of docked ligand orientations, replay of molecular dynamics trajectories, and analysis of volumetric data respectively.\n5. Real-World Usage of Chimera: The abstract also discusses the practical usage of Chimera in real-world situations, pointing out its wide applicability and impact in the field of molecular biology and bioinformatics \n . [/INST]"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=1000)
result = pipe(prompt) 
print(result) 