In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the sentence / command pairs from the CSV uploaded to the Colab workspace.
data = pd.read_csv("/content/dataset.csv")

In [None]:
# Preview the first rows to confirm the column names before using them below.
data.head()

Unnamed: 0,natural_language,terminal_command
0,List all files in the current directory,ls
1,Show me all the files in this directory,ls
2,Can you display the files in the current folder?,ls
3,Please list everything in this directory,ls
4,Show all files in the current directory,ls


In [None]:
# Pull the two text columns out of the frame as plain Python lists:
# train_nl holds the English sentences, train_bash the matching commands.
train_nl, train_bash = (
    data[column].to_list() for column in ("natural_language", "terminal_command")
)

In [None]:
# Spot-check that the first sentence and the first command belong together.
train_nl[0],train_bash[0]

('List all files in the current directory', 'ls')

In [None]:
# Prefix every sentence with a fixed task instruction.
inputs = ["translate to bash : " + sentence for sentence in train_nl]

In [None]:
# Show the first two prefixed prompts.
inputs[:2]

['translate to bash : List all files in the current directory',
 'translate to bash : Show me all the files in this directory']

In [None]:
# The commands are the generation targets (this is an alias of train_bash, not a copy).
targets=train_bash

In [None]:
from transformers import AutoTokenizer

In [None]:
# Load the tokenizer published with the TinyLlama chat checkpoint that is fine-tuned below.
tokenizer=AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from transformers import Trainer,TrainingArguments,AutoModelForCausalLM

In [None]:
import os

# Opt out of Weights & Biases logging before any Trainer object is created.
os.environ.update({"WANDB_DISABLED": "true"})



In [None]:
from datasets import Dataset
from sklearn.model_selection import train_test_split
import torch
from peft import LoraConfig,get_peft_model

In [None]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_data, eval_data = train_test_split(
    data,
    test_size=0.1,
    random_state=42,
)


Converting the pandas splits to Hugging Face `Dataset` objects

In [None]:
# Wrap each pandas split in a Hugging Face Dataset so it can be mapped and fed to the Trainer.
train_dataset, eval_dataset = (
    Dataset.from_pandas(frame) for frame in (train_data, eval_data)
)

Tokenizing the dataset


In [None]:
def preprocess_function(text, tok=None, max_length=128):
  """Turn a batch of (sentence, command) pairs into causal-LM training features.

  A decoder-only model is trained by predicting the next token of ONE
  sequence, so the sentence and the command must live in the same
  ``input_ids`` row. Each example is laid out as

      <bos> sentence-tokens command-tokens <eos> <pad> ... <pad>

  ``labels`` is that same row with every position that is not part of the
  command (or its closing ``<eos>``) replaced by -100, the index the loss
  ignores. The model is therefore only graded on producing the command, and
  never on reproducing the prompt or on predicting padding.

  Rows are padded to a fixed ``max_length`` so that examples can be stacked
  into a batch without a padding-aware collator.

  Args:
    text: Batch mapping with parallel lists under the keys
      "natural_language" and "terminal_command" (what ``Dataset.map`` passes
      when ``batched=True``).
    tok: Tokenizer to use. Defaults to the notebook-level ``tokenizer``.
      It must be callable as ``tok(string, add_special_tokens=False)`` and
      expose ``bos_token_id``, ``eos_token_id`` and ``pad_token_id``.
    max_length: Length of every returned row, including <bos> and <eos>.

  Returns:
    dict with "input_ids", "attention_mask" and "labels"; each value is a
    list with one ``max_length``-long list of ints per example.

  Raises:
    ValueError: if ``max_length`` is smaller than 3.
  """
  if max_length < 3:
    raise ValueError("max_length must leave room for <bos>, one token and <eos>")

  tok = tokenizer if tok is None else tok
  # Padding positions are masked out of attention and loss, so reusing <eos>
  # when the tokenizer defines no pad token is harmless.
  pad_id = tok.pad_token_id if tok.pad_token_id is not None else tok.eos_token_id
  room = max_length - 2  # tokens left once <bos> and <eos> are accounted for

  features = {"input_ids": [], "attention_mask": [], "labels": []}
  for sentence, command in zip(text["natural_language"], text["terminal_command"]):
    # Keep the command intact whenever possible; if the pair is too long,
    # it is the prompt that gets shortened.
    command_ids = tok(command, add_special_tokens=False)["input_ids"][:room]
    prompt_ids = tok(sentence, add_special_tokens=False)["input_ids"][: room - len(command_ids)]

    prompt_part = [tok.bos_token_id] + prompt_ids
    answer_part = command_ids + [tok.eos_token_id]
    n_real = len(prompt_part) + len(answer_part)
    n_pad = max_length - n_real

    features["input_ids"].append(prompt_part + answer_part + [pad_id] * n_pad)
    features["attention_mask"].append([1] * n_real + [0] * n_pad)
    features["labels"].append([-100] * len(prompt_part) + answer_part + [-100] * n_pad)
  return features

In [None]:
# Tokenize both splits batch-wise with the shared preprocessing function
# (training split first, then the evaluation split).
train_dataset, eval_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

Map:   0%|          | 0/577 [00:00<?, ? examples/s]

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

In [None]:
# 4-bit quantization setup; float16 is used as the compute dtype to improve precision

In [None]:
# !pip uninstall -y bitsandbytes
# !pip install -U bitsandbytes


In [None]:
from transformers import BitsAndBytesConfig

# Quantization settings handed to from_pretrained when the base model is loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,        # double quantization reduces memory usage
    bnb_4bit_compute_dtype=torch.float16,  # compute in float16 to improve precision
)


In [None]:
# Load the TinyLlama chat checkpoint with the 4-bit quantization settings;
# device_map="auto" leaves weight placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    device_map="auto",
    quantization_config=bnb_config,
)

In [None]:
# LoRA adapter settings: rank-16 adapters (alpha 32, 5% dropout) on the
# q_proj and v_proj modules only, with no bias terms trained.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    bias="none",
)

In [None]:
# Imported here so this cell carries its own dependency.
from peft import prepare_model_for_kbit_training

# The base model was loaded in 4-bit, so it has to be prepared for k-bit
# training before adapters are attached. Gradient checkpointing is left off
# to keep the memory/compute profile of the original run.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)

# Attach the LoRA adapters; from here on only the adapter weights are trainable.
model = get_peft_model(model, lora_config)

# Sanity check: print how many parameters will actually be updated.
model.print_trainable_parameters()


In [None]:
from accelerate import infer_auto_device_map

In [None]:
# Ask accelerate for a placement plan for `model`, capped at 12 GiB for device 0
# and 2 GiB for "cpu".
# NOTE(review): the returned map is not passed to anything in the visible cells
# (the model was already loaded with device_map="auto"), so this looks like
# leftover code; confirm and remove.
device_map = infer_auto_device_map(model, max_memory = {0: "12GiB", "cpu": "2GiB"})


In [None]:
import torch
# Release cached, currently unused CUDA memory held by PyTorch before training starts.
torch.cuda.empty_cache()

In [None]:
# Training configuration.
#
# With 577 training rows, batch size 1 and 8-step gradient accumulation, one
# epoch is roughly 72 optimizer steps on a single device (about 216 for the
# whole run). Step-based intervals of 500 would therefore never trigger, so
# evaluation and checkpointing are tied to epochs instead.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,   # effective batch size of 8
    num_train_epochs=3,
    fp16=True,
    logging_dir="./results/logs",
    logging_steps=10,
    eval_strategy="epoch",           # run the eval split after every epoch
    save_strategy="epoch",           # write a checkpoint after every epoch
    save_total_limit=2,              # keep only the two newest checkpoints
    report_to="none",                # supported way to switch off W&B and other loggers
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Bundle the LoRA-wrapped model, the training arguments and both tokenized splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Run the fine-tuning loop; the training loss is reported every 10 steps (logging_steps=10).
trainer.train()

Step,Training Loss
10,5.7582
20,3.8178
30,1.9183
40,0.6784
50,0.2158
60,0.1016
70,0.0763
80,0.0848
90,0.0776
100,0.0778


In [None]:

# Write the fine-tuned model files to the Colab workspace.
model.save_pretrained("/content/saved_model")

# Store the tokenizer files next to them; as the last expression of the cell,
# the tuple of written paths is displayed.
tokenizer.save_pretrained("/content/saved_model")

('/content/saved_model/tokenizer_config.json',
 '/content/saved_model/special_tokens_map.json',
 '/content/saved_model/chat_template.jinja',
 '/content/saved_model/tokenizer.model',
 '/content/saved_model/added_tokens.json',
 '/content/saved_model/tokenizer.json')

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load saved model & tokenizer.
# NOTE(review): the zip listing shows this directory holds only LoRA adapter files
# (adapter_config.json / adapter_model.safetensors), so this load presumably relies
# on transformers' PEFT integration to fetch the base checkpoint; confirm.
model_path = "/content/saved_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Encode the prompt. Stray whitespace typed at the prompt (e.g. a trailing tab)
# is dropped, and the tensors are moved to wherever the model lives.
prompt = input("> ").strip()
encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate up to 50 NEW tokens. max_length would also count the prompt tokens,
# leaving long prompts little or no room for an answer.
outputs = model.generate(**encoded, max_new_tokens=50)

# generate() returns prompt + continuation; decode only the continuation so an
# echoed prompt is not mistaken for model output.
prompt_len = encoded["input_ids"].shape[1]
text = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

print(text)


> Please list everything in this directory	
Please list everything in this directory	


### Note: the model only echoes the prompt here; it needs a lot more training before it produces usable commands

In [None]:
# Zip the saved model directory and hand the archive to the browser for download.
!zip -r saved_model.zip /content/saved_model
from google.colab import files
files.download("saved_model.zip")


updating: content/saved_model/ (stored 0%)
updating: content/saved_model/special_tokens_map.json (deflated 79%)
updating: content/saved_model/README.md (deflated 66%)
updating: content/saved_model/tokenizer_config.json (deflated 69%)
updating: content/saved_model/tokenizer.json (deflated 85%)
updating: content/saved_model/tokenizer.model (deflated 55%)
updating: content/saved_model/adapter_config.json (deflated 56%)
updating: content/saved_model/adapter_model.safetensors (deflated 8%)
updating: content/saved_model/chat_template.jinja (deflated 60%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Trigger the download of the same archive again (repeats the call from the previous cell).
files.download("saved_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>