# Installing Dependencies

In [None]:
# Authenticate with the Hugging Face Hub (interactive token prompt).
# Needed because the training cells below push the fine-tuned model to the Hub.
!huggingface-cli login # LOGIN FIRST


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.c

In [None]:
! pip install -U -q accelerate transformers einops datasets peft bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━

# Importing Dependencies

In [None]:
import torch
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
import os

# Finetuning

In [None]:
# Base checkpoint: single source of truth for both the tokenizer and the model.
model_name = "microsoft/phi-1_5"

# Load the tokenizer and reuse the EOS token for padding.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Quantization config: 4-bit NF4 weights with double quantization,
# computing in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized base model. `model_name` is reused here (instead of a
# second hard-coded string) so tokenizer and model can never drift apart.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={"": 0},
    trust_remote_code=True,
    quantization_config=bnb_config,
)

# LoRA config. `target_modules` is intentionally left unset; the previously
# tried explicit values are kept below for reference.
config = LoraConfig(
    r=16,
    lora_alpha=16,
    #target_modules=["Wqkv", "out_proj"],
    #target_modules = None,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the base model with the LoRA adapters and report how many
# parameters will actually be trained.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 11,010,048 || all params: 1,429,280,768 || trainable%: 0.7703208667256062


In [None]:
from datasets import load_dataset, concatenate_datasets, Dataset
import torch
import random
import sympy as sp
import sqlite3
import pandas as pd
import sqlite3
import re
import numpy as np

In [None]:
def tokenize(sample):
    """Tokenize the ``"text"`` field of ``sample`` with the notebook-level tokenizer.

    Padding and truncation are both enabled and sequences are capped at
    512 tokens. The tokenizer's output is returned unchanged.
    """
    return tokenizer(
        sample["text"],
        max_length=512,
        truncation=True,
        padding=True,
    )

def format_options(options):
    """Render answer options as a single comma-separated string.

    Args:
        options: An iterable of option strings (e.g. ``["A)21", "B)21.5"]``),
            or an already-formatted string.

    Returns:
        The options joined with ``", "``. A plain string is returned
        unchanged: strings are iterable, so joining one would insert
        ``", "`` between every character (this can happen when the options
        were stored as text, e.g. in a database column).
    """
    if isinstance(options, str):
        return options
    return ', '.join(options)

def load_process_tokenize(num_rows, data_df, teacher=False):
    """Build training prompts from the first ``num_rows`` rows and tokenize them.

    Each row is turned into a ``"text"`` prompt of the form
    ``Instruction: <question> options: <options>\\nOutput: <rationale>``.

    Args:
        num_rows: Number of leading rows of ``data_df`` to use.
        data_df: DataFrame with ``question`` and ``options`` columns plus the
            rationale column selected by ``teacher``.
        teacher: If True the rationale is read from ``LLMs_rationale``,
            otherwise from ``rationale``.

    Returns:
        The tokenized dataset produced by mapping ``tokenize`` over the
        prompts; all original columns are removed.

    Side effects:
        Sets ``tokenizer.pad_token`` to ``tokenizer.eos_token``.
    """
    rationale_column = "LLMs_rationale" if teacher else "rationale"

    # Work on a copy so the caller's DataFrame is not modified.
    data_df = data_df.iloc[:num_rows].copy()

    def build_prompt(row):
        # "\nOutput: " replaces the original "\Output: ", which is an invalid
        # escape sequence and left a literal backslash in every prompt.
        return (
            "Instruction: " + row["question"]
            + " options: " + format_options(row["options"])
            + "\nOutput: " + row[rationale_column]
        )

    data_df["text"] = data_df.apply(build_prompt, axis=1)

    processed_data = Dataset.from_pandas(data_df)
    tokenizer.pad_token = tokenizer.eos_token
    tokenized_data = processed_data.map(
        tokenize,
        batched=True,
        desc="Tokenizing data",
        remove_columns=processed_data.column_names,
    )

    return tokenized_data

def load_db(database_name, table_name, num_rows, seed=42):
    """Read a whole SQLite table, shuffle it reproducibly and keep ``num_rows`` rows.

    Args:
        database_name: Path of the SQLite database file.
        table_name: Name of the table to read. It is interpolated directly
            into the SQL text, so only pass trusted names.
        num_rows: Number of rows to keep after shuffling.
        seed: Random state used for the shuffle.

    Returns:
        A DataFrame with at most ``num_rows`` rows, shuffled with ``seed``
        and re-indexed from 0.

    Raises:
        Whatever ``pd.read_sql_query`` raises when the query fails (for
        example when the table does not exist).
    """
    conn = sqlite3.connect(database_name)
    try:
        query = f"SELECT * FROM {table_name}"
        df_custom = pd.read_sql_query(query, conn)
    finally:
        # Always release the connection, even when the query fails.
        conn.close()

    df_custom_shuffled = df_custom.sample(frac=1, random_state=seed).reset_index(drop=True)
    df_custom_reduced = df_custom_shuffled.head(num_rows)

    return df_custom_reduced

In [None]:
# NOTE(review): `data_df` is only assigned by the AQUA-RAT loading cell further
# down, so on a fresh kernel (Restart & Run All) this preview raises NameError.
# Run that cell first, or move this preview below it.
data_df.head()

Unnamed: 0,question,options,rationale,correct
0,"Two friends plan to walk along a 43-km trail, ...","[A)21, B)21.5, C)22, D)22.5, E)23]","If Q complete x kilometers, then P completes 1...",E
1,"In the coordinate plane, points (x, 1) and (5,...","[A)4 and 1, B)1 and 5, C)5 and 1, D)3 and 5, E...",Line k passes through the origin and has slope...,C
2,"For all numbers p and q, the operation @ is de...","[A)II, B)I and II, C)I and III, D)II and III, ...",p@q = p^2 - pq=p(p-q).... so p@q will be zero ...,B
3,Carl is facing very difficult financial times ...,"[A)$1600, B)$2000, C)$2150, D)$2500, E)$12000]","Usually, you are given the annual rate of inte...",A
4,The speed at which a man can row a boat in sti...,"[A)18 seconds, B)27 seconds, C)26 seconds, D)1...",Speed of the boat downstream = 25 +11\n= 36 km...,E


In [None]:
# AQUA RAT TRAINING DATASET
# Number of leading training rows that are turned into prompts and tokenized.
NUM_TRAIN_ROWS = 2000

data = load_dataset("aqua_rat", split="train")
data_df = data.to_pandas()
tokenized_data = load_process_tokenize(num_rows=NUM_TRAIN_ROWS, data_df=data_df)

# Reminder: do all runs with 2 epochs.

Downloading readme:   0%|          | 0.00/5.89k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/25.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/76.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/97467 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/254 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/254 [00:00<?, ? examples/s]

Tokenizing data:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
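# TEACHER LLM TRAINING DATASET (alternative to the AQUA-RAT cell above): upload the teacher dataset to Colab first, then uncomment the two lines below. Doing so overwrites `tokenized_data`.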
#teacher_df = load_db("/content/teacher_llm_dataset.db", "LLM_results", 2000, seed=42)
#tokenized_data = load_process_tokenize(2000, teacher_df, teacher=True)

Tokenizing data:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# SET MODEL NAME HERE
# Used as the Trainer output directory.
finetuned_model_name = 'phi-1_5-finetuned-aqua-rat-2k'

training_arguments = TrainingArguments(
    output_dir=finetuned_model_name,
    # Schedule
    num_train_epochs=2,  # originally 1
    #max_steps=1000,
    # Batching
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    # Optimisation
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    # Logging / checkpointing
    logging_steps=100,
    save_strategy="epoch",
    push_to_hub=True,
)

In [None]:
# Language-modeling collator with masked-LM disabled (mlm=False).
lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_data,
    data_collator=lm_collator,
)

# Fine-tune, then upload the result to the Hugging Face Hub.
trainer.train()
trainer.push_to_hub()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
100,1.874
200,1.7883
300,1.712
400,1.6961
500,1.683
600,1.5735
700,1.5359
800,1.552
900,1.5243
1000,1.5154


Checkpoint destination directory phi-1_5-finetuned-aqua-rat-2k/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory phi-1_5-finetuned-aqua-rat-2k/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


In [None]:
from IPython.display import HTML, display

def disconnect_runtime():
    """Display a script that calls ``google.colab.kernel.disconnect()``."""
    disconnect_script = HTML('<script>google.colab.kernel.disconnect()</script>')
    display(disconnect_script)


# Disconnect the Colab runtime once everything above has finished.
disconnect_runtime()