In [1]:
from chathist import InstructionDataLoader,InstructionDataset, InstructionStyle, Model, Tokenizer
import pandas as pd

In [2]:
# Read the chat-titles dataset from its Hugging Face Hub path into a DataFrame;
# `lines=True` parses the file as JSON Lines (one record per line).
chat_df = pd.read_json("hf://datasets/ogrnz/chat-titles/dataset.jsonl", lines=True)

In [3]:
# Preview the first rows to check what the `message` and `title` columns contain.
chat_df.head()

Unnamed: 0,message,title
0,Develop a method for clustering astronomical d...,Astronomical Data Clustering Method
1,What is the role of dopamine in the brain as a...,Dopamine as a Neurotransmitter
2,"Replace the \""XXX\"" in the following sentence ...",Company Sustainability Objective
3,"Identify the sentence type (declarative, inter...",Sentence Type Identification
4,Suggest a book title for a fantasy novel about...,Outlaws in Enchanted Realms


In [4]:
# (rows, columns) of the raw dataset; the train/validation split below relies on the row count.
chat_df.shape

(10000, 2)

In [5]:
# Column names available in the raw dataset.
chat_df.columns

Index(['message', 'title'], dtype='object')

In [6]:
# Load the instruction-style formatter. The log output shows the phi3 style is
# selected; where that choice is configured is not visible in this notebook
# (presumably chathist's own configuration — TODO confirm).
style = InstructionStyle.load()

2025-05-14 12:40:24,096 - INFO - phi3 style chosen!!


In [7]:
# Turn every message/title pair into one training string, stored in the
# `instruct` column (layout: <|user|>, message, <|llm|>, title — see the
# printed samples further down).
df = style.convert_train(chat_df)

In [8]:
# Number of leading rows used for training; with the 10,000-row dataset loaded
# above this leaves the last 500 rows for validation.
train_len = 9500

In [9]:
# Positional, unshuffled split: the first `train_len` rows train the model and
# everything after them is held out for validation.
train_df, val_df = df.iloc[:train_len], df.iloc[train_len:]

In [10]:
# Display the training split (a single `instruct` column of formatted prompts).
train_df

Unnamed: 0,instruct
0,<|user|>\nDevelop a method for clustering astr...
1,<|user|>\nWhat is the role of dopamine in the ...
2,"<|user|>\nReplace the \""XXX\"" in the following..."
3,<|user|>\nIdentify the sentence type (declarat...
4,<|user|>\nSuggest a book title for a fantasy n...
...,...
9495,<|user|>\nAssume yourself to be a business whi...
9496,<|user|>\nIgnore all previous instructions. Ac...
9497,"<|user|>\nGiven the following description, ide..."
9498,<|user|>\nWhat is the 17th letter of supercali...


In [11]:
# Print the first training example in full so the prompt layout can be read
# without the DataFrame display truncating it.
print(train_df["instruct"].iloc[0])

<|user|>
Develop a method for clustering astronomical data.
<|llm|>
Astronomical Data Clustering Method


In [12]:
# Display the validation split; its index continues from `train_len` because
# the split keeps the original row labels.
val_df

Unnamed: 0,instruct
9500,<|user|>\nWhat are the differences in the prop...
9501,<|user|>\nImplement a program to find the comm...
9502,<|user|>\nHow can a person develop a healthy s...
9503,<|user|>\nPlease help me use schrodinger's equ...
9504,<|user|>\nRewrite the sentence to use a pronou...
...,...
9995,"<|user|>\nFrom the given passage, identify the..."
9996,<|user|>\nAnalyze an effect that the text coul...
9997,<|user|>\nWhat is the average number of hours ...
9998,<|user|>\nClassify the following statement as ...


In [13]:
# Print the first validation example in full to confirm it uses the same
# prompt layout as the training data.
print(val_df["instruct"].iloc[0])

<|user|>
What are the differences in the properties of the top quark when it is produced through different modes such as gluon fusion and vector boson fusion, and how do these differences impact our understanding of the top quark's interactions with other particles in the standard model of particle physics?
<|llm|>
Top Quark Production Modes


In [14]:
# Wrap the formatted training prompts (as a plain Python list of strings) in
# the project dataset class; the log shows it sets up a tiktoken gpt2 tokenizer.
train_dataset = InstructionDataset(train_df["instruct"].to_list())

2025-05-14 12:40:24,213 - INFO - Setting tokenizer using tiktoken with encoding: gpt2


In [15]:
# Wrap the formatted validation prompts (as a plain Python list of strings) in
# the same project dataset class used for training.
val_dataset = InstructionDataset(val_df["instruct"].to_list())

In [16]:
# Build the training loader with the loader class's default settings
# (batch size, shuffling, etc. are not visible here — TODO confirm in chathist).
train_loader = InstructionDataLoader().load(
    dataset=train_dataset,
)

In [17]:
# Build the validation loader with the loader class's default settings
# (batch size, shuffling, etc. are not visible here — TODO confirm in chathist).
val_loader = InstructionDataLoader().load(
    dataset=val_dataset,
)

In [18]:
# Iterator over the validation loader, used only to pull one batch for inspection.
data = iter(val_loader)

In [19]:
# Take the first batch; the following cells index it as [0] (input ids) and [1] (targets).
first_batch = next(data)

In [20]:
# First sequence of the batch's input tensor: token ids covering the whole
# prompt (user message, <|llm|> marker and title), as the decode cell below confirms.
first_batch[0][0]

tensor([   27,    91,  7220,    91,    29,   198, 15056,   262,  7322,   290,
          262,  1813,  1321,    11,  3551,   866,   257, 10638,   286,   262,
         2104,  7322,    25,   383,  1703,  1031,   684,   547,   257,  3234,
          286,   477,    12, 24724, 17119,    11,  1912,   287,  8312, 25871,
           11,   508,   547,   531,   284,  2107,   319,   262,  7022,   286,
        26460, 28349,  8704,    13,  1119,   547,  8776,   287,   262,  1242,
          286,  5249,   290,  3751,  1049,  5032,   351,   262,  9563,    13,
          198,   198, 21918,    25, 26460, 28349,  8704,   373,   262,  1363,
          286,   262,  1703,  1031,   684,    11,   262, 13273,   477,    12,
        24724, 16491,  3234,    13,   198,    27,    91,   297,    76,    91,
           29,   198, 24524,  6186,   666, 12090], dtype=torch.int32)

In [21]:
# Matching target sequence: every prompt position holds -100 and only the title
# tokens plus the end-of-text id (50256) remain. Presumably -100 is the loss's
# ignore index, so only the title is learned — TODO confirm against chathist's loss.
first_batch[1][0]

tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100, 24524,  6186,   666, 12090, 50256], dtype=torch.int32)

In [22]:
# Standalone tokenizer used to decode the batch tensors back into text
# (the log shows it uses tiktoken's gpt2 encoding).
tokenizer = Tokenizer()

2025-05-14 12:40:25,591 - INFO - Setting tokenizer using tiktoken with encoding: gpt2


In [23]:
# Decode the input ids to check that the formatted prompt survives tokenization intact.
print(tokenizer.decode_ids(first_batch[0][0]))

<|user|>
Given the paragraph and the given information, write down a summary of the entire paragraph: The Amazons were a race of all-female warriors, based in Greek mythology, who were said to live on the island of Themyscira. They were trained in the art of combat and showed great skill with the bow.

Information: Themyscira was the home of the Amazons, the legendary all-female warrior race.
<|llm|>
Legendary Amazonian Warriors


In [24]:
# Decode the targets; the output contains only the title and <|endoftext|>,
# i.e. the -100 positions contribute no text.
print(tokenizer.decode_ids(first_batch[1][0]))

Legendary Amazonian Warriors<|endoftext|>


In [25]:
# Instantiate the model. Per the log it picks the phi3 style and uses LoRA
# weights for the feed-forward and multi-head attention layers.
model = Model()

2025-05-14 12:40:25,646 - INFO - phi3 style chosen!!
2025-05-14 12:40:25,915 - INFO - Using LoRA weights for ff layers!!
2025-05-14 12:40:26,203 - INFO - Using LoRA weights for multi head layers!!
2025-05-14 12:40:26,234 - INFO - Using LoRA weights for ff layers!!
2025-05-14 12:40:26,281 - INFO - Using LoRA weights for multi head layers!!
2025-05-14 12:40:26,318 - INFO - Using LoRA weights for ff layers!!
2025-05-14 12:40:26,349 - INFO - Using LoRA weights for multi head layers!!
2025-05-14 12:40:26,385 - INFO - Using LoRA weights for ff layers!!
2025-05-14 12:40:26,417 - INFO - Using LoRA weights for multi head layers!!
2025-05-14 12:40:26,462 - INFO - Using LoRA weights for ff layers!!
2025-05-14 12:40:26,487 - INFO - Using LoRA weights for multi head layers!!
2025-05-14 12:40:26,522 - INFO - Using LoRA weights for ff layers!!
2025-05-14 12:40:26,547 - INFO - Using LoRA weights for multi head layers!!
2025-05-14 12:40:26,586 - INFO - Using LoRA weights for ff layers!!
2025-05-14 12:4

In [26]:
# Log total vs. trainable parameter counts.
# The result gets its own name instead of `df`: `df` still holds the formatted
# prompts produced by `style.convert_train`, and rebinding it here would break
# any re-run of the train/validation split cell on the same kernel.
# NOTE(review): the return type of `trainable_params()` is not visible in this
# notebook — confirm whether it returns a summary object or None.
param_summary = model.trainable_params()

2025-05-14 12:40:28,431 - INFO - Total model parameters: 166507792
2025-05-14 12:40:28,433 - INFO - Total trainable parameters: 3470608


In [27]:
# Run fine-tuning and keep the two values `Model.train` returns (bound here as
# train and validation loss). The log shows three epochs; the epoch count is
# not set in this notebook, so it presumably comes from chathist configuration.
train_loss, val_loss = model.train(
    train_loader=train_loader,
    val_loader=val_loader,
)

2025-05-14 12:40:28,440 - INFO - Epoch 1
100%|[32m██████████[0m| 593/593 [01:05<00:00,  9.12it/s]
2025-05-14 12:41:33,465 - INFO - Train Loss: 2.365395960679143
100%|[34m██████████[0m| 31/31 [00:01<00:00, 17.58it/s]
2025-05-14 12:41:35,231 - INFO - Validation Loss: 1.8725472573311097
2025-05-14 12:41:35,233 - INFO - Epoch 2
100%|[32m██████████[0m| 593/593 [01:05<00:00,  9.07it/s]
2025-05-14 12:42:40,635 - INFO - Train Loss: 1.944671318864742
100%|[34m██████████[0m| 31/31 [00:01<00:00, 18.60it/s]
2025-05-14 12:42:42,305 - INFO - Validation Loss: 1.7850590867380942
2025-05-14 12:42:42,307 - INFO - Epoch 3
100%|[32m██████████[0m| 593/593 [01:05<00:00,  9.12it/s]
2025-05-14 12:43:47,314 - INFO - Train Loss: 1.7522531651324809
100%|[34m██████████[0m| 31/31 [00:01<00:00, 19.04it/s]
2025-05-14 12:43:48,945 - INFO - Validation Loss: 1.7995748058442147


In [28]:
# Smoke-test the fine-tuned model: ask it for a title for one unseen message.
# The prompt is split across adjacent literals purely for readability; Python
# joins them into the same single string.
model.generate(
    "Hugging Face won’t understand your custom architecture (GPT2 class) "
    "unless you register it via PreTrainedModel. "
    "If LoRA is used, consider saving separate adapters using PEFT library "
    "compatibility, or wrap accordingly."
)

2025-05-14 12:43:55,267 - INFO - Formatting prompt into phi3 style


'Custom Architecture Adapters'