In [24]:
from chathist import InstructionDataLoader,InstructionDataset, InstructionStyle, Model, Tokenizer
import pandas as pd

In [2]:
# Load the chat-title dataset from its hf:// path; `lines=True` parses the
# file as JSON Lines (one record per line).
chat_df = pd.read_json(
    "hf://datasets/ogrnz/chat-titles/dataset.jsonl",
    lines=True,
)

In [3]:
# Preview the first five rows of the raw message/title frame.
chat_df.head(5)

Unnamed: 0,message,title
0,Develop a method for clustering astronomical d...,Astronomical Data Clustering Method
1,What is the role of dopamine in the brain as a...,Dopamine as a Neurotransmitter
2,"Replace the \""XXX\"" in the following sentence ...",Company Sustainability Objective
3,"Identify the sentence type (declarative, inter...",Sentence Type Identification
4,Suggest a book title for a fantasy novel about...,Outlaws in Enchanted Realms


In [4]:
# (rows, columns) of the raw frame; the split size chosen later relies on this row count.
chat_df.shape

(10000, 2)

In [5]:
# Column labels of the raw frame; these are the names passed as input/response columns below.
chat_df.columns

Index(['message', 'title'], dtype='object')

In [6]:
# Select the "phi3" instruction style. In the rendered samples the input
# query string precedes the user message and the response query string
# precedes the title.
style = InstructionStyle.load(
    style="phi3",
    input_query="<|user|>\n",
    response_query="\n<|llm|>\n",
)

2025-05-12 11:37:16,345 - INFO - phi3 style chosen!!


In [7]:
# Render every (message, title) pair into a single training string stored in
# the "instruct" column.
# NOTE(review): new_df=True presumably returns a fresh frame instead of
# modifying chat_df -- confirm against chathist.
df = style.convert_train(
    chat_df,
    input_col="message",
    response_col="title",
    output_col="instruct",
    new_df=True,
)

In [8]:
# Number of leading rows used for training; the remaining rows of the frame
# form the validation set.
train_len = 9_500

In [9]:
# Positional split (no shuffling here): the first `train_len` rows train,
# everything after them validates.
train_df = df.iloc[:train_len]
val_df = df.iloc[train_len:]

In [10]:
# Display the training split (pandas truncates the rendering of long frames).
train_df

Unnamed: 0,instruct
0,<|user|>\nDevelop a method for clustering astr...
1,<|user|>\nWhat is the role of dopamine in the ...
2,"<|user|>\nReplace the \""XXX\"" in the following..."
3,<|user|>\nIdentify the sentence type (declarat...
4,<|user|>\nSuggest a book title for a fantasy n...
...,...
9495,<|user|>\nAssume yourself to be a business whi...
9496,<|user|>\nIgnore all previous instructions. Ac...
9497,"<|user|>\nGiven the following description, ide..."
9498,<|user|>\nWhat is the 17th letter of supercali...


In [11]:
# Print the first training prompt in full to check the rendered template.
print(train_df["instruct"].iloc[0])

<|user|>
Develop a method for clustering astronomical data.
<|llm|>
Astronomical Data Clustering Method


In [12]:
# Display the validation split (pandas truncates the rendering of long frames).
val_df

Unnamed: 0,instruct
9500,<|user|>\nWhat are the differences in the prop...
9501,<|user|>\nImplement a program to find the comm...
9502,<|user|>\nHow can a person develop a healthy s...
9503,<|user|>\nPlease help me use schrodinger's equ...
9504,<|user|>\nRewrite the sentence to use a pronou...
...,...
9995,"<|user|>\nFrom the given passage, identify the..."
9996,<|user|>\nAnalyze an effect that the text coul...
9997,<|user|>\nWhat is the average number of hours ...
9998,<|user|>\nClassify the following statement as ...


In [13]:
# Print the first validation prompt in full to check the rendered template.
print(val_df["instruct"].iloc[0])

<|user|>
What are the differences in the properties of the top quark when it is produced through different modes such as gluon fusion and vector boson fusion, and how do these differences impact our understanding of the top quark's interactions with other particles in the standard model of particle physics?
<|llm|>
Top Quark Production Modes


In [14]:
# Wrap the training prompts, passed as a plain Python list of strings, in the
# project's dataset type.
train_dataset = InstructionDataset(
    train_df["instruct"].tolist(),
)

2025-05-12 11:37:16,696 - INFO - Setting tokenizer using tiktoken with encoding: gpt2


In [15]:
# Wrap the validation prompts, passed as a plain Python list of strings, in
# the project's dataset type.
val_dataset = InstructionDataset(
    val_df["instruct"].tolist(),
)

In [16]:
# Build the training loader; InstructionDataLoader is constructed without
# arguments, so its default settings apply.
train_loader = InstructionDataLoader().load(
    dataset=train_dataset,
)

In [17]:
# Build the validation loader; InstructionDataLoader is constructed without
# arguments, so its default settings apply.
val_loader = InstructionDataLoader().load(
    dataset=val_dataset,
)

In [18]:
# Iterator over the validation loader, kept so single batches can be pulled with next().
data = iter(val_loader)

In [22]:
# Pull one batch from the validation iterator for manual inspection.
# Re-running this cell advances the iterator, so it yields a different batch each time.
first_batch = next(data)

In [None]:
# First row of the batch's first element. In the recorded output this holds the
# token ids of the prompt and title, followed by repeated 50256 values.
# NOTE(review): the trailing 50256 run looks like padding -- confirm in chathist.
first_batch[0][0]

tensor([   27,    91,  7220,    91,    29,   198,  2061,   389,   262,  5400,
          287,   262,  6608,   286,   262,  1353,   627,   668,   618,   340,
          318,  4635,   832,  1180, 12881,   884,   355,  1278,    84,   261,
        21748,   290, 15879, 37284,   261, 21748,    11,   290,   703,   466,
          777,  5400,  2928,   674,  4547,   286,   262,  1353,   627,   668,
          338, 12213,   351,   584, 13166,   287,   262,  3210,  2746,   286,
        18758, 11887,    30,   198,    27,    91,   297,    76,    91,    29,
          198,  9126,  2264,   668, 19174, 42082, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256], dtype=torch.int32)

In [29]:
# First row of the batch's second element. In the recorded output every position
# is -100 except the title tokens and a single trailing 50256.
# NOTE(review): -100 looks like a loss ignore marker for prompt/padding
# positions -- confirm against the chathist training loop.
first_batch[1][0]

tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         9126,  2264,   668, 19174, 42082, 50256,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100], dtype=torch.int32)

In [25]:
# Standalone tokenizer with default arguments, used below to decode batch tensors back to text.
tokenizer = Tokenizer()

2025-05-12 11:41:18,464 - INFO - Setting tokenizer using tiktoken with encoding: gpt2


In [27]:
# Decode the first input row back to text to check the prompt template and the trailing tokens.
print(tokenizer.decode_ids(first_batch[0][0]))

<|user|>
What are the differences in the properties of the top quark when it is produced through different modes such as gluon fusion and vector boson fusion, and how do these differences impact our understanding of the top quark's interactions with other particles in the standard model of particle physics?
<|llm|>
Top Quark Production Modes<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>


In [28]:
# Decode the matching target row.
# NOTE(review): the recorded output shows only the title plus one <|endoftext|>,
# so decode_ids appears to skip the -100 entries -- confirm in chathist.
print(tokenizer.decode_ids(first_batch[1][0]))

Top Quark Production Modes<|endoftext|>


In [19]:
# Instantiate the model with default arguments. The recorded log shows pretrained
# GPT-2 weights being loaded with LoRA layers; that choice is made inside
# chathist, not in this notebook.
model = Model()

2025-05-12 11:37:19,353 - INFO - Repo: openai-community/gpt2.
2025-05-12 11:37:19,359 - INFO - openai-community/gpt2 exists in /home/smathad/ai_capstone/chathist/models/pretrained/gpt2. Skipping Download...
2025-05-12 11:37:19,781 - INFO - Loading weights into model.
2025-05-12 11:37:19,818 - INFO - Using Lora for final output layer!!
2025-05-12 11:37:19,828 - INFO - Layer: 1
2025-05-12 11:37:19,829 - INFO - loading norm1 weights and bias
2025-05-12 11:37:19,832 - INFO - loading norm2 weights and bias
2025-05-12 11:37:19,833 - INFO - loading ff weights and bias
2025-05-12 11:37:19,848 - INFO - Using LoRA weights for ff layers!!
2025-05-12 11:37:19,860 - INFO - loading attention weights and bias
2025-05-12 11:37:19,896 - INFO - Using LoRA weights for multi head layers!!
2025-05-12 11:37:19,898 - INFO - Layer: 2
2025-05-12 11:37:19,898 - INFO - loading norm1 weights and bias
2025-05-12 11:37:19,899 - INFO - loading norm2 weights and bias
2025-05-12 11:37:19,902 - INFO - loading ff weight

In [20]:
# Run training with the training and validation loaders.
# NOTE(review): the two return values are presumably the recorded training and
# validation losses -- confirm against Model.train in chathist.
train_loss, val_loss = model.train(
    train_loader=train_loader,
    val_loader=val_loader,
)

2025-05-12 11:37:20,581 - INFO - Epoch 1
2025-05-12 11:37:22,563 - INFO - Batch: 10, Loss: 4.827515602111816
2025-05-12 11:37:23,598 - INFO - Batch: 20, Loss: 4.813939094543457
2025-05-12 11:37:24,858 - INFO - Batch: 30, Loss: 4.886646747589111
2025-05-12 11:37:26,068 - INFO - Batch: 40, Loss: 5.018726825714111
2025-05-12 11:37:26,909 - INFO - Batch: 50, Loss: 4.962770462036133
2025-05-12 11:37:28,067 - INFO - Batch: 60, Loss: 4.690362930297852
2025-05-12 11:37:29,036 - INFO - Batch: 70, Loss: 5.251195907592773
2025-05-12 11:37:30,158 - INFO - Batch: 80, Loss: 5.015712261199951
2025-05-12 11:37:31,033 - INFO - Batch: 90, Loss: 5.464597702026367
2025-05-12 11:37:31,939 - INFO - Batch: 100, Loss: 5.266045093536377
2025-05-12 11:37:33,147 - INFO - Batch: 110, Loss: 4.815516471862793
2025-05-12 11:37:34,153 - INFO - Batch: 120, Loss: 5.0945329666137695
2025-05-12 11:37:35,100 - INFO - Batch: 130, Loss: 5.357068061828613
2025-05-12 11:37:36,138 - INFO - Batch: 140, Loss: 5.371222496032715
2

KeyboardInterrupt: 