In [1]:
from chathist import Tokenizer, InstructionDataLoader,InstructionDataset, InstructionStyle
import pandas as pd

### Trying the same on a chat dataset

In [2]:
# Read the chat-titles data as JSON Lines (lines=True: one JSON record per line) into a DataFrame.
# NOTE(review): the hf:// path is remote, so this cell presumably needs network access and an
# fsspec handler for hf:// (e.g. huggingface_hub) — confirm the environment provides it.
chat_df = pd.read_json("hf://datasets/ogrnz/chat-titles/dataset.jsonl", lines=True)

In [3]:
# Check the column names; the output below lists 'message' and 'title'.
chat_df.columns

Index(['message', 'title'], dtype='object')

In [4]:
# Preview the first rows to see what a message/title pair looks like.
chat_df.head()

Unnamed: 0,message,title
0,Develop a method for clustering astronomical d...,Astronomical Data Clustering Method
1,What is the role of dopamine in the brain as a...,Dopamine as a Neurotransmitter
2,"Replace the \""XXX\"" in the following sentence ...",Company Sustainability Objective
3,"Identify the sentence type (declarative, inter...",Sentence Type Identification
4,Suggest a book title for a fantasy novel about...,Outlaws in Enchanted Realms


### Setting tokenizer


In [5]:
# Build the tokenizer with its defaults; the log line below shows it uses tiktoken's gpt2 encoding.
# It is used later in the notebook only to decode batches back to text.
tokenizer = Tokenizer()

2025-05-14 15:00:40,420 - INFO - Setting tokenizer using tiktoken with encoding: gpt2


### Creating a dataset using phi3

In [6]:
# Load the instruction style with no arguments; the log line below reports that phi3 was chosen.
style = InstructionStyle.load()

2025-05-14 15:00:41,469 - INFO - phi3 style chosen!!


In [7]:
# Convert the message/title frame into the loaded style's training format.
df = style.convert_train(chat_df)

In [8]:
# Preview the result: a single 'instruct' column whose rows start with the <|user|> tag.
df.head()

Unnamed: 0,instruct
0,<|user|>\nDevelop a method for clustering astr...
1,<|user|>\nWhat is the role of dopamine in the ...
2,"<|user|>\nReplace the \""XXX\"" in the following..."
3,<|user|>\nIdentify the sentence type (declarat...
4,<|user|>\nSuggest a book title for a fantasy n...


### Using InstructionDataset

In [9]:
# Hand the formatted prompts to InstructionDataset as a plain Python list of strings.
# NOTE(review): the log line below shows a tokenizer being set up again (gpt2), so the dataset
# appears to create its own tokenizer rather than reuse `tokenizer` — confirm both stay in sync.
dataset = InstructionDataset(df['instruct'].tolist())

2025-05-14 15:00:41,543 - INFO - Setting tokenizer using tiktoken with encoding: gpt2


In [10]:
# Create the loader with default settings; the cells below call loader.load(...) on it.
loader = InstructionDataLoader()

### Using InstructionDataLoader without masking inputs (phi3)

In [11]:
# Build a dataloader over the dataset using only the loader's defaults.
# NOTE(review): the same four cells further down print a different example each time, so
# batches appear to be shuffled without a fixed seed — confirm, and seed if the printed
# outputs are meant to be reproducible.
dataloader = loader.load(dataset=dataset)

In [12]:
# Get an iterator so one batch can be pulled by hand.
data = iter(dataloader)

In [13]:
# Take only the first batch.
first_batch = next(data)

In [14]:
# Element 0 of the batch is decoded below as the full prompt-plus-title text; pick the example at index 2.
sample_input = first_batch[0][2]

Entries equal to -100 are skipped by the tokenizer when decoding, so they do not appear in the printed text.

In [15]:
# Decode the ids back to text to check the prompt layout (<|user|> ... <|llm|> ... title).
# The long run of <|endoftext|> in the output is presumably padding up to the batch length — TODO confirm.
print(tokenizer.decode_ids(sample_input))

<|user|>
Describe the process of creating a digital product in five steps
<|llm|>
Digital Product Creation Process<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>


In [16]:
# Element 1 of the batch is the counterpart of the input; take the same example index (2).
sample_response = first_batch[1][2]

In [17]:
# The decoded text is only the title plus a single <|endoftext|>; the prompt part is absent.
# NOTE(review): this section is titled "without masking", yet the prompt does not show up in
# the decoded target — confirm whether the prompt ids are -100 here (i.e. masking is on by default).
print(tokenizer.decode_ids(sample_response))

Digital Product Creation Process<|endoftext|>


In [18]:
# Rebuild the dataloader with the same call as before and repeat the inspection.
dataloader = loader.load(dataset=dataset)

In [19]:
# Fresh iterator over the new dataloader.
data = iter(dataloader)

In [20]:
# First batch of this run.
first_batch = next(data)

In [21]:
# Input side of the batch, example at index 2 (a different example than the previous run, per the output below).
sample_input = first_batch[0][2]

Entries equal to -100 are skipped by the tokenizer when decoding, so they do not appear in the printed text.

In [22]:
# Show the decoded input for this run; the output has more trailing <|endoftext|> tokens than the previous run.
print(tokenizer.decode_ids(sample_input))

<|user|>
Write an blogpost about motivation people to writing an blogpost inside of the company
<|llm|>
Encouraging Internal Blogging Culture<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><

In [23]:
# Second element of the batch, same example index as sample_input.
sample_response = first_batch[1][2]

In [24]:
# Decodes to the title followed by one <|endoftext|>.
print(tokenizer.decode_ids(sample_response))

Encouraging Internal Blogging Culture<|endoftext|>


### Using InstructionDataLoader with masking inputs (phi3)

In [25]:
# NOTE(review): this section is titled "with masking inputs", but the call is identical to the
# one in the "without masking" section — no masking option is passed here. Confirm how masking
# is meant to be switched on and off.
dataloader = loader.load(dataset=dataset)

In [26]:
# Iterator for manual batch access.
data = iter(dataloader)

In [27]:
# Take the first batch only.
first_batch = next(data)

In [28]:
# Input sequence of the example at index 2.
sample_input = first_batch[0][2]

Entries equal to -100 are skipped by the tokenizer when decoding, so they do not appear in the printed text.

In [29]:
# Decode and print the input ids; the output has the same prompt layout as in the previous section.
print(tokenizer.decode_ids(sample_input))

<|user|>
Create a webpage that displays a contact us form.
<|llm|>
Contact Form Webpage<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|e

In [30]:
# Target side of the same example.
sample_response = first_batch[1][2]

In [31]:
# Decoded target is title + <|endoftext|>, the same form as in the "without masking" section.
print(tokenizer.decode_ids(sample_response))

Contact Form Webpage<|endoftext|>


In [32]:
# Another run of the same default load call.
dataloader = loader.load(dataset=dataset)

In [33]:
# New iterator for this dataloader.
data = iter(dataloader)

In [34]:
# Only the first batch is inspected.
first_batch = next(data)

In [35]:
# Example at index 2 from the batch's first element.
sample_input = first_batch[0][2]

Entries equal to -100 are skipped by the tokenizer when decoding, so they do not appear in the printed text.

In [36]:
# Print the decoded input for visual inspection.
print(tokenizer.decode_ids(sample_input))

<|user|>
Classify the following sentence using the following emotion categories: fear, anger, joy, sadness: I'm so proud of myself.
<|llm|>
Emotion Category Classification<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>


In [37]:
# Matching entry from the batch's second element.
sample_response = first_batch[1][2]

In [38]:
# Print the decoded target for the same example.
print(tokenizer.decode_ids(sample_response))

Emotion Category Classification<|endoftext|>


### Creating a dataset using alpaca

In [39]:
# NOTE(review): this section is titled "alpaca", but load() is called with no arguments exactly
# as in the phi3 section, and the log line below again reports "phi3 style chosen". As written,
# this section does not appear to exercise the alpaca style — confirm how the style is selected.
style = InstructionStyle.load()

2025-05-14 15:00:42,101 - INFO - phi3 style chosen!!


In [40]:
# Re-run the conversion with the freshly loaded style (this overwrites the earlier `df`).
df = style.convert_train(chat_df)

In [41]:
# The preview still shows <|user|>-tagged rows, matching the phi3 output earlier in the notebook.
df.head()

Unnamed: 0,instruct
0,<|user|>\nDevelop a method for clustering astr...
1,<|user|>\nWhat is the role of dopamine in the ...
2,"<|user|>\nReplace the \""XXX\"" in the following..."
3,<|user|>\nIdentify the sentence type (declarat...
4,<|user|>\nSuggest a book title for a fantasy n...


### Using InstructionDataset

In [42]:
# Rebuild the dataset from the re-converted prompts.
dataset = InstructionDataset(df['instruct'].tolist())

In [43]:
# Fresh loader instance with default settings for this section.
loader = InstructionDataLoader()

### Using InstructionDataLoader without masking inputs (alpaca)

In [44]:
# Default load call on the rebuilt dataset.
dataloader = loader.load(dataset=dataset)

In [45]:
# Iterator used to fetch a single batch.
data = iter(dataloader)

In [46]:
# First batch.
first_batch = next(data)

In [47]:
# Input sequence for the example at index 2.
sample_input = first_batch[0][2]

In [48]:
# Decode and print the input; the output below still uses the <|user|>/<|llm|> tags seen in the phi3 section.
print(tokenizer.decode_ids(sample_input))

<|user|>
Replace the verb with one that has a similar meaning, but is in the present perfect: I made a mistake
<|llm|>
Corrected an Error<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|end

In [49]:
# Second element of the batch for the same example index (2).
sample_response = first_batch[1][2]

### Response without masking inputs (alpaca)

Entries equal to -100 are skipped by the tokenizer when decoding, so they do not appear in the printed text.

In [50]:
# Print the decoded target; the output is the title followed by one <|endoftext|>.
print(tokenizer.decode_ids(sample_response))

Corrected an Error<|endoftext|>


### Using InstructionDataLoader without masking inputs (alpaca)

In [51]:
# NOTE(review): this section repeats the heading and the exact code of the previous
# "without masking inputs (alpaca)" section. If it was meant to show the masked variant,
# no masking option is passed here — confirm the intent.
dataloader = loader.load(dataset=dataset)

In [52]:
# Iterator for manual batch access.
data = iter(dataloader)

In [53]:
# Take the first batch.
first_batch = next(data)

In [54]:
# Example at index 2 from the batch's first element.
sample_input = first_batch[0][2]

In [55]:
# Decode the input ids to text for inspection.
print(tokenizer.decode_ids(sample_input))

<|user|>
Change the tone of the following sentence from formal to informal: I regret to inform you that I will not be able to attend the meeting.
<|llm|>
Tone Transformation Request<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>

In [56]:
# Counterpart of sample_input from the batch's second element.
sample_response = first_batch[1][2]

### Response without masking inputs (alpaca)

Entries equal to -100 are skipped by the tokenizer when decoding, so they do not appear in the printed text.

In [57]:
# Print the decoded target for this example.
print(tokenizer.decode_ids(sample_response))

Tone Transformation Request<|endoftext|>
