In [1]:
from chathist import Tokenizer, InstructionDataLoader,InstructionDataset, InstructionStyle
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


### Trying the same on the chat dataset.

In [2]:
# Load the chat-titles dataset from the Hugging Face Hub (`hf://` path).
# The file is JSON Lines, hence lines=True (one record per line).
chat_df = pd.read_json(
    path_or_buf="hf://datasets/ogrnz/chat-titles/dataset.jsonl",
    lines=True,
)

In [3]:
# Show the column labels of the loaded frame.
chat_df.columns

Index(['message', 'title'], dtype='object')

In [4]:
# Preview the first rows of the message/title pairs.
chat_df.head()

Unnamed: 0,message,title
0,Develop a method for clustering astronomical d...,Astronomical Data Clustering Method
1,What is the role of dopamine in the brain as a...,Dopamine as a Neurotransmitter
2,"Replace the \""XXX\"" in the following sentence ...",Company Sustainability Objective
3,"Identify the sentence type (declarative, inter...",Sentence Type Identification
4,Suggest a book title for a fantasy novel about...,Outlaws in Enchanted Realms


### Setting tokenizer


In [5]:
# Tokenizer used further down to decode batch ids back into text.
# The log line it emits reports tiktoken with the gpt2 encoding.
tokenizer = Tokenizer()

2025-05-10 14:34:12,902 - INFO - Setting tokenizer using tiktoken with encoding: gpt2


### Creating a dataset using phi3

In [6]:
# Phi3-style template: the user message is introduced by "<|user|>\n" and the
# response by "\n<|llm|>\n".
style = InstructionStyle.load(
    style='phi3',
    input_query="<|user|>\n",
    response_query="\n<|llm|>\n",
)

2025-05-10 14:34:13,024 - INFO - phi3 style chosen!!


In [7]:
# Format each (message, title) pair with the chosen style and store the
# resulting text in the 'instruct' column.
df = style.convert_train(
    chat_df,
    input_col='message',
    response_col='title',
    output_col='instruct',
    new_df=True,
)

In [8]:
# Preview the formatted phi3 training strings.
df.head()

Unnamed: 0,instruct
0,<|user|>\nDevelop a method for clustering astr...
1,<|user|>\nWhat is the role of dopamine in the ...
2,"<|user|>\nReplace the \""XXX\"" in the following..."
3,<|user|>\nIdentify the sentence type (declarat...
4,<|user|>\nSuggest a book title for a fantasy n...


### Using InstructionDataset

In [9]:
# Wrap the formatted strings (as a plain list) in an InstructionDataset.
# NOTE(review): the log line emitted here suggests the dataset sets up its own
# gpt2 tokenizer, separate from `tokenizer` above — confirm.
dataset = InstructionDataset(df['instruct'].tolist())

2025-05-10 14:34:13,107 - INFO - Setting tokenizer using tiktoken with encoding: gpt2


In [10]:
# Loader object whose .load(...) builds the dataloaders used below.
loader = InstructionDataLoader()

### Using InstructionDataLoader without masking inputs (Phi3)

In [11]:
# Batch size 8 with shuffle and drop_last enabled.
# mask_input=False: the decoded targets below still contain the prompt text.
dataloader = loader.load(
    dataset=dataset,
    batch_size=8,
    shuffle=True,
    drop_last=True,
    mask_input=False,
)

In [12]:
# Create an iterator over the dataloader so a single batch can be pulled.
data = iter(dataloader)

In [13]:
# Pull the first batch.
# NOTE(review): shuffle=True and no seed is visible, so the example shown
# below will presumably differ between runs — confirm.
first_batch = next(data)

In [14]:
# Element 0 of the batch, item at index 2: the input ids (decoded below).
sample_input = first_batch[0][2]

`-100` values are ignored by the tokenizer when decoding.

In [15]:
# Decode the input ids back to text. The repeated trailing <|endoftext|>
# tokens in the output are presumably padding — confirm.
print(tokenizer.decode_ids(sample_input))

<|user|>
How does malnutrition impact the development and functioning of the brain?
<|llm|>
Malnutrition's Brain Impact<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>


In [16]:
# Element 1 of the batch, same item index: the target ids for that example.
sample_response = first_batch[1][2]

In [17]:
# Decode the targets. The output starts at "|user|>", i.e. it appears shifted
# by one token relative to the inputs, and still includes the prompt because
# mask_input=False.
print(tokenizer.decode_ids(sample_response))

|user|>
How does malnutrition impact the development and functioning of the brain?
<|llm|>
Malnutrition's Brain Impact<|endoftext|>


In [18]:
# Rebuild the unmasked dataloader (same settings as before) to look at a
# second shuffled example.
dataloader = loader.load(
    dataset=dataset,
    batch_size=8,
    shuffle=True,
    drop_last=True,
    mask_input=False,
)

In [19]:
# Fresh iterator over the rebuilt dataloader.
data = iter(dataloader)

In [20]:
# Pull the first batch from the new iterator.
first_batch = next(data)

In [21]:
# Input ids of the item at index 2 in this batch.
sample_input = first_batch[0][2]

`-100` values are ignored by the tokenizer when decoding.

In [22]:
# Decode the input ids back to text for inspection.
print(tokenizer.decode_ids(sample_input))

<|user|>
What's the answer to this cryptic crossword clue?

Promising new Russian head of government (8)
<|llm|>
Cryptic Crossword Clue<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>


In [23]:
# Target ids of the same item (element 1 of the batch).
sample_response = first_batch[1][2]

In [24]:
# Decode the unmasked targets; the prompt is still present in the output.
print(tokenizer.decode_ids(sample_response))

|user|>
What's the answer to this cryptic crossword clue?

Promising new Russian head of government (8)
<|llm|>
Cryptic Crossword Clue<|endoftext|>


### Using InstructionDataLoader with masking inputs (Phi3)

In [25]:
# Same batch settings, but with mask_input=True: the decoded targets below
# contain only the response text.
dataloader = loader.load(
    dataset=dataset,
    batch_size=8,
    shuffle=True,
    drop_last=True,
    mask_input=True,
)

In [26]:
# Iterator over the masked dataloader.
data = iter(dataloader)

In [27]:
# Pull the first masked batch. This emits one "Masking inputs" log line per
# example (8 lines for batch_size=8), so masking appears to be applied when
# the batch is assembled — NOTE(review): confirm.
first_batch = next(data)

2025-05-10 14:34:13,552 - INFO - Masking inputs upto response query!!!
2025-05-10 14:34:13,555 - INFO - Masking inputs upto response query!!!
2025-05-10 14:34:13,556 - INFO - Masking inputs upto response query!!!
2025-05-10 14:34:13,556 - INFO - Masking inputs upto response query!!!
2025-05-10 14:34:13,557 - INFO - Masking inputs upto response query!!!
2025-05-10 14:34:13,557 - INFO - Masking inputs upto response query!!!
2025-05-10 14:34:13,560 - INFO - Masking inputs upto response query!!!
2025-05-10 14:34:13,561 - INFO - Masking inputs upto response query!!!


In [28]:
# Input ids of the item at index 2 in the masked batch.
sample_input = first_batch[0][2]

`-100` values are ignored by the tokenizer when decoding.

In [29]:
# Decode the inputs; the output shows the full prompt even with
# mask_input=True.
print(tokenizer.decode_ids(sample_input))

<|user|>
Arrange the items in alphabetical order: umbrella, chair, pin
<|llm|>
Alphabetic Order Query<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|

In [30]:
# Target ids of the same item in the masked batch.
sample_response = first_batch[1][2]

In [31]:
# Decode the masked targets; only the response text and one <|endoftext|>
# appear in the output.
print(tokenizer.decode_ids(sample_response))

Alphabetic Order Query<|endoftext|>


In [32]:
# NOTE(review): this cell sits under the "with masking inputs (Phi3)" heading
# but passes mask_input=False, and the decoded targets below include the
# prompt. If a second masked example was intended, set mask_input=True and
# re-run the following cells so their outputs match.
dataloader = loader.load(dataset=dataset, batch_size=8, shuffle=True, drop_last=True, mask_input=False)

In [33]:
# Iterator over the dataloader built in the previous cell.
data = iter(dataloader)

In [34]:
# Pull the first batch (no masking log lines are emitted for this one).
first_batch = next(data)

In [35]:
# Input ids of the item at index 2 in this batch.
sample_input = first_batch[0][2]

`-100` values are ignored by the tokenizer when decoding.

In [36]:
# Decode the input ids back to text for inspection.
print(tokenizer.decode_ids(sample_input))

<|user|>
How do changes in the electronic and magnetic properties of molecular magnets affect their stability and potential for use in magnetic data storage applications?
<|llm|>
Molecular Magnet Properties<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>


In [37]:
# Target ids of the same item (element 1 of the batch).
sample_response = first_batch[1][2]

In [38]:
# Decode the targets; the prompt is present because this batch was built with
# mask_input=False.
print(tokenizer.decode_ids(sample_response))

|user|>
How do changes in the electronic and magnetic properties of molecular magnets affect their stability and potential for use in magnetic data storage applications?
<|llm|>
Molecular Magnet Properties<|endoftext|>


### Creating a dataset using alpaca

In [39]:
# Instruction text passed to the alpaca style below; it appears at the start
# of every formatted example in the decoded outputs.
prompt = "###You are a very smart llm and given the input query your job is to predict the title of the input.\n"

In [40]:
# Alpaca-style template: instruction prompt first, then the message under
# "### Input:" and the title under "### Response:".
style = InstructionStyle.load(
    style='alpaca',
    prompt=prompt,
    input_query="### Input:\n",
    response_query="\n### Response:\n",
)

2025-05-10 14:34:13,693 - INFO - alpaca style chosen!!


In [41]:
# Re-format the same (message, title) pairs with the alpaca style.
# Note that this rebinds `df`, replacing the phi3-formatted frame.
df = style.convert_train(
    chat_df,
    input_col='message',
    response_col='title',
    output_col='instruct',
    new_df=True,
)

In [42]:
# Preview the alpaca-formatted training strings.
df.head()

Unnamed: 0,instruct
0,###You are a very smart llm and given the inpu...
1,###You are a very smart llm and given the inpu...
2,###You are a very smart llm and given the inpu...
3,###You are a very smart llm and given the inpu...
4,###You are a very smart llm and given the inpu...


### Using InstructionDataset

In [43]:
# Wrap the alpaca-formatted strings in an InstructionDataset
# (rebinds `dataset`, replacing the phi3 one).
dataset = InstructionDataset(df['instruct'].tolist())

In [44]:
# NOTE(review): this repeats the argument-less construction from the phi3
# section; the existing `loader` could presumably be reused — confirm the
# loader keeps no per-dataset state.
loader = InstructionDataLoader()

### Using InstructionDataLoader without masking inputs (alpaca)

In [45]:
# Unmasked dataloader over the alpaca dataset: batch size 8 with shuffle and
# drop_last enabled.
dataloader = loader.load(
    dataset=dataset,
    batch_size=8,
    shuffle=True,
    drop_last=True,
    mask_input=False,
)

In [46]:
# Iterator over the unmasked alpaca dataloader.
data = iter(dataloader)

In [47]:
# Pull the first batch.
first_batch = next(data)

In [48]:
# Input ids of the item at index 2 in this batch.
sample_input = first_batch[0][2]

In [49]:
# Decode the inputs: instruction prompt, "### Input:" section, "### Response:"
# section, then trailing <|endoftext|> tokens.
print(tokenizer.decode_ids(sample_input))

###You are a very smart llm and given the input query your job is to predict the title of the input.
### Input:
Act as a funnel agency owner who's worked with 100 clients and collected a list of all the "fears" and objections clients have for NOT moving forward and buying your services.

Here's 4 examples, but I want you to create a list of 10 more.
### Response:
Client Objections and Fears<|endoftext|><|endoftext|><|endoftext|><|endoftext|>


In [50]:
# Target ids of the same item (element 1 of the batch).
sample_response = first_batch[1][2]

### Response without masking inputs (alpaca)

`-100` values are ignored by the tokenizer when decoding.

In [51]:
# Decode the unmasked targets. The output lacks the leading "###" of the
# prompt, i.e. it appears shifted by one token relative to the inputs.
print(tokenizer.decode_ids(sample_response))

You are a very smart llm and given the input query your job is to predict the title of the input.
### Input:
Act as a funnel agency owner who's worked with 100 clients and collected a list of all the "fears" and objections clients have for NOT moving forward and buying your services.

Here's 4 examples, but I want you to create a list of 10 more.
### Response:
Client Objections and Fears<|endoftext|>


### Using InstructionDataLoader with masking inputs (alpaca)

In [52]:
# Masked dataloader over the alpaca dataset: with mask_input=True the decoded
# targets below contain only the response text.
dataloader = loader.load(
    dataset=dataset,
    batch_size=8,
    shuffle=True,
    drop_last=True,
    mask_input=True,
)

In [53]:
# Iterator over the masked alpaca dataloader.
data = iter(dataloader)

In [54]:
# Pull the first masked batch; one "Masking inputs" log line is emitted per
# example in the batch.
first_batch = next(data)

2025-05-10 14:34:14,248 - INFO - Masking inputs upto response query!!!
2025-05-10 14:34:14,250 - INFO - Masking inputs upto response query!!!
2025-05-10 14:34:14,251 - INFO - Masking inputs upto response query!!!
2025-05-10 14:34:14,251 - INFO - Masking inputs upto response query!!!
2025-05-10 14:34:14,252 - INFO - Masking inputs upto response query!!!
2025-05-10 14:34:14,253 - INFO - Masking inputs upto response query!!!
2025-05-10 14:34:14,253 - INFO - Masking inputs upto response query!!!
2025-05-10 14:34:14,254 - INFO - Masking inputs upto response query!!!


In [55]:
# Input ids of the item at index 2 in the masked batch.
sample_input = first_batch[0][2]

In [56]:
# Decode the inputs; the full prompt is still shown even with mask_input=True.
print(tokenizer.decode_ids(sample_input))

###You are a very smart llm and given the input query your job is to predict the title of the input.
### Input:
Compare and contrast the features of two different ML algorithms: K-means clustering and Decision Tree
### Response:
ML Algorithms Feature Comparison<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>


In [57]:
# Target ids of the same item in the masked batch.
sample_response = first_batch[1][2]

### Response with masking inputs (alpaca)

`-100` values are ignored by the tokenizer when decoding.

In [58]:
# Decode the masked targets; only the response text and one <|endoftext|>
# appear in the output.
print(tokenizer.decode_ids(sample_response))

ML Algorithms Feature Comparison<|endoftext|>
