In [1]:
from chathist import Tokenizer, InstructionDataLoader,InstructionDataset, InstructionStyle
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


### Trying the same on the chat dataset

In [2]:
# Read the chat-titles JSON Lines file (one record per line) from the hf:// dataset path.
chat_df = pd.read_json(
    "hf://datasets/ogrnz/chat-titles/dataset.jsonl",
    lines=True,
)

In [3]:
chat_df.columns

Index(['message', 'title'], dtype='object')

In [4]:
chat_df.head()

Unnamed: 0,message,title
0,Develop a method for clustering astronomical d...,Astronomical Data Clustering Method
1,What is the role of dopamine in the brain as a...,Dopamine as a Neurotransmitter
2,"Replace the \""XXX\"" in the following sentence ...",Company Sustainability Objective
3,"Identify the sentence type (declarative, inter...",Sentence Type Identification
4,Suggest a book title for a fantasy novel about...,Outlaws in Enchanted Realms


### Setting tokenizer


In [5]:
tokenizer = Tokenizer()

### Creating a dataset using phi3

In [6]:
# phi3-style template: the user turn is introduced by "<|user|>" and the
# response by "<|llm|>" (both markers are visible in the decoded samples below).
style = InstructionStyle.load(
    style="phi3",
    input_query="<|user|>\n",
    response_query="\n<|llm|>\n",
)

In [7]:
# Combine each 'message' / 'title' pair into one formatted training string
# stored under 'instruct'; the resulting frame shown below has only that column.
df = style.convert_train(
    chat_df,
    input_col="message",
    response_col="title",
    output_col="instruct",
    new_df=True,
)

In [8]:
df.head()

Unnamed: 0,instruct
0,<|user|>\nDevelop a method for clustering astr...
1,<|user|>\nWhat is the role of dopamine in the ...
2,"<|user|>\nReplace the \""XXX\"" in the following..."
3,<|user|>\nIdentify the sentence type (declarat...
4,<|user|>\nSuggest a book title for a fantasy n...


### Using InstructionDataset

In [9]:
# Hand the formatted 'instruct' strings to the dataset as a plain Python list.
dataset = InstructionDataset(df['instruct'].tolist())

In [10]:
loader = InstructionDataLoader()

### Using InstructionLoader without masking inputs (Phi3)

In [11]:
# Batches of 8 with shuffling enabled and the last partial batch dropped.
# mask_input=False: the decoded targets below still contain the prompt text.
dataloader = loader.load(
    dataset=dataset,
    batch_size=8,
    shuffle=True,
    drop_last=True,
    mask_input=False,
)

In [12]:
data = iter(dataloader)

In [13]:
first_batch = next(data)

In [14]:
# Take the sample at index 2 from the first element of the batch
# (presumably the input token ids — TODO confirm against the loader's batch layout).
sample_input = first_batch[0][2]

Note: `-100` values are ignored by the tokenizer when decoding.

In [15]:
print(tokenizer.decode_ids(sample_input))

<|user|>
Name a popular form of transportation in France.
<|llm|>
Popular French Transportation<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoft

In [16]:
# Same sample (index 2), taken from the second element of the batch
# (presumably the target token ids — TODO confirm against the loader's batch layout).
sample_response = first_batch[1][2]

In [17]:
print(tokenizer.decode_ids(sample_response))

|user|>
Name a popular form of transportation in France.
<|llm|>
Popular French Transportation<|endoftext|>


In [18]:
# Rebuild the loader with the same unmasked settings to inspect another shuffled sample.
dataloader = loader.load(
    dataset=dataset,
    batch_size=8,
    shuffle=True,
    drop_last=True,
    mask_input=False,
)

In [19]:
data = iter(dataloader)

In [20]:
first_batch = next(data)

In [21]:
sample_input = first_batch[0][2]

Note: `-100` values are ignored by the tokenizer when decoding.

In [22]:
print(tokenizer.decode_ids(sample_input))

<|user|>
What type of triangle has three congruent sides?
<|llm|>
Congruent Sided Triangles<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>


In [23]:
sample_response = first_batch[1][2]

In [24]:
print(tokenizer.decode_ids(sample_response))

|user|>
What type of triangle has three congruent sides?
<|llm|>
Congruent Sided Triangles<|endoftext|>


### Using InstructionLoader with masking inputs (Phi3)

In [25]:
# Same batching settings, but with mask_input=True: the decoded targets below
# contain only the response text followed by <|endoftext|>.
dataloader = loader.load(
    dataset=dataset,
    batch_size=8,
    shuffle=True,
    drop_last=True,
    mask_input=True,
)

In [26]:
data = iter(dataloader)

In [27]:
first_batch = next(data)

In [28]:
sample_input = first_batch[0][2]

Note: `-100` values are ignored by the tokenizer when decoding.

In [29]:
print(tokenizer.decode_ids(sample_input))

<|user|>
Find the main idea of the following passage: Climate change is an increasingly contentious global issue. It is already having a drastic effect on the environment and human lives. We are already seeing an increase in extreme weather events and rising sea levels, both of which have a major impact on the planet.
<|llm|>
Understanding Climate Change Impact


In [30]:
sample_response = first_batch[1][2]

In [31]:
print(tokenizer.decode_ids(sample_response))

Understanding Climate Change Impact<|endoftext|>


In [32]:
# NOTE(review): this cell sits under the "with masking inputs (Phi3)" heading but
# passes mask_input=False, so the decoded targets below still include the prompt —
# confirm whether mask_input=True was intended here.
dataloader = loader.load(dataset=dataset, batch_size=8, shuffle=True, drop_last=True, mask_input=False)

In [33]:
data = iter(dataloader)

In [34]:
first_batch = next(data)

In [35]:
sample_input = first_batch[0][2]

Note: `-100` values are ignored by the tokenizer when decoding.

In [36]:
print(tokenizer.decode_ids(sample_input))

<|user|>
Determine the structure of this poem: A melody of bright June
Birds gathering and singing in tune
The unbounded joy of dreams coming true
Lingering in moment and no time to lose
<|llm|>
June Melody Structure Analysis<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>


In [37]:
sample_response = first_batch[1][2]

In [38]:
print(tokenizer.decode_ids(sample_response))

|user|>
Determine the structure of this poem: A melody of bright June
Birds gathering and singing in tune
The unbounded joy of dreams coming true
Lingering in moment and no time to lose
<|llm|>
June Melody Structure Analysis<|endoftext|>


### Creating a dataset using alpaca

In [39]:
# Instruction text handed to the alpaca style as `prompt`; it shows up at the
# start of every decoded alpaca-formatted example below.
prompt = "###You are a very smart llm and given the input query your job is to predict the title of the input.\n"

In [40]:
# alpaca-style template: the shared instruction prompt, then "### Input:" and
# "### Response:" sections (as seen in the decoded samples below).
style = InstructionStyle.load(
    style="alpaca",
    prompt=prompt,
    input_query="### Input:\n",
    response_query="\n### Response:\n",
)

In [41]:
# Re-format the original chat_df with the alpaca template; this rebinds `df`,
# replacing the phi3-formatted frame built earlier in the notebook.
df = style.convert_train(
    chat_df,
    input_col="message",
    response_col="title",
    output_col="instruct",
    new_df=True,
)

In [42]:
df.head()

Unnamed: 0,instruct
0,###You are a very smart llm and given the inpu...
1,###You are a very smart llm and given the inpu...
2,###You are a very smart llm and given the inpu...
3,###You are a very smart llm and given the inpu...
4,###You are a very smart llm and given the inpu...


### Using InstructionDataset

In [43]:
# Rebuild the dataset from the alpaca-formatted 'instruct' strings (as a plain Python list).
dataset = InstructionDataset(df['instruct'].tolist())

In [44]:
loader = InstructionDataLoader()

### Using InstructionLoader without masking inputs (alpaca)

In [45]:
# Alpaca-formatted data, batches of 8, shuffling enabled, last partial batch dropped.
# mask_input=False: the decoded targets below still contain the prompt and input text.
dataloader = loader.load(
    dataset=dataset,
    batch_size=8,
    shuffle=True,
    drop_last=True,
    mask_input=False,
)

In [46]:
data = iter(dataloader)

In [47]:
first_batch = next(data)

In [48]:
sample_input = first_batch[0][2]

In [49]:
print(tokenizer.decode_ids(sample_input))

###You are a very smart llm and given the input query your job is to predict the title of the input.
### Input:
Edit the following sentence to correct any incorrect usage of the possessive form: That's my sister's and I car.
### Response:
Possessive Form Correction<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>


In [50]:
sample_response = first_batch[1][2]

### Response without masking inputs (alpaca)

Note: `-100` values are ignored by the tokenizer when decoding.

In [51]:
print(tokenizer.decode_ids(sample_response))

You are a very smart llm and given the input query your job is to predict the title of the input.
### Input:
Edit the following sentence to correct any incorrect usage of the possessive form: That's my sister's and I car.
### Response:
Possessive Form Correction<|endoftext|>


### Using InstructionLoader with masking inputs (alpaca)

In [52]:
# Alpaca-formatted data with the same batching settings, this time with mask_input=True.
dataloader = loader.load(
    dataset=dataset,
    batch_size=8,
    shuffle=True,
    drop_last=True,
    mask_input=True,
)

In [53]:
data = iter(dataloader)

In [54]:
first_batch = next(data)

In [55]:
sample_input = first_batch[0][2]

In [56]:
print(tokenizer.decode_ids(sample_input))

###You are a very smart llm and given the input query your job is to predict the title of the input.
### Input:
### Response:


In [57]:
sample_response = first_batch[1][2]

### Response with masking inputs (alpaca)

Note: `-100` values are ignored by the tokenizer when decoding.

In [58]:
print(tokenizer.decode_ids(sample_response))

