In [1]:
import chathist
from chathist import Tokenizer, InstructionDataLoader,InstructionDataset, InstructionStyle
import pandas as pd
import torch

### Trying the same on the chat dataset

In [2]:
# Load the chat-titles dataset from the Hugging Face Hub.
# The file is JSON Lines (one record per line), hence lines=True.
chat_df = pd.read_json(
    "hf://datasets/ogrnz/chat-titles/dataset.jsonl",
    lines=True,
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Check which columns the dataset exposes before choosing the input/response fields.
chat_df.columns

Index(['message', 'title'], dtype='object')

In [4]:
# Preview the first rows of the raw message/title pairs.
chat_df.head()

Unnamed: 0,message,title
0,Develop a method for clustering astronomical d...,Astronomical Data Clustering Method
1,What is the role of dopamine in the brain as a...,Dopamine as a Neurotransmitter
2,"Replace the \""XXX\"" in the following sentence ...",Company Sustainability Objective
3,"Identify the sentence type (declarative, inter...",Sentence Type Identification
4,Suggest a book title for a fantasy novel about...,Outlaws in Enchanted Realms


### Setting tokenizer

In [5]:
# Instantiate chathist's Tokenizer with its defaults (no arguments are passed).
tokenizer = Tokenizer()

### Registering the tokenizer and dtype in the chathist config

In [6]:
# Register the tokenizer instance on chathist's shared config.
chathist.config.set_tokenizer(tokenizer=tokenizer)
# Set the config dtype to 32-bit integers.
# NOTE(review): presumably this is the dtype of the token-id tensors the loaders emit.
# torch's cross-entropy loss expects int64 class-index targets, so confirm that
# chathist casts the targets before any loss is computed.
chathist.config.set_dtype(torch.int32)

### Creating a dataset using phi3

In [7]:
# phi3-style template: "<|user|>" precedes the message, "<|llm|>" precedes the title.
style = InstructionStyle.load(
    style="phi3",
    input_query="<|user|>\n",
    response_query="\n<|llm|>\n",
)

In [8]:
# Render every (message, title) pair into one training string in the 'instruct' column.
# With new_df=True the result shown below carries only that column.
df = style.convert_train(
    chat_df,
    input_col="message",
    response_col="title",
    output_col="instruct",
    new_df=True,
)

In [9]:
# Preview the rendered phi3-style training strings.
df.head()

Unnamed: 0,instruct
0,<|user|>\nDevelop a method for clustering astr...
1,<|user|>\nWhat is the role of dopamine in the ...
2,"<|user|>\nReplace the \""XXX\"" in the following..."
3,<|user|>\nIdentify the sentence type (declarat...
4,<|user|>\nSuggest a book title for a fantasy n...


### Using InstructionDataset

In [10]:
# Hand the rendered strings to chathist's dataset wrapper as a plain Python list.
dataset = InstructionDataset(
    df["instruct"].tolist(),
)

In [11]:
# Create chathist's loader helper with its defaults; its .load() call below builds the batches.
loader = InstructionDataLoader()

### Using InstructionDataLoader without masking inputs (phi3)

In [12]:
# Seed torch's global RNG so the shuffled batch order, and therefore the sample
# inspected below, is repeatable across Restart & Run All.
# NOTE(review): assumes chathist's loader shuffles through torch's default
# generator (e.g. by wrapping torch.utils.data.DataLoader) - confirm.
torch.manual_seed(42)

# Batches of 8, shuffled, incomplete final batch dropped, prompt tokens left unmasked.
dataloader = loader.load(
    dataset=dataset,
    batch_size=8,
    shuffle=True,
    drop_last=True,
    mask_input=False,
)

In [13]:
# Get an iterator over the batches so a single batch can be pulled for inspection.
data = iter(dataloader)

In [14]:
# Pull only the first batch; the rest of the iterator is left unconsumed.
first_batch = next(data)

In [15]:
# Take the example at index 2 from element 0 of the batch.
# NOTE(review): element 0 appears to hold the model inputs and element 1 the targets - confirm against chathist.
sample_input = first_batch[0][2]

In [16]:
# Decode the token ids back to text to eyeball the prompt and any trailing <|endoftext|> tokens.
print(tokenizer.decode_ids(sample_input))

<|user|>
Name three popular authors from the 20th century.
<|llm|>
Popular 20th Century Authors<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoft

In [17]:
# Same example (index 2), this time taken from element 1 of the batch.
sample_response = first_batch[1][2]

In [18]:
# Decode the second tensor for comparison with the input decoded above.
# NOTE(review): the targets look like the inputs shifted left by one token
# (next-token prediction), with padded positions presumably replaced by an
# ignore value that decode_ids skips - TODO confirm in chathist.
print(tokenizer.decode_ids(sample_response))

|user|>
Name three popular authors from the 20th century.
<|llm|>
Popular 20th Century Authors<|endoftext|>


### Creating a dataset using alpaca

In [38]:
# System-style instruction placed ahead of every alpaca-formatted example.
prompt = (
    "###You are a very smart llm and given the input query "
    "your job is to predict the title of the input.\n"
)

In [39]:
# alpaca-style template: instruction prompt first, then "### Input:" and "### Response:" sections.
style = InstructionStyle.load(
    style="alpaca",
    prompt=prompt,
    input_query="### Input:\n",
    response_query="\n### Response:\n",
)

In [40]:
# Re-render the same (message, title) pairs with the alpaca template.
# Note that this rebinds `df`, replacing the phi3-formatted frame.
df = style.convert_train(
    chat_df,
    input_col="message",
    response_col="title",
    output_col="instruct",
    new_df=True,
)

In [41]:
# Preview the rendered alpaca-style training strings.
df.head()

Unnamed: 0,instruct
0,###You are a very smart llm and given the inpu...
1,###You are a very smart llm and given the inpu...
2,###You are a very smart llm and given the inpu...
3,###You are a very smart llm and given the inpu...
4,###You are a very smart llm and given the inpu...


### Using InstructionDataset

In [42]:
# Hand the alpaca-formatted strings to chathist's dataset wrapper as a plain Python list.
dataset = InstructionDataset(
    df["instruct"].tolist(),
)

In [43]:
# Create a fresh chathist loader helper with its defaults for the alpaca run.
loader = InstructionDataLoader()

### Using InstructionDataLoader without masking inputs (alpaca)

In [44]:
# Seed torch's global RNG so the shuffled batch order, and therefore the sample
# inspected below, is repeatable across Restart & Run All.
# NOTE(review): assumes chathist's loader shuffles through torch's default
# generator (e.g. by wrapping torch.utils.data.DataLoader) - confirm.
torch.manual_seed(42)

# Batches of 8, shuffled, incomplete final batch dropped, prompt tokens left unmasked.
dataloader = loader.load(
    dataset=dataset,
    batch_size=8,
    shuffle=True,
    drop_last=True,
    mask_input=False,
)

In [45]:
# Get an iterator over the alpaca batches so a single batch can be pulled for inspection.
data = iter(dataloader)

In [46]:
# Pull only the first batch; the rest of the iterator is left unconsumed.
first_batch = next(data)

In [47]:
# Take the example at index 2 from element 0 of the batch.
# NOTE(review): element 0 appears to hold the model inputs and element 1 the targets - confirm against chathist.
sample_input = first_batch[0][2]

In [48]:
# Decode the token ids back to text to eyeball the alpaca prompt and any trailing <|endoftext|> tokens.
print(tokenizer.decode_ids(sample_input))

###You are a very smart llm and given the input query your job is to predict the title of the input.
### Input:
What are good voice-to-text software that works locally (no internet/Cloud) ?
### Response:
Local Voice-to-Text Software<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>


In [49]:
# Same example (index 2), this time taken from element 1 of the batch.
sample_response = first_batch[1][2]

In [50]:
# Decode the second tensor for comparison with the input decoded above.
# NOTE(review): the targets look like the inputs shifted left by one token
# (next-token prediction), with padded positions presumably replaced by an
# ignore value that decode_ids skips - TODO confirm in chathist.
print(tokenizer.decode_ids(sample_response))

You are a very smart llm and given the input query your job is to predict the title of the input.
### Input:
What are good voice-to-text software that works locally (no internet/Cloud) ?
### Response:
Local Voice-to-Text Software<|endoftext|>


### Using InstructionDataLoader with input masking (TODO: not yet implemented)