In [None]:
import pandas as pd
from llm.processing import InstructionStyle, InstructionLoader, Tokenizer, Model
from llm.gpt2_model import GPT2
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Chat-message/title pairs stored as JSON Lines (one record per line) on the Hugging Face Hub.
CHAT_TITLES_URL = "hf://datasets/ogrnz/chat-titles/dataset.jsonl"
chat_df = pd.read_json(CHAT_TITLES_URL, lines=True)

In [3]:
# Inspect the raw frame: one prompt per row in `message`, its short title in `title`.
chat_df

Unnamed: 0,message,title
0,Develop a method for clustering astronomical d...,Astronomical Data Clustering Method
1,What is the role of dopamine in the brain as a...,Dopamine as a Neurotransmitter
2,"Replace the \""XXX\"" in the following sentence ...",Company Sustainability Objective
3,"Identify the sentence type (declarative, inter...",Sentence Type Identification
4,Suggest a book title for a fantasy novel about...,Outlaws in Enchanted Realms
...,...,...
9995,"From the given passage, identify the main idea...",Exploring the Night Sky
9996,Analyze an effect that the text could have on ...,Impact on Audience Analysis
9997,What is the average number of hours of sleep a...,Recommended Sleep Duration
9998,Classify the following statement as either a D...,Statement Classification Query


In [4]:
# Positional split: the first TRAIN_ROWS rows are used for training, the rest are held out.
# The boundary is named once so the two slices cannot drift apart.
TRAIN_ROWS = 9500
train_df = chat_df.iloc[:TRAIN_ROWS]
test_df = chat_df.iloc[TRAIN_ROWS:]
# Fail loudly instead of silently producing an empty hold-out set when the dataset
# has no more than TRAIN_ROWS rows.
assert not test_df.empty, f"dataset has only {len(chat_df)} rows; nothing left after row {TRAIN_ROWS}"

In [5]:
# Wrap the training rows for prompt formatting; 'message' and 'title' are the two
# columns of chat_df.
# NOTE(review): presumably (input column, response column) — confirm against InstructionStyle.
instruct_train = InstructionStyle(train_df, 'message', 'title')

In [6]:
# Format every training row into one prompt string using the 'phi3' style, with an
# explicit marker line in front of the input and in front of the response.
df = instruct_train.convert(
    'phi3',
    input_query='<input>\n',
    response_query='<response>\n',
)

In [7]:
# print() rather than a bare expression so the embedded newlines render as line breaks.
first_prompt = df.iloc[0, 0]  # row 0 of the frame's single column
print(first_prompt)

<input>
Develop a method for clustering astronomical data.
<response>
Astronomical Data Clustering Method


In [25]:
# Preview the first ten formatted prompts (single `instruct_data` column).
df.head(10)

Unnamed: 0,instruct_data
0,<input>\nDevelop a method for clustering astro...
1,<input>\nWhat is the role of dopamine in the b...
2,"<input>\nReplace the \""XXX\"" in the following ..."
3,<input>\nIdentify the sentence type (declarati...
4,<input>\nSuggest a book title for a fantasy no...
5,<input>\nName three endangered animal species....
6,<input>\nCompose an email to the marketing tea...
7,"<input>\nGiven a list of words, group them int..."
8,<input>\nExplain the pros and cons of using a ...
9,<input>\nHow does the positioning of body part...


In [9]:
# Project tokenizer from llm.processing.
# NOTE(review): the ids shown further down (50256 decoding to <|endoftext|>) suggest a
# GPT-2 vocabulary — confirm.
tokenizer = Tokenizer()

In [10]:
# Hand the formatted prompts and the tokenizer to the loader that produces the
# train/validation batches used below.
instruct_loader = InstructionLoader(df, tokenizer)

In [11]:
# Split into train and validation iterables.
# NOTE(review): the positional arguments are presumably (validation fraction, batch size) —
# confirm against InstructionLoader.load and consider passing them by keyword.
train, val = instruct_loader.load(0.1, 8)

In [12]:
# Manual iterator over the training split so a single batch can be pulled for inspection.
data_iter = iter(train)

In [13]:
# Pull one batch: first_batch[0] holds the input ids and first_batch[1] the matching
# targets (compare the two arrays printed below).
first_batch = next(data_iter)


In [14]:
# Input ids of sample 3 in the batch; the trailing run of 50256 is padding
# (50256 decodes to <|endoftext|>).
first_batch[0][3].numpy()

array([   27, 15414,    29,   198, 14832,   428,  7072,   290,  1577,
         257,  7955,   503,   286,   642,  5788,    25,   314,  8672,
        1717,  9892,   338, 49967,   702, 34814,   938,  1755,   290,
         550,   257,  9623,  9799,    13,   198,    27, 26209,    29,
         198, 19452,  2899,   415,  6602, 19390, 50256, 50256, 50256,
       50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
       50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
       50256])

In [15]:
# Targets for the same sample: the inputs shifted left by one token. One 50256 is kept
# as the end-of-text target; the remaining padded positions are -100.
# NOTE(review): presumably -100 so the loss skips those positions (it is the default
# ignore_index of torch.nn.CrossEntropyLoss) — confirm in the training loop.
first_batch[1][3].numpy()

array([15414,    29,   198, 14832,   428,  7072,   290,  1577,   257,
        7955,   503,   286,   642,  5788,    25,   314,  8672,  1717,
        9892,   338, 49967,   702, 34814,   938,  1755,   290,   550,
         257,  9623,  9799,    13,   198,    27, 26209,    29,   198,
       19452,  2899,   415,  6602, 19390, 50256,  -100,  -100,  -100,
        -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
        -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
        -100])

In [26]:
# Round-trip check: turn the input ids of sample 2 back into text (padding included).
sample_ids = first_batch[0][2].numpy()
print(tokenizer.decode_ids(sample_ids))

<input>
Re-write the following speech inorder to make it more persuasive: Climate change is an issue that can no longer be ignored. We must take action to limit the damage on our planet.
<response>
Persuasive Climate Change Message<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>


In [17]:
# Instantiate the GPT2 model class from llm.gpt2_model; its weights are loaded in the next cell.
gpt2 = GPT2()

In [18]:
# Load weights into the model (prints "Loading weights").
# NOTE(review): which checkpoint is used is decided inside GPT2.load_weights —
# presumably pretrained GPT-2; confirm.
gpt2.load_weights()

Loading weights


In [19]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Model(device)

In [20]:
# AdamW over every parameter of the GPT-2 model.
optimizer = torch.optim.AdamW(
    gpt2.parameters(),
    lr=4e-4,
    weight_decay=0.1,
)

In [21]:
# NOTE(review): fine-tuning is disabled, so the generation cells below run on whatever
# gpt2.load_weights() loaded. If this is re-enabled, be aware that the assignment
# rebinds `model` to the first value returned by train(); confirm that object still
# exposes generate(), which the next cell calls on `model`.
# model, train_loss, val_loss = model.train(gpt2, train, val, 1, optimizer)

In [23]:
# Sample a short continuation for a fixed prompt.
# Seeding immediately before the call makes re-running this cell reproduce the same text
# (assumes generate() draws from torch's global RNG — TODO confirm).
torch.manual_seed(0)
prompt_ids = torch.tensor(tokenizer.encode_text('Artificial Intelligence is')).unsqueeze(dim=0)  # add a batch axis
out = model.generate(
    gpt2,
    prompt_ids,
    max_new_tokens=25,
    temperature=1.2,
    top_k=5,
)

In [24]:
# Decode the generated sequence. The batch axis is indexed explicitly (the first and,
# here, only sequence) instead of star-unpacking it, which only works when the batch
# holds exactly one row. .cpu() is a no-op for CPU tensors and keeps the conversion
# valid if `out` lives on the GPU, since Tensor.numpy() only accepts CPU tensors.
tokenizer.decode_ids(out[0].cpu().numpy())

'Artificial Intelligence is the next great frontier in the world of technology.\n\nThis is because artificial intelligence can help in many areas, like the'