In [1]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments
import pandas as pd
import numpy as np
import os
import torch
from torch.utils.data import Dataset
from torch.utils.data import random_split

  from .autonotebook import tqdm as notebook_tqdm
2025-01-05 13:55:32.849874: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736114132.861105  207488 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736114132.864614  207488 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-05 13:55:32.878167: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
window_size = 128
stride = 1
embedding_dim = 512
num_layers = 24
num_heads = 16
ff_dim = 1024
train_ratio = 0.8

In [3]:
model_name = "EleutherAI/gpt-neo-125M"
model = GPTNeoForCausalLM.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [4]:
tokenizer.pad_token = tokenizer.eos_token

In [5]:
def load_essays_from_folder(folder_path):
    essays = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
                essays.append(f.read())
    return essays 

essays = load_essays_from_folder('rawdata/essays')
essays[:5]

['I remember the first time I grasped the power of natural language processing (NLP). In 9th grade, I was toying around with Google Translate when I began wondering how it works. To an end user, it might look simple—type in a sentence, hit enter, and voilà, the machine gives you a translation for any of its 234 languages. I couldn\'t help but think, How is this even possible? Language is so messy and full of nuances.\nThis fascination resonated with my love for math—I find myself searching for numerical patterns, whether factoring street numbers or playing "24" with words (where A=1, B=2, and so on). Through NLP, I discovered how computers could transform language into mathematical representations, combining my love of patterns with real-world impact.\nMy passion grew through hands-on projects. After completing Stanford\'s NLP course on Coursera, I sought mentorship from Krishna Chintalapudi, a Principal Researcher at Microsoft, to improve text classification techniques. For Technology

In [6]:
def generate_sliding_window_data(sequence):
    inputs = []
    labels = []

    for i in range(0, len(sequence) - window_size, stride):
        window_sequence = sequence[i:i + window_size]
        next_tokens = sequence[i + 1:i + window_size + 1]  # Shift by one for labels
        inputs.append(window_sequence)
        labels.append(next_tokens)

    return inputs, labels

In [7]:
def prepare_data(essays):
    inputs = []
    labels = []

    for essay in essays:
        tokenized_essay = tokenizer(
                                    essay,
                                    padding="max_length",
                                    return_tensors="np",
                                )["input_ids"][0]
        essay_inputs, essay_labels = generate_sliding_window_data(tokenized_essay)
        inputs.extend(essay_inputs)
        labels.extend(essay_labels)

    return np.array(inputs), np.array(labels)

In [8]:
inputs, labels= prepare_data(essays)

In [9]:
inputs.shape, labels.shape

((94080, 128), (94080, 128))

In [10]:
class KittuSLMData(Dataset):
    def __init__(self, tokenized_inputs, tokenized_labels):
        self.input_ids = tokenized_inputs
        self.labels = tokenized_labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.input_ids[idx], dtype=torch.long),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Create dataset
dataset = KittuSLMData(inputs, labels)

In [11]:
train_Size = int(train_ratio * len(dataset))
val_size = len(dataset) - train_Size

In [12]:
train_dataset, val_dataset = random_split(dataset, [train_Size, val_size])
len(train_dataset), len(val_dataset)

(75264, 18816)

In [13]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True
)



In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [15]:
trainer.train()

  2%|▏         | 100/4704 [00:33<25:41,  2.99it/s]

{'loss': 0.9669, 'grad_norm': 1.2045801877975464, 'learning_rate': 4.8937074829931974e-05, 'epoch': 0.02}


  4%|▍         | 200/4704 [01:07<25:42,  2.92it/s]

{'loss': 0.5589, 'grad_norm': 1.6572766304016113, 'learning_rate': 4.7874149659863945e-05, 'epoch': 0.04}


  6%|▋         | 300/4704 [01:41<25:58,  2.83it/s]

{'loss': 0.4172, 'grad_norm': 1.857711672782898, 'learning_rate': 4.6811224489795916e-05, 'epoch': 0.06}


  9%|▊         | 400/4704 [02:15<24:19,  2.95it/s]

{'loss': 0.2911, 'grad_norm': 2.28802752494812, 'learning_rate': 4.5748299319727895e-05, 'epoch': 0.09}


 11%|█         | 500/4704 [02:49<23:43,  2.95it/s]

{'loss': 0.2061, 'grad_norm': 1.130794644355774, 'learning_rate': 4.4685374149659866e-05, 'epoch': 0.11}


 13%|█▎        | 600/4704 [03:23<24:02,  2.84it/s]

{'loss': 0.177, 'grad_norm': 1.2035126686096191, 'learning_rate': 4.362244897959184e-05, 'epoch': 0.13}


 15%|█▍        | 700/4704 [03:58<23:34,  2.83it/s]

{'loss': 0.1453, 'grad_norm': 0.6960482597351074, 'learning_rate': 4.255952380952381e-05, 'epoch': 0.15}


 17%|█▋        | 800/4704 [04:31<22:08,  2.94it/s]

{'loss': 0.1058, 'grad_norm': 1.1275807619094849, 'learning_rate': 4.149659863945579e-05, 'epoch': 0.17}


 19%|█▉        | 900/4704 [05:06<21:12,  2.99it/s]

{'loss': 0.0699, 'grad_norm': 1.9548039436340332, 'learning_rate': 4.043367346938776e-05, 'epoch': 0.19}


 21%|██▏       | 1000/4704 [05:40<20:20,  3.03it/s]

{'loss': 0.0685, 'grad_norm': 0.26908838748931885, 'learning_rate': 3.937074829931973e-05, 'epoch': 0.21}


 23%|██▎       | 1100/4704 [06:13<19:59,  3.01it/s]

{'loss': 0.0721, 'grad_norm': 0.5885570049285889, 'learning_rate': 3.83078231292517e-05, 'epoch': 0.23}


 26%|██▌       | 1200/4704 [06:47<19:37,  2.98it/s]

{'loss': 0.0615, 'grad_norm': 0.39010483026504517, 'learning_rate': 3.724489795918368e-05, 'epoch': 0.26}


 28%|██▊       | 1300/4704 [07:21<19:08,  2.96it/s]

{'loss': 0.053, 'grad_norm': 1.0771703720092773, 'learning_rate': 3.618197278911565e-05, 'epoch': 0.28}


 30%|██▉       | 1400/4704 [07:54<18:13,  3.02it/s]

{'loss': 0.045, 'grad_norm': 0.8651258945465088, 'learning_rate': 3.511904761904762e-05, 'epoch': 0.3}


 32%|███▏      | 1500/4704 [08:28<17:52,  2.99it/s]

{'loss': 0.0426, 'grad_norm': 1.138550043106079, 'learning_rate': 3.405612244897959e-05, 'epoch': 0.32}


 34%|███▍      | 1600/4704 [09:03<17:32,  2.95it/s]

{'loss': 0.0408, 'grad_norm': 0.6409257650375366, 'learning_rate': 3.2993197278911564e-05, 'epoch': 0.34}


 36%|███▌      | 1700/4704 [09:37<16:42,  3.00it/s]

{'loss': 0.0378, 'grad_norm': 0.7477817535400391, 'learning_rate': 3.193027210884354e-05, 'epoch': 0.36}


 38%|███▊      | 1800/4704 [10:11<16:19,  2.96it/s]

{'loss': 0.0351, 'grad_norm': 0.5872397422790527, 'learning_rate': 3.086734693877551e-05, 'epoch': 0.38}


 40%|████      | 1900/4704 [10:44<15:06,  3.09it/s]

{'loss': 0.0403, 'grad_norm': 0.9978460073471069, 'learning_rate': 2.9804421768707485e-05, 'epoch': 0.4}


 43%|████▎     | 2000/4704 [11:18<15:32,  2.90it/s]

{'loss': 0.0386, 'grad_norm': 0.5194665789604187, 'learning_rate': 2.8741496598639456e-05, 'epoch': 0.43}


 45%|████▍     | 2100/4704 [11:52<14:49,  2.93it/s]

{'loss': 0.0354, 'grad_norm': 0.6175212264060974, 'learning_rate': 2.767857142857143e-05, 'epoch': 0.45}


 47%|████▋     | 2200/4704 [12:25<13:31,  3.09it/s]

{'loss': 0.0366, 'grad_norm': 0.5463506579399109, 'learning_rate': 2.6615646258503402e-05, 'epoch': 0.47}


 49%|████▉     | 2300/4704 [12:58<12:54,  3.10it/s]

{'loss': 0.0282, 'grad_norm': 0.6239235401153564, 'learning_rate': 2.5552721088435377e-05, 'epoch': 0.49}


 51%|█████     | 2400/4704 [13:31<12:38,  3.04it/s]

{'loss': 0.0313, 'grad_norm': 1.7174289226531982, 'learning_rate': 2.448979591836735e-05, 'epoch': 0.51}


 53%|█████▎    | 2500/4704 [14:04<12:01,  3.05it/s]

{'loss': 0.031, 'grad_norm': 0.46412011981010437, 'learning_rate': 2.342687074829932e-05, 'epoch': 0.53}


 55%|█████▌    | 2600/4704 [14:37<11:32,  3.04it/s]

{'loss': 0.0278, 'grad_norm': 0.6976783275604248, 'learning_rate': 2.2363945578231294e-05, 'epoch': 0.55}


 57%|█████▋    | 2700/4704 [15:09<11:12,  2.98it/s]

{'loss': 0.0273, 'grad_norm': 0.00035124149871990085, 'learning_rate': 2.1301020408163266e-05, 'epoch': 0.57}


 60%|█████▉    | 2800/4704 [15:42<10:13,  3.10it/s]

{'loss': 0.0281, 'grad_norm': 0.4442143440246582, 'learning_rate': 2.023809523809524e-05, 'epoch': 0.6}


 62%|██████▏   | 2900/4704 [16:14<09:46,  3.07it/s]

{'loss': 0.028, 'grad_norm': 0.47989848256111145, 'learning_rate': 1.9175170068027212e-05, 'epoch': 0.62}


 64%|██████▍   | 3000/4704 [16:47<09:15,  3.07it/s]

{'loss': 0.0293, 'grad_norm': 0.5651704668998718, 'learning_rate': 1.8112244897959187e-05, 'epoch': 0.64}


 66%|██████▌   | 3100/4704 [17:20<08:37,  3.10it/s]

{'loss': 0.0293, 'grad_norm': 0.7169039845466614, 'learning_rate': 1.7049319727891158e-05, 'epoch': 0.66}


 68%|██████▊   | 3200/4704 [17:53<08:14,  3.04it/s]

{'loss': 0.0277, 'grad_norm': 0.21078425645828247, 'learning_rate': 1.5986394557823133e-05, 'epoch': 0.68}


 70%|███████   | 3300/4704 [18:25<07:35,  3.08it/s]

{'loss': 0.0278, 'grad_norm': 0.6674048900604248, 'learning_rate': 1.4923469387755104e-05, 'epoch': 0.7}


 72%|███████▏  | 3400/4704 [18:58<07:21,  2.95it/s]

{'loss': 0.0243, 'grad_norm': 0.9188097715377808, 'learning_rate': 1.3860544217687074e-05, 'epoch': 0.72}


 74%|███████▍  | 3500/4704 [19:31<06:34,  3.05it/s]

{'loss': 0.0261, 'grad_norm': 0.53313148021698, 'learning_rate': 1.2797619047619047e-05, 'epoch': 0.74}


 77%|███████▋  | 3600/4704 [20:03<05:54,  3.12it/s]

{'loss': 0.025, 'grad_norm': 0.6938477158546448, 'learning_rate': 1.1734693877551021e-05, 'epoch': 0.77}


 79%|███████▊  | 3700/4704 [20:36<05:21,  3.12it/s]

{'loss': 0.0257, 'grad_norm': 0.617882251739502, 'learning_rate': 1.0671768707482993e-05, 'epoch': 0.79}


 81%|████████  | 3800/4704 [21:09<05:01,  3.00it/s]

{'loss': 0.024, 'grad_norm': 0.6341880559921265, 'learning_rate': 9.608843537414966e-06, 'epoch': 0.81}


 83%|████████▎ | 3900/4704 [21:41<04:20,  3.09it/s]

{'loss': 0.0255, 'grad_norm': 0.3389674425125122, 'learning_rate': 8.545918367346939e-06, 'epoch': 0.83}


 85%|████████▌ | 4000/4704 [22:14<03:49,  3.07it/s]

{'loss': 0.026, 'grad_norm': 0.5443024039268494, 'learning_rate': 7.482993197278912e-06, 'epoch': 0.85}


 87%|████████▋ | 4100/4704 [22:47<03:15,  3.10it/s]

{'loss': 0.0253, 'grad_norm': 0.6722580194473267, 'learning_rate': 6.420068027210885e-06, 'epoch': 0.87}


 89%|████████▉ | 4200/4704 [23:19<02:42,  3.10it/s]

{'loss': 0.024, 'grad_norm': 0.48607105016708374, 'learning_rate': 5.357142857142857e-06, 'epoch': 0.89}


 91%|█████████▏| 4300/4704 [23:52<02:17,  2.94it/s]

{'loss': 0.0237, 'grad_norm': 0.00017042089893948287, 'learning_rate': 4.29421768707483e-06, 'epoch': 0.91}


 94%|█████████▎| 4400/4704 [24:25<01:39,  3.06it/s]

{'loss': 0.0249, 'grad_norm': 0.4028944969177246, 'learning_rate': 3.231292517006803e-06, 'epoch': 0.94}


 96%|█████████▌| 4500/4704 [24:57<01:05,  3.10it/s]

{'loss': 0.0244, 'grad_norm': 0.375034362077713, 'learning_rate': 2.1683673469387757e-06, 'epoch': 0.96}


 98%|█████████▊| 4600/4704 [25:30<00:33,  3.11it/s]

{'loss': 0.0243, 'grad_norm': 0.45057255029678345, 'learning_rate': 1.1054421768707483e-06, 'epoch': 0.98}


100%|█████████▉| 4700/4704 [26:03<00:01,  3.08it/s]

{'loss': 0.0225, 'grad_norm': 0.3041726052761078, 'learning_rate': 4.251700680272109e-08, 'epoch': 1.0}


                                                   
100%|██████████| 4704/4704 [28:20<00:00,  3.04it/s]

{'eval_loss': 0.023786062374711037, 'eval_runtime': 134.2979, 'eval_samples_per_second': 140.106, 'eval_steps_per_second': 8.757, 'epoch': 1.0}


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
100%|██████████| 4704/4704 [28:30<00:00,  2.75it/s]

{'train_runtime': 1710.7934, 'train_samples_per_second': 43.994, 'train_steps_per_second': 2.75, 'train_loss': 0.0894479629057808, 'epoch': 1.0}





TrainOutput(global_step=4704, training_loss=0.0894479629057808, metrics={'train_runtime': 1710.7934, 'train_samples_per_second': 43.994, 'train_steps_per_second': 2.75, 'total_flos': 4914872743624704.0, 'train_loss': 0.0894479629057808, 'epoch': 1.0})

In [18]:
prompt = "I remember the first time I grasped the power of natural language processing (NLP). In 9th grade, I was toying around with Google Translate when I began wondering how it works."
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

output = model.generate(
    input_ids=input_ids,
    max_length=256,
    num_return_sequences=1,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.9
)

print(tokenizer.decode(output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


I remember the first time I grasped the power of natural language processing (NLP). In 9th grade, I was toying around with Google Translate when I began wondering how it works. an an, anof,of isofd that, an conducted, turned, took place Google a,,,,a,,,,,,,,,,,,,,,,,,,,,,,, Googlelate,., an a where, was it an about whether to or not.in an inth, inth, at, place I to, that my showed, me the, that I the, that me the I create
