In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=tokenizer.eos_token_id)

2025-09-26 11:41:34.563407: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758886894.898814      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758886895.004986      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [2]:
text = "My name is"
encoded_input = tokenizer(text, return_tensors='pt')
generated_text = tokenizer.decode(model.generate(encoded_input['input_ids'], max_new_tokens = 9)[0])
generated_text

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


"My name is John. I'm a man of God."

In [3]:
import pandas as pd 
import tiktoken 
tokenizer = tiktoken.get_encoding("gpt2")
df = pd.read_csv("/kaggle/input/spam-data/spam.csv", encoding = "latin1")
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [4]:
df['label'] = df['v1'].map({'ham': 0, 'spam': 1})
df['text'] = df['v2']
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'v1', 'v2'])
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df['label'].value_counts()

label
0    4825
1     747
Name: count, dtype: int64

In [6]:
shuffled_df = df.sample(frac=1, random_state=42).reset_index(drop=True)
train_ratio = int(0.7 * len(shuffled_df)) 
val_ratio = train_ratio + int(0.2 * len(shuffled_df))

train_df = shuffled_df[:train_ratio]
val_df = shuffled_df[train_ratio:val_ratio]
test_df = shuffled_df[val_ratio:]

print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

print(train_df.head())
print(val_df.head())
print(test_df.head())

train_df.to_csv("train.csv", index = False)
val_df.to_csv("val.csv", index = False)
test_df.to_csv("test.csv", index = False) 

Train set size: 3900
Validation set size: 1114
Test set size: 558
   label                                               text
0      0  Funny fact Nobody teaches volcanoes 2 erupt, t...
1      0  I sent my scores to sophas and i had to do sec...
2      1  We know someone who you know that fancies you....
3      0  Only if you promise your getting out as SOON a...
4      1  Congratulations ur awarded either å£500 of CD ...
      label                                               text
3900      1  You have won a Nokia 7250i. This is what you g...
3901      1  Sorry! U can not unsubscribe yet. THE MOB offe...
3902      0          X2  &lt;#&gt; . Are you going to get that
3903      1  network operator. The service is free. For T &...
3904      0  Is there coming friday is leave for pongal?do ...
      label                                               text
5014      0                Yeah why not, is the gang all ready
5015      0             No message..no responce..what happend?
5016   

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
from typing import Tuple # Import Tuple

class SpamDataset(Dataset):
  def __init__(self, csv_file:str, tokenizer:tiktoken.core.Encoding, max_length = None, pad_token_id:int=50256) -> None:
    super().__init__()
    self.csv_data = pd.read_csv(csv_file)
    self.encoded_texts = [tokenizer.encode(text) for text in self.csv_data['text']]
    if max_length is None:
      self.max_length = self._longest_encoded_length()
    else:
      self.max_length = max_length
      self.encoded_texts = [encoded_text[:self.max_length] for encoded_text in self.encoded_texts]
    self.encoded_texts = [encoded_text + [pad_token_id] * (self.max_length - len(encoded_text)) for encoded_text in self.encoded_texts]

  def __len__(self) -> int:
    return len(self.csv_data)

  def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor]:
    encoded = self.encoded_texts[idx]
    label = self.csv_data.iloc[idx]["label"]
    return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long)
        )

  def _longest_encoded_length(self)->int:
    max_length = 0
    for encoded_text in self.encoded_texts:
      encoded_length = len(encoded_text)
      if encoded_length > max_length:
        max_length = encoded_length
    return max_length

In [8]:
train_dataset= SpamDataset(csv_file="/kaggle/working/train.csv",max_length=None,tokenizer=tokenizer)
test_dataset= SpamDataset(csv_file="/kaggle/working/test.csv",max_length=None,tokenizer=tokenizer)
val_dataset=SpamDataset(csv_file="/kaggle/working/val.csv",max_length=None,tokenizer=tokenizer)

In [9]:
from torch.utils.data import DataLoader

num_workers= 0
batch_size=8
train_loader=DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=True
)
test_loader=DataLoader(
    dataset=test_dataset,
    shuffle=True,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False
)
val_loader=DataLoader(
    dataset=val_dataset,
    shuffle=True,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False
)

In [10]:
#Parametreleri donduralım zaten eğitilmiş
for param in model.parameters():
  param_requires_grad = False

In [11]:
import torch.nn as nn
num_class = 2
model.classifier = nn.Linear(in_features=50257, out_features=num_class, bias=False)
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
  (classifier): Linear(in_features=50257, out_features=

In [12]:
def train_model(model, train_dataloader, val_dataloader, optimizer, loss_fn, device, epochs):
    model = model.to(device)

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0

        for text, label in train_dataloader:
            text = text.to(device)
            label = label.to(device)

            optimizer.zero_grad()
            # Forward pass
            outputs = model(text)
            logits = outputs.logits
            classification_logits = model.classifier(logits[:, -1, :])

            # Loss
            batch_loss = loss_fn(classification_logits, label)
            batch_loss.backward()
            optimizer.step()

            train_loss += batch_loss.item()

            # Accuracy
            preds = classification_logits.argmax(dim=1)
            train_correct += (preds == label).sum().item()
            train_total += label.size(0)

        avg_train_loss = train_loss / len(train_dataloader)
        train_accuracy = train_correct / train_total
        print(f"Epoch: {epoch+1}/{epochs} | Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.4f}")

        # Validation
        if val_dataloader is not None:
            model.eval()
            val_loss = 0.0
            val_correct = 0
            val_total = 0
            with torch.no_grad():
                for text, label in val_dataloader:
                    text = text.to(device)
                    label = label.to(device)

                    outputs = model(text)
                    logits = outputs.logits
                    classification_logits = model.classifier(logits[:, -1, :])

                    val_loss += loss_fn(classification_logits, label).item()

                    preds = classification_logits.argmax(dim=1)
                    val_correct += (preds == label).sum().item()
                    val_total += label.size(0)

            avg_val_loss = val_loss / len(val_dataloader)
            val_accuracy = val_correct / val_total
            print(f"Validation Loss: {avg_val_loss:.4f} | Validation Acc: {val_accuracy:.4f}")


In [13]:
train_model(
    model,
    train_loader,
    val_loader,
    optimizer=torch.optim.AdamW(model.parameters(), lr=1e-3),
    loss_fn=nn.CrossEntropyLoss(),
    device='cuda' if torch.cuda.is_available() else 'cpu',
    epochs=10
)


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch: 1/10 | Train Loss: 45.2709 | Train Acc: 0.7970
Validation Loss: 0.3512 | Validation Acc: 0.8959
Epoch: 2/10 | Train Loss: 0.8161 | Train Acc: 0.9289
Validation Loss: 1.8506 | Validation Acc: 0.9749
Epoch: 3/10 | Train Loss: 7.8681 | Train Acc: 0.9559
Validation Loss: 2.7765 | Validation Acc: 0.9740
Epoch: 4/10 | Train Loss: 1.1738 | Train Acc: 0.9772
Validation Loss: 0.4992 | Validation Acc: 0.9794
Epoch: 5/10 | Train Loss: 0.6738 | Train Acc: 0.9792
Validation Loss: 1.1088 | Validation Acc: 0.9811
Epoch: 6/10 | Train Loss: 219.1382 | Train Acc: 0.9720
Validation Loss: 125.9889 | Validation Acc: 0.9749
Epoch: 7/10 | Train Loss: 17.4881 | Train Acc: 0.9833
Validation Loss: 15.9788 | Validation Acc: 0.9776
Epoch: 8/10 | Train Loss: 4.1706 | Train Acc: 0.9861
Validation Loss: 4.9801 | Validation Acc: 0.9856
Epoch: 9/10 | Train Loss: 1.0411 | Train Acc: 0.9926
Validation Loss: 2.3451 | Validation Acc: 0.9811
Epoch: 10/10 | Train Loss: 0.8912 | Train Acc: 0.9928
Validation Loss: 6.98

In [19]:
def classify_review(text, model, tokenizer, device, max_length=None, pad_token_id=50256):
    model.eval()

    # Prepare inputs to the model
    input_ids = tokenizer.encode(text)
    supported_context_length = model.transformer.wte.weight.shape[0]
    # Note: In the book, this was originally written as pos_emb.weight.shape[1] by mistake
    # It didn't break the code but would have caused unnecessary truncation (to 768 instead of 1024)

    # Truncate sequences if they too long
    input_ids = input_ids[:min(max_length, supported_context_length)]

    # Pad sequences to the longest sequence
    input_ids += [pad_token_id] * (max_length - len(input_ids))
    input_tensor = torch.tensor(input_ids, device=device).unsqueeze(0) # add batch dimension

    # Model inference
    with torch.no_grad():
        outputs = model(input_tensor)
        logits = outputs.logits[:, -1, :]  # ModelOutput içinden logits’i alın
    predicted_label = torch.argmax(logits, dim=-1).item()

    # Return the classified result
    return "spam" if predicted_label == 1 else "not spam"

In [21]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
text_1 = (
    "How are you doing ? Are you available at 8?"
)

print(classify_review(
    text_1, model, tokenizer, device, max_length=train_dataset.max_length
))

text_2 = (
    "Hey, just wanted to check if we're still on"
    " for dinner tonight? Let me know!"
)

print(classify_review(
    text_2, model, tokenizer, device, max_length=train_dataset.max_length
))

not spam
not spam
