In [1]:
%pip install transformers datasets torch pandas numpy
%pip install transformers[torch]



You should consider upgrading via the '/Users/prasiddhapradhan/Desktop/FrankensteinCPUStarter/frankenstein_env/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
zsh:1: no matches found: transformers[torch]
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

df = pd.read_csv("frankenstein_chunks.csv")

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    481 non-null    object
dtypes: object(1)
memory usage: 3.9+ KB


Unnamed: 0,text
0,﻿The Project Gutenberg eBook of Frankenstein; ...
1,Further corrections by Menno de Leeuw.\n\n\n**...
2,"I am already far north of London, and as I wal..."
3,Its productions and features may be without ex...
4,But supposing all these conjectures to be fals...


STEP 1 CLEAN THE DATA

In [3]:
df = df[df['text'].str.contains("Project Gutenberg") == False]

#Remove empty spaces and reset index
df['text'] = df['text'].str.strip()
df = df.reset_index(drop=True)

#check the cleaned dataset
df.head()

Unnamed: 0,text
0,Further corrections by Menno de Leeuw.\n\n\n**...
1,"I am already far north of London, and as I wal..."
2,Its productions and features may be without ex...
3,But supposing all these conjectures to be fals...
4,You may remember that a\nhistory of all the vo...


STEP 2 CONVERT DATA INTO HUGGING FACE DATASET FORMAT

In [4]:
from datasets import Dataset

#Convert pandas dataframe to hugging face dataset
dataset = Dataset.from_pandas(df)

print(dataset)

Dataset({
    features: ['text'],
    num_rows: 464
})


STEP3: TOKENIZE THE DATA


In [5]:
from transformers import AutoTokenizer

# Load DistilGPT-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

# Set padding token (GPT-2 models don’t have one by default)
tokenizer.pad_token = tokenizer.eos_token

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Apply tokenization
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Check tokenized output
print(tokenized_dataset[0])



Map:   0%|          | 0/464 [00:00<?, ? examples/s]

{'text': 'Further corrections by Menno de Leeuw.\n\n\n*** START OF THE PROJECT GUTENBERG EBOOK FRANKENSTEIN; OR, THE MODERN PROMETHEUS ***\n\n\n\n\nFrankenstein;\n\nor, the Modern Prometheus\n\nby Mary Wollstonecraft (Godwin) Shelley\n\n\n CONTENTS\n\n Letter 1\n Letter 2\n Letter 3\n Letter 4\n Chapter 1\n Chapter 2\n Chapter 3\n Chapter 4\n Chapter 5\n Chapter 6\n Chapter 7\n Chapter 8\n Chapter 9\n Chapter 10\n Chapter 11\n Chapter 12\n Chapter 13\n Chapter 14\n Chapter 15\n Chapter 16\n Chapter 17\n Chapter 18\n Chapter 19\n Chapter 20\n Chapter 21\n Chapter 22\n Chapter 23\n Chapter 24\n\n\n\n\nLetter 1\n\n_To Mrs. Saville, England._\n\n\nSt. Petersburgh, Dec. 11th, 17—.\n\n\nYou will rejoice to hear that no disaster has accompanied the\ncommencement of an enterprise which you have regarded with such evil\nforebodings. I arrived here yesterday, and my first task is to assure\nmy dear sister of my welfare and increasing confidence in the success\nof my undertaking.', 'input_ids': [

STEP4: PREPARE FOR TRAINING


In [6]:
from transformers import DataCollatorForLanguageModeling

#Data collator for MLM(Masked Language Modeling)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

STEP5: LOAD DISTILGPT-2 MODEL

In [7]:
from transformers import AutoModelForCausalLM

#Load pre-trained distilGPT-2 model
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

STEP6: DEFINE TRAINING PARAMETERS


In [8]:
from transformers import TrainingArguments, Trainer

# Define training arguments
training_args = TrainingArguments(
    output_dir="./frankenstein_model",  # Save model here
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,  # Adjust if needed
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    do_train=True,
    do_eval=True
)

# Create trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # Normally a separate validation set
    tokenizer=tokenizer,
    data_collator=data_collator
)



  trainer = Trainer(


STEP7: START TRAINING

In [9]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,3.9972,3.5943
2,3.7262,3.450395
3,3.6074,3.40551


TrainOutput(global_step=348, training_loss=3.7520053468901535, metrics={'train_runtime': 282.6821, 'train_samples_per_second': 4.924, 'train_steps_per_second': 1.231, 'total_flos': 181862538412032.0, 'train_loss': 3.7520053468901535, 'epoch': 3.0})

In [12]:
model.save_pretrained("./frankenstein_model")
tokenizer.save_pretrained("./frankenstein_model")

('./frankenstein_model/tokenizer_config.json',
 './frankenstein_model/special_tokens_map.json',
 './frankenstein_model/vocab.json',
 './frankenstein_model/merges.txt',
 './frankenstein_model/added_tokens.json',
 './frankenstein_model/tokenizer.json')

In [13]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("./frankenstein_model")
tokenizer = AutoTokenizer.from_pretrained("./frankenstein_model")


STEP8: EVALUATE THE MODEL

In [14]:
import math
from transformers import pipeline

# Load trained model
trained_model = AutoModelForCausalLM.from_pretrained("./frankenstein_model")

# Load tokenizer again
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

# Define evaluation function
def compute_perplexity(text):
    inputs = tokenizer(text, return_tensors="pt")
    outputs = trained_model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss.item()
    return math.exp(loss)

# Example evaluation
sample_text = "It was on a dreary night of November that I beheld the accomplishment of my toils."
perplexity = compute_perplexity(sample_text)
print(f"Perplexity: {perplexity}")


Perplexity: 27.87530429372864


STEP9: GENERATE FRANKENSTEIN-STYLE TEXT

In [15]:
generator = pipeline("text-generation", model=trained_model, tokenizer=tokenizer)

# Generate text
prompt = "As the storm raged outside, the creature whispered"
output = generator(prompt, max_length=100, num_return_sequences=1)
print(output[0]['generated_text'])


Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


As the storm raged outside, the creature whispered to me the words of my father during my speech of gratitude.

“Why do you weep? Why do you lose sight of it?
My father exclaimed: ‘We must not have loved you more; we are so dear to you. But let me be with you, the old soul.’

“Then my father breathed as if it had long been so cruel, and if you’re not a stranger
