In [1]:
# Import necessary libraries
import os
import re
import nltk
import spacy
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# Download necessary NLTK data.
# 'punkt' serves older NLTK releases; NLTK >= 3.8.2 makes nltk.word_tokenize
# look up 'punkt_tab' instead, so both are requested. On a release that does
# not know an identifier, nltk.download only reports the problem and returns
# False, so asking for both is safe.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Load spaCy model (provides the tokenizer, lemmatizer and stop-word flags
# used by the preprocessing step).
nlp = spacy.load('en_core_web_sm')


ModuleNotFoundError: No module named 'spacy'

In [None]:
# Example text data (replace with actual dataset)
texts = [
    "Once upon a time, there was a brave knight.",
    "In a galaxy far, far away, there was a great adventure."
]

# Save the texts in a file (optional, for demonstration purposes).
# The encoding is pinned so the file is identical on every platform instead of
# depending on the locale's default codec once real (non-ASCII) data is used.
with open('data.txt', 'w', encoding='utf-8') as f:
    # One text per line.
    f.writelines(f"{text}\n" for text in texts)


In [None]:
# Preprocessing function
def preprocess_text(text):
    """Normalise a raw string into space-separated, stop-word-free lemmas.

    Steps:
      1. Drop every character that is neither a word character nor whitespace.
      2. Collapse whitespace runs to a single space and trim the ends.
      3. Run the module-level spaCy pipeline ``nlp`` and keep the lemma of
         every token that is neither a stop word nor pure whitespace.

    Args:
        text: Raw input string.

    Returns:
        The surviving lemmas joined by single spaces; ``''`` when nothing
        is left after cleaning.

    NOTE(review): stop-word removal and lemmatisation discard the function
    words and inflections a generative language model learns fluency from --
    confirm this preprocessing is really intended for fine-tuning data.
    """
    # Strip punctuation BEFORE collapsing whitespace: removing a standalone
    # symbol (e.g. " - ") otherwise leaves a double space behind, which spaCy
    # turns into a whitespace token that would leak into the output.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    if not text:
        return ''
    # Tokenize text
    doc = nlp(text)
    tokens = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_space)
    ]
    return ' '.join(tokens)

# Run every example text through the preprocessing function, keeping order
preprocessed_texts = list(map(preprocess_text, texts))

# Show the cleaned texts
print(preprocessed_texts)


In [None]:
# Load GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# GPT-2 ships without a padding token, so batching with padding=True raises
# "Asking to pad but the tokenizer does not have a padding token".
# Reusing the end-of-sequence token as padding is the usual workaround and
# leaves the vocabulary (and therefore the embedding matrix) unchanged.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Encode the preprocessed texts as one padded batch of PyTorch tensors
inputs = tokenizer(preprocessed_texts, return_tensors='pt', padding=True, truncation=True)

# Split data into training and validation sets (rows of token ids)
train_inputs, val_inputs = train_test_split(inputs.input_ids, test_size=0.2, random_state=42)


In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
)


class CausalLMDataset(torch.utils.data.Dataset):
    """Expose rows of token ids as the dict examples that Trainer expects.

    Trainer's default collator needs mapping-style examples, and the model
    only returns a loss when ``labels`` are supplied, so a bare tensor of
    ``input_ids`` cannot be used as a dataset directly.

    Args:
        input_ids: 2-D array-like of token ids, one padded sequence per row.
        pad_token_id: Id used for padding, or ``None`` when the sequences
            are unpadded. Positions equal to this id are excluded from both
            attention and the loss. When padding reuses the EOS id, genuine
            EOS tokens inside a sequence are excluded as well.
    """

    def __init__(self, input_ids, pad_token_id=None):
        self.input_ids = torch.as_tensor(input_ids)
        self.pad_token_id = pad_token_id

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        ids = self.input_ids[idx]
        if self.pad_token_id is None:
            attention_mask = torch.ones_like(ids)
        else:
            attention_mask = (ids != self.pad_token_id).long()
        # For causal LM fine-tuning the labels are the inputs themselves (the
        # model shifts them internally); -100 is the index the loss ignores.
        labels = ids.clone()
        labels[attention_mask == 0] = -100
        return {
            'input_ids': ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }


# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=CausalLMDataset(train_inputs, tokenizer.pad_token_id),
    eval_dataset=CausalLMDataset(val_inputs, tokenizer.pad_token_id),
)

# Train the model
trainer.train()


In [None]:
# Function to generate text
def generate_text(prompt, max_length=50):
    """Continue ``prompt`` with the module-level GPT-2 ``model``.

    Args:
        prompt: Text the generated sequence starts with.
        max_length: Upper bound on the total number of tokens (prompt plus
            continuation) passed to ``model.generate``.

    Returns:
        The decoded first returned sequence, with special tokens removed.
        Decoding otherwise uses the model's default generation settings.
    """
    # Training leaves the model in train mode (dropout active); switch to
    # eval mode so generation is not perturbed by dropout.
    model.eval()
    # After training the model may live on a GPU; the encoded prompt must be
    # on the same device or generate() fails with a device-mismatch error.
    encoded = tokenizer(prompt, return_tensors='pt').to(model.device)
    outputs = model.generate(
        encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        # GPT-2 has no dedicated pad token; naming EOS explicitly avoids the
        # implicit fallback (and its warning) inside generate().
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Demo: continue a short prompt and display the model's output
prompt = "In a land far away"
generated_text = generate_text(prompt=prompt)
print(generated_text)


In [None]:
# Example evaluation (using BLEU score) with trials
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

# Define a reference text (replace with actual reference text)
reference = ["In a land far away, there was a beautiful princess."]

# Trial: Tokenize the reference texts once; every candidate is scored
# against the same tokenized references.
reference_tokens = [nltk.word_tokenize(ref) for ref in reference]

# Sentence-level BLEU on short texts collapses to ~0 (with warnings) as soon
# as one n-gram order has no match; smoothing keeps the scores comparable.
bleu_smoothing = SmoothingFunction().method1


def score_against_reference(candidate_text):
    """Tokenize ``candidate_text`` and score it against ``reference_tokens``.

    Args:
        candidate_text: Generated text to evaluate.

    Returns:
        A ``(tokens, score)`` tuple: the candidate's word tokens and its
        smoothed sentence-level BLEU score.
    """
    candidate_tokens = nltk.word_tokenize(candidate_text)
    score = sentence_bleu(
        reference_tokens,
        candidate_tokens,
        smoothing_function=bleu_smoothing,
    )
    return candidate_tokens, score


# Generate a text based on a prompt and calculate its BLEU score
generated = generate_text("In a land far away")
generated_tokens, bleu_score = score_against_reference(generated)
print("Reference tokens:", reference_tokens)
print("Generated tokens:", generated_tokens)
print(f"BLEU score: {bleu_score}")

# Trial: Adjust prompt and evaluate again
new_prompt = "In a distant kingdom"
new_generated = generate_text(new_prompt)
new_generated_tokens, new_bleu_score = score_against_reference(new_generated)
print(f"New BLEU score with adjusted prompt: {new_bleu_score}")
