In [43]:
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import Dataset

In [44]:
# Load the notes table from CSV; its 'content' column is the text that is
# written out to the combined training file further down in this cell.
df = pd.read_csv('processed_data/notes.csv')

# Function to safely convert content to string
def safe_str(x):
    """Convert a single cell value to text suitable for the training corpus.

    Args:
        x: A scalar cell value (string, number, bool, or a missing value).

    Returns:
        str: "" for missing values (anything ``pd.isna`` reports as NA);
        whole-valued numbers without a trailing ".0" (3.0 -> "3");
        every other value via ``str(x)``.
    """
    if pd.isna(x):
        return ""
    # bool is a subclass of int; keep it readable instead of emitting "1"/"0".
    if isinstance(x, bool):
        return str(x)
    # Only whole-valued numbers are formatted as integers. Fractional floats
    # keep their digits (3.7 stays "3.7" rather than being truncated to "3"),
    # and +/-inf no longer raises OverflowError from int().
    if isinstance(x, int) or (isinstance(x, float) and x.is_integer()):
        return str(int(x))
    return str(x)

# Write every note into one plain-text corpus file, each entry followed by
# two newlines so consecutive notes are separated by a blank line.
try:
    with open('processed_data/notes.txt', 'w', encoding='utf-8') as corpus_file:
        note_texts = df['content'].apply(safe_str)
        corpus_file.writelines(text + '\n\n' for text in note_texts)
    print("Successfully created notes.txt")
except Exception as err:
    # Show the failure plus a sample of the column, then re-raise so the
    # cell still stops with the original exception.
    print(f"Error while writing combined file: {err}")
    print("First few rows of 'content' column:")
    print(df['content'].head())
    raise

Successfully created notes.txt


In [45]:
# Device selection with fallback: MPS first, then CUDA, then CPU.
def get_device():
    """Return the torch device to run on.

    MPS is preferred when PyTorch reports it as available, but only after a
    tiny tensor has actually been moved there; a RuntimeError from that probe
    is reported and the search continues with CUDA and finally CPU.

    Returns:
        torch.device: "mps", "cuda", or "cpu".
    """
    mps_usable = False
    if torch.backends.mps.is_available():
        try:
            # Probe: allocating on the device is the real compatibility test.
            torch.zeros(1).to(torch.device("mps"))
            mps_usable = True
        except RuntimeError:
            print("MPS device found but not compatible. Falling back to CPU.")

    if mps_usable:
        return torch.device("mps")
    return torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Resolve the compute device once and report which one was chosen.
device = get_device()
print(f"Using device: {device}")

# Pre-trained GPT-2 weights (moved onto the chosen device) and its tokenizer.
model = GPT2LMHeadModel.from_pretrained('gpt2')
model = model.to(device)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# If the tokenizer has no pad token, register '[PAD]' and resize the model's
# token embeddings to the new vocabulary size.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    print('tokenizer:::')
    print(tokenizer)
    model.resize_token_embeddings(len(tokenizer))

# Keep the model config in sync with whatever pad token the tokenizer uses.
model.config.pad_token_id = tokenizer.pad_token_id

Using device: mps


In [46]:
# Prepare your data
def get_dataset(file_path, tokenizer, block_size=128):
    """Build a ``TextDataset`` from a plain-text file.

    Args:
        file_path: Path of the text file to load.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: ``block_size`` forwarded to ``TextDataset``. Defaults to
            128, the value this notebook previously hard-coded.

    Returns:
        The constructed ``TextDataset``.

    Raises:
        Exception: Any error raised while building the dataset is printed and
            then re-raised unchanged.
    """
    # NOTE(review): TextDataset is reported as deprecated in recent
    # transformers releases in favour of the `datasets` library - confirm
    # against the installed version before relying on it long-term.
    try:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=block_size,
        )
    except Exception as e:
        print(f"Error creating dataset: {e}")
        raise

# Build the training dataset from the combined notes text file written earlier
# in the notebook, using the tokenizer configured above.
train_dataset = get_dataset("processed_data/notes.txt", tokenizer)



In [47]:
# Run configuration: 3 epochs, batch size 4 per device, checkpoints written
# under ./results every 10,000 steps with at most 2 kept.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Batch collation for language modeling with masked-LM masking turned off.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Wire model, arguments, data and collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# Fine-tune; the returned TrainOutput is displayed as the cell result.
trainer.train()

100%|██████████| 12/12 [00:10<00:00,  1.11it/s]

{'train_runtime': 10.8031, 'train_samples_per_second': 4.165, 'train_steps_per_second': 1.111, 'train_loss': 35.99178568522135, 'epoch': 3.0}





TrainOutput(global_step=12, training_loss=35.99178568522135, metrics={'train_runtime': 10.8031, 'train_samples_per_second': 4.165, 'train_steps_per_second': 1.111, 'total_flos': 2939535360000.0, 'train_loss': 35.99178568522135, 'epoch': 3.0})

In [48]:
# Persist the fine-tuned weights and the tokenizer side by side in one
# directory.
fine_tuned_dir = "./llm/fine_tuned_notes_GPT2"
for artifact in (model, tokenizer):
    artifact.save_pretrained(fine_tuned_dir)

def _sample_continuation(model, input_ids, attention_mask, pad_token_id, max_new_tokens):
    """Run one gradient-free sampling pass with this notebook's decoding settings."""
    with torch.no_grad():
        return model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            pad_token_id=pad_token_id,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
        )


def generate_text(prompt, model, tokenizer, device, max_new_tokens=50):
    """Sample a continuation of ``prompt`` from ``model``.

    Generation is attempted on ``device`` first. If that raises a
    RuntimeError, the error is printed and generation is retried on CPU;
    the model is moved back to ``device`` afterwards, even when the CPU
    attempt also fails.

    The model is put in eval mode for the duration of the call (so dropout
    left enabled by training does not perturb sampling) and its previous
    train/eval mode is restored before returning.

    Args:
        prompt: Text to continue.
        model: Model exposing ``generate``, ``to``, ``eval``, ``train`` and a
            ``training`` flag.
        tokenizer: Tokenizer used to encode the prompt and decode the output;
            its ``pad_token_id`` is passed to ``generate``.
        device: Device the model lives on; inputs are moved there.
        max_new_tokens: Maximum number of tokens to generate (default 50).

    Returns:
        str: The decoded first returned sequence, special tokens removed.

    Raises:
        RuntimeError: If generation fails on ``device`` and again on CPU.
    """
    # Tokenizer __call__ is the supported replacement for encode_plus.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)
    pad_token_id = tokenizer.pad_token_id

    was_training = model.training
    model.eval()
    try:
        try:
            output = _sample_continuation(
                model, input_ids, attention_mask, pad_token_id, max_new_tokens
            )
        except RuntimeError as e:
            print(f"Error during generation: {e}")
            print("Falling back to CPU for text generation.")
            model.to("cpu")
            try:
                output = _sample_continuation(
                    model,
                    input_ids.to("cpu"),
                    attention_mask.to("cpu"),
                    pad_token_id,
                    max_new_tokens,
                )
            finally:
                # Always return the model to the caller's device, otherwise a
                # failed fallback would strand it on CPU for later calls.
                model.to(device)
    finally:
        if was_training:
            model.train()

    # Decode and return the generated text
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [54]:
# Example usage: generate one continuation for a fixed prompt and print both
# the prompt and the result.
prompt = "What is the meaning of life?"
generated_text = generate_text(prompt, model, tokenizer, device)
print(f"Prompt: {prompt}")
print(f"Generated text: {generated_text}")

# Generate several responses for the same prompt. generate_text samples
# (do_sample=True), so each call can produce different text.
num_responses = 3
print(f"\nGenerating {num_responses} different responses:")
for i in range(num_responses):
    # Generate and print one at a time so output appears as it is produced.
    generated_text = generate_text(prompt, model, tokenizer, device)
    print(f"Response {i+1}: {generated_text}")

Prompt: What is the meaning of life?
Generated text: What is the meaning of life?A:The meaningofLife:LifeThemeaningofThelife:life(Life):Life(life)lifelifeLifeLifeLion:Lives(L)LifeLine:Line(Line)LineLance:LeLineLe

Generating 3 different responses:
Response 1: What is the meaning of life?is the which is athe meaningofistheisisThe meaningismeaningoflifeis Themeaning oflifeIs themeaningOflifeThemeaningisItisSThe Meaning ofofThethemeaningThesenseTheI,theI
Response 2: What is the meaning of life? the life is, the death of,the death isof,lifeoflifeis,Thelife ofThedeath ofthedeathofthelifeOfTheDeathofTheGodTheTheHeavensofGodGodtheGodIsTheSheaven
Response 3: What is the meaning of life? isthe meaningoflife is.




The way isThewayis thewayIsthewayI'mtheWayIsTheWayIwasTheistheIisTheIwillIWillIshallIhaveIknowIwould
