In [None]:
import pandas as pd
import torch
from transformers import LongformerForMaskedLM, LongformerTokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, IntervalStrategy
from datasets import Dataset
import os

from dotenv_vault import load_dotenv
load_dotenv()

In [None]:
# Read the processed notes table from the path configured in the environment
notes_csv_path = os.getenv('PROCESSED_NOTES_CSV')
df = pd.read_csv(notes_csv_path)

# Function to safely convert content to string
def safe_str(x):
    """Convert one cell of the ``content`` column to text.

    Args:
        x: A scalar cell value (string, number, ``None``/NaN, ...).

    Returns:
        ``""`` for missing values, the integer form for whole numbers
        (``3.0`` -> ``"3"``), and ``str(x)`` for everything else.
    """
    if pd.isna(x):
        return ""
    if isinstance(x, float):
        # Only drop the fractional part when there is none: int(x) would
        # silently truncate 3.7 to 3 and raises OverflowError on +/-inf.
        return str(int(x)) if x.is_integer() else str(x)
    if isinstance(x, int):
        # int() also normalizes bool to 0/1, matching the earlier behaviour.
        return str(int(x))
    return str(x)

# Dump every note into one plain-text file, entries separated by a blank line
try:
    notes_txt_path = os.getenv('PROCESSED_NOTES_TXT')
    with open(notes_txt_path, 'w', encoding='utf-8') as notes_file:
        # Convert the whole column first, then emit each entry plus two newlines
        note_texts = df['content'].apply(safe_str)
        notes_file.writelines(text + '\n\n' for text in note_texts)
    print("Successfully created text file")
except Exception as write_error:
    # Show a sample of the column to help diagnose the failure, then propagate it
    print(f"Error while writing combined file: {write_error}")
    print("First few rows of 'content' column:")
    print(df['content'].head())
    raise

In [None]:
# Device configuration with fallback
# Prefer Apple-silicon MPS (e.g. an M1 chip), then CUDA, and otherwise fall back to the CPU.
def get_device():
    """Choose the torch device to run on.

    MPS is used when it is reported as available and a small probe tensor can
    actually be moved onto it; otherwise CUDA is used when available, and the
    CPU is the last resort.

    Returns:
        torch.device: The selected device.
    """
    mps_usable = False
    if torch.backends.mps.is_available():
        try:
            # Probe allocation: availability alone does not guarantee the backend works
            torch.zeros(1).to(torch.device("mps"))
        except RuntimeError:
            print("MPS device found but not compatible. Falling back to CPU.")
        else:
            mps_usable = True

    if mps_usable:
        return torch.device("mps")

    backend = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.device(backend)

# Pick the compute device once and report it
device = get_device()
print(f"Using device: {device}")

# Pre-trained Longformer checkpoint: masked-LM weights and the matching tokenizer
model_name = "allenai/longformer-base-4096"
model = LongformerForMaskedLM.from_pretrained(model_name)
tokenizer = LongformerTokenizer.from_pretrained(model_name)

# Register a padding token only when the tokenizer has none, and grow the
# embedding matrix so the new vocabulary entry has a row
needs_pad_token = tokenizer.pad_token is None
if needs_pad_token:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Keep the model config in sync with the tokenizer's padding id
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Prepare your data
def get_dataset(file_path, tokenizer, block_size=128):
    """Build a ``TextDataset`` of fixed-length token blocks from a text file.

    Args:
        file_path: Path of the plain-text file to tokenize.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Block length passed to ``TextDataset``. Defaults to 128,
            the value that used to be hard-coded here.

    Returns:
        The constructed ``TextDataset``.

    Raises:
        Exception: Whatever ``TextDataset`` raises; the error is printed and
            then re-raised unchanged.
    """
    # NOTE(review): TextDataset is reported as deprecated in recent
    # transformers releases - confirm against the installed version.
    try:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=block_size,
        )
    except Exception as e:
        print(f"Error creating dataset: {e}")
        raise

# Tokenize the notes once, then hold out part of them for evaluation.
# Evaluating on the very examples used for training would make eval_loss
# (and any checkpoint selection based on it) meaningless.
full_dataset = get_dataset(os.getenv('PROCESSED_NOTES_TXT'), tokenizer)

EVAL_FRACTION = 0.1  # share of examples reserved for evaluation
SPLIT_SEED = 42      # fixed seed so the split is reproducible across runs

if len(full_dataset) < 2:
    # Too little data to split; keep the previous behaviour of sharing one set.
    print("Dataset too small to hold out an evaluation split; evaluating on the training data.")
    train_dataset = eval_dataset = full_dataset
else:
    eval_size = max(1, int(len(full_dataset) * EVAL_FRACTION))
    train_dataset, eval_dataset = torch.utils.data.random_split(
        full_dataset,
        [len(full_dataset) - eval_size, eval_size],
        generator=torch.Generator().manual_seed(SPLIT_SEED),
    )

In [None]:
# Set up training arguments
training_args = TrainingArguments(
    # Output and log locations are plain directory paths read from the
    # environment; they must not be routed through get_dataset().
    output_dir=os.getenv('RESULTS'),
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=1,  # Adjust based on your GPU memory
    # With load_best_model_at_end, save_steps has to stay a round multiple of eval_steps
    save_steps=10_000,
    save_total_limit=2,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=os.getenv('LOGS'),
    logging_strategy="steps",
    logging_steps=500,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=1000,
    # Reload the checkpoint with the lowest evaluation loss once training ends
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)

# Create Trainer instance
# LongformerForMaskedLM is trained with the masked-language-modeling objective,
# so the collator has to mask tokens (mlm=True). With mlm=False the labels are
# an unmasked copy of the inputs, which a bidirectional model can predict by
# simply copying them, so the loss carries no training signal.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Train the model
trainer.train()

In [None]:
# Persist the fine-tuned weights and the tokenizer into the same directory
fine_tuned_dir = os.getenv('FINE_TUNED_FILE')
for artifact in (model, tokenizer):
    artifact.save_pretrained(fine_tuned_dir)

def generate_text(prompt, model, tokenizer, device, max_new_tokens=99):
    """Sample a continuation of ``prompt`` from ``model``.

    Generation is first attempted with the inputs on ``device``. If that raises
    ``RuntimeError``, the model and inputs are moved to the CPU for a second
    attempt, and the model is moved back to ``device`` afterwards, even when
    the CPU attempt fails as well.

    Args:
        prompt: Text to condition the generation on.
        model: Object exposing ``generate`` and ``to``.
        tokenizer: Object exposing ``encode_plus``, ``decode`` and ``pad_token_id``.
        device: Device the encoded inputs are moved to for the first attempt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        str: The first generated sequence, decoded with special tokens skipped.

    Raises:
        RuntimeError: If generation fails on the CPU fallback too.
    """
    # Encode the input
    inputs = tokenizer.encode_plus(
        prompt,
        add_special_tokens=True,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # One set of sampling options shared by the primary and the fallback attempt
    generation_kwargs = dict(
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
    )

    def _sample(ids, mask):
        # Run generation without tracking gradients
        with torch.no_grad():
            return model.generate(
                input_ids=ids,
                attention_mask=mask,
                **generation_kwargs,
            )

    try:
        output = _sample(input_ids, attention_mask)
    except RuntimeError as e:
        print(f"Error during generation: {e}")
        print("Falling back to CPU for text generation.")
        model.to("cpu")
        try:
            output = _sample(input_ids.to("cpu"), attention_mask.to("cpu"))
        finally:
            # Always move the model back, so a failed fallback does not leave it on the CPU
            model.to(device)

    # Decode and return the generated text
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Demo: sample one continuation for a fixed prompt and show both
prompt = "How are computer science and physics similar?"
generated_text = generate_text(
    prompt=prompt,
    model=model,
    tokenizer=tokenizer,
    device=device,
)
print(f"Prompt: {prompt}")
print(f"Generated text: {generated_text}")

# Generate multiple responses
# num_responses = 3
# print(f"\nGenerating {num_responses} different responses:")
# for i in range(num_responses):
#     generated_text = generate_text(prompt, model, tokenizer, device)
#     print(f"Response {i+1}: {generated_text}")