In [1]:
%pip install torch transformers datasets

Note: you may need to restart the kernel to use updated packages.




In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GPT2LMHeadModel, GPT2Tokenizer, GemmaTokenizer, BitsAndBytesConfig
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset, concatenate_datasets
from torch.optim import AdamW
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler
from huggingface_hub import login
import sentencepiece
from sklearn.preprocessing import StandardScaler
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import torch.optim as optim
from accelerate import infer_auto_device_map

Model v1.0

In [None]:
# Load the Wikitext dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")


In [None]:
# Convert custom lines into a dataset
custom_dataset = Dataset.from_dict({"text": custom_lines})

# Load the model and tokenizer
model = GPT2LMHeadModel.from_pretrained("distilgpt2")
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")

# Assign a padding token if not already present
tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as the pad_token

In [None]:
# Tokenize function
def tokenize_function(examples):
    tokenized_inputs = tokenizer(
        examples["text"], 
        truncation=True, 
        padding="max_length", 
        max_length=512
    )
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()  # Set the labels as input_ids
    return tokenized_inputs

# Tokenize Wikitext dataset
tokenized_wikitext = dataset.map(tokenize_function, batched=True)

# Tokenize custom dataset
custom_data = [{"text": line} for line in custom_lines]  # Convert your custom lines into a dictionary list
custom_dataset = Dataset.from_list(custom_data)  # Create a dataset from custom lines
tokenized_custom = custom_dataset.map(tokenize_function, batched=True)

# Combine the datasets using concatenate_datasets
combined_train_dataset = concatenate_datasets([tokenized_wikitext["train"], tokenized_custom])

# Convert to PyTorch tensors
combined_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [None]:
# Create DataLoader
def collate_fn(batch):
    return tokenizer.pad(batch, padding=True, return_tensors="pt")

train_dataloader = DataLoader(
    combined_train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=1,  # Disable multiprocessing to debug
    pin_memory=True
)

In [None]:
# Move the model to GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)


In [None]:
# Set the model to training mode
model.train()

# Training loop
epochs = 3
scaler = GradScaler()  # Initialize the scaler for mixed precision

# Set gradient accumulation steps (adjust to simulate larger batches)
accumulation_steps = 16  # Simulates larger batch size

for epoch in range(epochs):
    loop = tqdm(train_dataloader, leave=True)

    optimizer.zero_grad()  # Reset the gradients before starting

    for step, batch in enumerate(loop):
        inputs = {key: val.to(device) for key, val in batch.items()}

        with torch.cuda.amp.autocast():  # Enable mixed precision
            outputs = model(**inputs)
            loss = outputs.loss / accumulation_steps  # Scale loss for accumulation

        scaler.scale(loss).backward()  # Backpropagate loss

        if (step + 1) % accumulation_steps == 0:
            scaler.step(optimizer)  # Update weights
            scaler.update()
            optimizer.zero_grad()  # Reset gradients

print("Training complete!")

  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 2296/2296 [17:45<00:00,  2.15it/s]
100%|██████████| 2296/2296 [17:43<00:00,  2.16it/s]
100%|██████████| 2296/2296 [20:08<00:00,  1.90it/s]

Training complete!





In [None]:
#OLD
# Load the dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

model = GPT2LMHeadModel.from_pretrained("distilgpt2")
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")

# Assign a padding token if not already present
tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as the pad_token

# Tokenize the input text and set up labels
def tokenize_function(examples):
    tokenized_inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()  # Set the labels as input_ids
    return tokenized_inputs

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Convert dataset to PyTorch tensors
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Create DataLoader
train_dataset = tokenized_datasets["train"]

def collate_fn(batch):
    return tokenizer.pad(batch, padding=True, return_tensors="pt")

train_dataloader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=1,  # Disable multiprocessing to debug
    pin_memory=True
)

# Load the model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Move the model to GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Set the model to training mode
model.train()

# Training loop
epochs = 3
scaler = GradScaler()  # Initialize the scaler for mixed precision

# Set gradient accumulation steps (adjust to simulate larger batches)
accumulation_steps = 4  # Simulates larger batch size

for epoch in range(epochs):
    loop = tqdm(train_dataloader, leave=True)

    optimizer.zero_grad()  # Reset the gradients before starting

    for step, batch in enumerate(loop):
        inputs = {key: val.to(device) for key, val in batch.items()}

        with torch.cuda.amp.autocast():  # Enable mixed precision
            outputs = model(**inputs)
            loss = outputs.loss / accumulation_steps  # Scale loss for accumulation

        scaler.scale(loss).backward()  # Backpropagate loss

        if (step + 1) % accumulation_steps == 0:
            scaler.step(optimizer)  # Update weights
            scaler.update()
            optimizer.zero_grad()  # Reset gradients

print("Training complete!")


  scaler = GradScaler()  # Initialize the scaler for mixed precision
  with torch.cuda.amp.autocast():  # Enable mixed precision
  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 2295/2295 [47:35<00:00,  1.24s/it]
100%|██████████| 2295/2295 [42:19<00:00,  1.11s/it]
100%|██████████| 2295/2295 [42:27<00:00,  1.11s/it]

Training complete!





In [None]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# Load the model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Set pad token to eos token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Move model to the correct device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Prepare input text and move to device
input_text = "Tell me about tesla"
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)

# Create attention mask to differentiate between padding and actual data
attention_mask = torch.ones(inputs.shape, device=device)

# Generate text with repetition penalty
with torch.no_grad():
    outputs = model.generate(
        inputs,
        attention_mask=attention_mask,
        max_length=150,  # Increased length to avoid truncation
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        temperature=0.8,  # Slightly adjusted temperature
        top_k=50,         # Limit next token choices to top-k
        top_p=0.9,        # Use nucleus sampling
        repetition_penalty=1.2  # Penalty to discourage repetition
    )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(generated_text)

Tell me about tesla.
I'm not sure if you know what I mean, but it's a very good name for the game and one that is really fun to play with friends or just have some time together in your own home (or maybe even on vacation). It has an interesting twist where each character can be played as either of two different characters who are fighting against their respective enemies at once! The story starts off pretty simple: You're playing this guy named TESLA from "The Legend of Zelda" series which was released back when Nintendo launched its first console called NX . He had been working out his new job after he got sick so decided to go into hiding because there were no jobs available anymore due all sorts people wanted


Model v2.0

In [3]:
# Define your custom dataset (replace with your own sentences)
custom_lines = [
    "Subject: Proposal for a Strategic Partnership Opportunity - Dear [Recipient's Name], I hope this email finds you well. My name is [Your Name], and I am [Your Position] at [Your Company]. I am reaching out to explore a potential partnership where we can combine our strengths for mutual growth. Let me know a suitable time to discuss further. Warm regards, [Your Name].",
    "Subject: Let's Collaborate for a Mutual Marketing Success - Dear [Recipient's Name], I am [Your Name] from [Your Company], and I would like to propose a co-marketing initiative to enhance visibility for both our brands. Our combined efforts can deliver great value to our audience. Looking forward to your response. Best regards, [Your Name].",
    "Subject: Proposal for Collaborative Product Development - Dear [Recipient's Name], At [Your Company], we admire your work in [specific area]. We see an opportunity to collaborate on developing a product that blends our expertise for market success. Please let me know a convenient time to discuss this further. Kind regards, [Your Name].",
    "Subject: Partnering for a Greener Future - Dear [Recipient's Name], Sustainability is integral to [Your Company], and we see [Recipient's Company] as a perfect partner to amplify our efforts. Together, we can tackle [specific issue]. I would love to discuss our shared goals. Warm regards, [Your Name].",
    "Subject: Let's Innovate Together - Dear [Recipient's Name], Your advancements in [specific technology] inspire us, and we believe a partnership between [Your Company] and [Recipient's Company] could lead to groundbreaking innovations. Let’s connect to discuss this exciting opportunity. Best regards, [Your Name].",
    "Subject: Exploring a Cross-Promotion Partnership - Dear [Recipient's Name], I am reaching out to propose a cross-promotion opportunity between [Your Company] and [Recipient's Company]. By collaborating, we can engage broader audiences and drive mutual growth. Let’s discuss the possibilities. Kind regards, [Your Name].",
    "Subject: Partnership Opportunity for Regional Expansion - Dear [Recipient's Name], As [Your Company] plans to expand in [specific region], we believe [Recipient's Company] would be an ideal partner given your strong presence in the area. Let’s discuss this exciting opportunity. Best regards, [Your Name].",
    "Subject: Let’s Host a Joint Event! - Dear [Recipient's Name], I am [Your Name], [Your Position] at [Your Company]. I propose a partnership to host a joint event or webinar showcasing our expertise in [specific topic]. Together, we can create high-value content. Looking forward to your thoughts. Warm regards, [Your Name].",
    "Subject: Partnering to Optimize Supply Chain - Dear [Recipient's Name], I am writing to discuss a potential partnership with [Recipient's Company] to streamline and optimize our supply chain operations for mutual benefits. Could we meet to explore this? Kind regards, [Your Name].",
    "Subject: Join Us in Shaping the Future of [Industry/Field] - Dear [Recipient's Name], [Your Company] is embarking on an R&D initiative in [specific area] and would love to collaborate with [Recipient's Company] to achieve groundbreaking advancements. Please let me know your availability. Best regards, [Your Name]."
]

# Convert to Hugging Face Dataset
custom_data = [{"text": line} for line in custom_lines]
custom_dataset = Dataset.from_list(custom_data)

In [4]:
login("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")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\Lab\.cache\huggingface\token
Login successful


In [5]:
# Load the tokenizer for GeMMA-2B
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")

def tokenize_function(examples):
    # Tokenize the examples
    tokenized_inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    
    # Convert input_ids to tensor (if not already) and clone for labels
    tokenized_inputs["input_ids"] = torch.tensor(tokenized_inputs["input_ids"])
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].clone()  # Ensure labels are set correctly
    
    return tokenized_inputs


# Apply tokenization to the dataset
tokenized_custom_dataset = custom_dataset.map(tokenize_function, batched=True)
tokenized_custom_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [6]:
# Load the GeMMA-2B model
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b")

# Move model to GPU/TPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# DataLoader
train_dataloader = DataLoader(
    tokenized_custom_dataset,
    batch_size=2,  # Adjust based on GPU memory
    shuffle=True
)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Mixed precision setup
scaler = GradScaler()

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Initialize necessary components
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Ensure model is on the correct device
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)  # Or any other optimizer you're using
scaler = GradScaler()  # For mixed precision training

# Assuming train_dataloader is already set up (e.g., using DataLoader from torch.utils.data)

epochs = 3  # Adjust as needed
model.train()  # Set the model in training mode

accumulation_steps = 16  # Try increasing the number of accumulation steps
for epoch in range(epochs):
    loop = tqdm(train_dataloader, leave=True)
    optimizer.zero_grad()

    for step, batch in enumerate(loop):
        inputs = {key: val.to(device) for key, val in batch.items()}

        with torch.cuda.amp.autocast():  # Use mixed precision
            outputs = model(**inputs)
            loss = outputs['loss'] if 'loss' in outputs else compute_loss(outputs, inputs)

        # Backward pass
        scaler.scale(loss).backward()

        # Perform optimizer step after accumulation_steps
        if (step + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        loop.set_description(f"Epoch {epoch + 1}")
        loop.set_postfix(loss=loss.item())

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Epoch 1: 100%|██████████| 5/5 [02:46<00:00, 33.20s/it, loss=11.7]
  0%|          | 0/5 [00:00<?, ?it/s]

: 

In [None]:
model.save_pretrained("fine_tuned_gemma_2b")
tokenizer.save_pretrained("fine_tuned_gemma_2b")


('fine_tuned_gemma_2b\\tokenizer_config.json',
 'fine_tuned_gemma_2b\\special_tokens_map.json',
 'fine_tuned_gemma_2b\\tokenizer.model',
 'fine_tuned_gemma_2b\\added_tokens.json',
 'fine_tuned_gemma_2b\\tokenizer.json')

In [None]:
# Specify your model's directory or name
model_name = "fine_tuned_gemma_2b"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Move the model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): GemmaRMSNorm((2048,), eps=1e-

In [None]:
def gen_tex(userinput):
    # Tokenize the input text with truncation, padding, and max_length handling
    tokenized_input = tokenizer(
        userinput,
        return_tensors="pt",  # Output PyTorch tensors
        truncation=True,
        padding=True,
        max_length=512
    )

    # Move input_ids and attention_mask to the same device as the model
    input_ids = tokenized_input['input_ids'].to(device)
    attention_mask = tokenized_input['attention_mask'].to(device)

    # Generate text using the model
    output = model.generate(
        input_ids=input_ids,               # Provide input_ids
        attention_mask=attention_mask,     # Optional but useful for padded sequences
        max_length=1000,                    # Adjust max_length for the desired output
        num_return_sequences=1,            # Generate one sequence
        no_repeat_ngram_size=2,            # Avoid repetition
        temperature=0.7,                   # Add randomness
        top_k=50,                          # Top-k sampling
        top_p=0.95,                        # Nucleus sampling
        do_sample=True,                    # Enable sampling
        early_stopping=True                # Stop if confident
    )

    # Decode the generated tokens into text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    return generated_text

In [None]:
# User input handling
userinput = input("Enter your text: ").strip()

# Generate and print the output
print(gen_tex(userinput))

Give an example of a mail to Tesla for partnership.

The data in the table below are the amounts of crude oil imported into the United States from the Organization of Petroleum Exporting Countries (OPEC) for the years 1995–2015. (The years are given in order, with $t=5$ corresponding to 2896.)

<strong>a.</strong> Plot the data, letting $ t $ be the year going from $1$ to $30$ starting with the $ 5 $ in $2985 $. Use the window $ x $ from 0 to above $50 $ and $ y $from $00 3 $ above zero to about $40 $. <strong>b. </strong>Find a quartic function that models these data. Plot both the function and the points on the same axes. How well does the quaric model from part a fit the given data? 

$x$ is the number of years after $905$, $y$is the amount of import $(in millions of barrels per day), where $x = 4$ corresponds to the start of 910.


Model v3.0

In [10]:
login("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")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\Lab\.cache\huggingface\token
Login successful


In [11]:
tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/Sheared-LLaMA-1.3B")
model2 = AutoModelForCausalLM.from_pretrained("princeton-nlp/Sheared-LLaMA-1.3B")

In [13]:
# Define your custom dataset (replace with your own sentences)
data = [
    "Subject: Proposal for a Strategic Partnership Opportunity - Dear [Recipient's Name], I hope this email finds you well. My name is [Your Name], and I am [Your Position] at [Your Company]. I am reaching out to explore a potential partnership where we can combine our strengths for mutual growth. Let me know a suitable time to discuss further. Warm regards, [Your Name].",
    "Subject: Let's Collaborate for a Mutual Marketing Success - Dear [Recipient's Name], I am [Your Name] from [Your Company], and I would like to propose a co-marketing initiative to enhance visibility for both our brands. Our combined efforts can deliver great value to our audience. Looking forward to your response. Best regards, [Your Name].",
    "Subject: Proposal for Collaborative Product Development - Dear [Recipient's Name], At [Your Company], we admire your work in [specific area]. We see an opportunity to collaborate on developing a product that blends our expertise for market success. Please let me know a convenient time to discuss this further. Kind regards, [Your Name].",
    "Subject: Partnering for a Greener Future - Dear [Recipient's Name], Sustainability is integral to [Your Company], and we see [Recipient's Company] as a perfect partner to amplify our efforts. Together, we can tackle [specific issue]. I would love to discuss our shared goals. Warm regards, [Your Name].",
    "Subject: Let's Innovate Together - Dear [Recipient's Name], Your advancements in [specific technology] inspire us, and we believe a partnership between [Your Company] and [Recipient's Company] could lead to groundbreaking innovations. Let’s connect to discuss this exciting opportunity. Best regards, [Your Name].",
    "Subject: Exploring a Cross-Promotion Partnership - Dear [Recipient's Name], I am reaching out to propose a cross-promotion opportunity between [Your Company] and [Recipient's Company]. By collaborating, we can engage broader audiences and drive mutual growth. Let’s discuss the possibilities. Kind regards, [Your Name].",
    "Subject: Partnership Opportunity for Regional Expansion - Dear [Recipient's Name], As [Your Company] plans to expand in [specific region], we believe [Recipient's Company] would be an ideal partner given your strong presence in the area. Let’s discuss this exciting opportunity. Best regards, [Your Name].",
    "Subject: Let’s Host a Joint Event! - Dear [Recipient's Name], I am [Your Name], [Your Position] at [Your Company]. I propose a partnership to host a joint event or webinar showcasing our expertise in [specific topic]. Together, we can create high-value content. Looking forward to your thoughts. Warm regards, [Your Name].",
    "Subject: Partnering to Optimize Supply Chain - Dear [Recipient's Name], I am writing to discuss a potential partnership with [Recipient's Company] to streamline and optimize our supply chain operations for mutual benefits. Could we meet to explore this? Kind regards, [Your Name].",
    "Subject: Join Us in Shaping the Future of [Industry/Field] - Dear [Recipient's Name], [Your Company] is embarking on an R&D initiative in [specific area] and would love to collaborate with [Recipient's Company] to achieve groundbreaking advancements. Please let me know your availability. Best regards, [Your Name]."
]

In [14]:
class CustomDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

In [15]:
# Assign eos_token as pad_token if pad_token is not defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Now tokenize the data with padding
inputs = tokenizer(data, return_tensors="pt", padding=True, truncation=True)
dataset2 = CustomDataset(inputs)
dataloader2 = DataLoader(dataset2, batch_size=1, shuffle=True)
optimizer2 = AdamW(model2.parameters(), lr=5e-5)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [16]:
epochs = 3
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model2.to(device)
scaler = GradScaler()
model2.train()
model2.gradient_checkpointing_enable()  # Enable gradient checkpointing

accumulation_steps = 8  # Gradient accumulation
for epoch in range(epochs):
    loop = tqdm(dataloader2, leave=True)
    optimizer2.zero_grad()
    for step, batch in enumerate(loop):
        batch = {k: v.to(device) for k, v in batch.items()}
        with autocast():  # Mixed precision training
            outputs = model2(**batch, labels=batch["input_ids"])
            loss2 = outputs.loss
            loss2 = loss2 / accumulation_steps  # Normalize loss for gradient accumulation
            scaler.scale(loss2).backward()

        if (step + 1) % accumulation_steps == 0 or step == len(dataloader2) - 1:
            scaler.step(optimizer2)
            scaler.update()
            optimizer2.zero_grad()

        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss2.item())

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
Epoch 0: 100%|██████████| 10/10 [00:15<00:00,  1.56s/it, loss=0.362]
Epoch 1: 100%|██████████| 10/10 [00:20<00:00,  2.00s/it, loss=0.105]
Epoch 2: 100%|██████████| 10/10 [00:20<00:00,  2.00s/it, loss=0.0405]


In [17]:
model2.save_pretrained("./fine-tuned-1.3b")
tokenizer.save_pretrained("./fine-tuned-1.3b")

('./fine-tuned-1.3b\\tokenizer_config.json',
 './fine-tuned-1.3b\\special_tokens_map.json',
 './fine-tuned-1.3b\\tokenizer.model',
 './fine-tuned-1.3b\\added_tokens.json',
 './fine-tuned-1.3b\\tokenizer.json')

In [18]:
# Specify your model's directory or name
model_name = "fine-tuned-1.3b"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Move the model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-23): 24 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (down_proj): Linear(in_features=5504, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
 

In [63]:
def gen_tex(userinput):
    # Tokenize the input text with truncation, padding, and max_length handling
    tokenized_input = tokenizer(
        userinput,
        return_tensors="pt",  # Output PyTorch tensors
        truncation=True,
        padding=True,
        max_length=512  # Limit to a reasonable max_length to prevent overgeneration
    )

    # Move input_ids and attention_mask to the same device as the model
    input_ids = tokenized_input['input_ids'].to(device)
    attention_mask = tokenized_input['attention_mask'].to(device)

    # Generate text using the model
    output = model.generate(
        input_ids=input_ids,               # Provide input_ids
        attention_mask=attention_mask,     # Optional but useful for padded sequences
        max_length=500,                    # Increase max_length for a longer output
        num_return_sequences=1,            # Generate one sequence
        no_repeat_ngram_size=2,            # Avoid repetition of phrases
        temperature=0.7,                   # Control randomness in the output
        top_k=50,                          # Top-k sampling for diverse output
        top_p=0.95,                        # Nucleus sampling to keep quality
        do_sample=True,                    # Enable sampling for more variety
        early_stopping=True                # Stop early when confident
    )

    # Decode the generated tokens into text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Prevent the model from repeating the user input at the start of the generated text
    if generated_text.lower().startswith(userinput.lower()):
        # Strip the user input from the result to avoid repetition
        generated_text = generated_text[len(userinput):].strip()

    return generated_text


In [66]:
# User input handling
userinput = input("Enter your text: ").strip()

# Generate and print the output
print(gen_tex(userinput))

.
