In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
# Report whether PyTorch can see a CUDA device and, if so, the name of device 0.
cuda_ok = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_ok else "None"
print("GPU Available:", cuda_ok)
print("GPU Name:", gpu_name)


In [None]:
!pip install torch transformers datasets accelerate peft


In [None]:
!pip install -q torch transformers datasets accelerate peft bitsandbytes


In [None]:
!pip install -q torch transformers datasets accelerate peft bitsandbytes


In [None]:
import torch
from transformers import GPT2Config, AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType
from torch.utils.data import DataLoader
import torch.optim as optim


# Prefer the GPU when PyTorch reports CUDA as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


In [None]:
import torch
import torch.optim as optim
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model


In [None]:
# Compute device as a plain string: "cuda" when available, "cpu" otherwise.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Hyper-parameters describing a small transformer.
# NOTE(review): no other cell visible in this notebook reads `config` — confirm it is still needed.
config = dict(
    hidden_size=512,
    num_attention_heads=8,
    num_hidden_layers=6,
    intermediate_size=2048,
    vocab_size=50257,
    max_position_embeddings=1024,
)


In [None]:

# Drop PyTorch's cached CUDA allocations before loading the model.
torch.cuda.empty_cache()

# Pretrained GPT-2 tokenizer and language model; the model is moved to `device`.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
model = model.to(device)

# Total parameter count, reported in millions.
param_count_millions = sum(p.numel() for p in model.parameters()) / 1e6
print(f"Model Parameters: {param_count_millions:.2f}M")


In [None]:
import os
import torch


# CUDA debugging switches, set as process environment variables.
# NOTE(review): earlier cells already moved a model to the device, so CUDA may be
# initialized before these are set — confirm they still take effect at this point.
os.environ.update(
    {
        "CUDA_LAUNCH_BLOCKING": "1",
        "TORCH_USE_CUDA_DSA": "1",
    }
)

# Drop PyTorch's cached CUDA allocations.
torch.cuda.empty_cache()


In [None]:
# Select the compute device as a torch.device: CUDA when available, CPU otherwise.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Pretrained GPT-2 tokenizer.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Pretrained GPT-2 language model: construct it first, then move it to `device`.
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)


In [None]:
# Smoke test: tokenize one sentence and run a single forward pass without gradients.
sample_text = "Hello, this is a test."
encoded_input = tokenizer(sample_text, return_tensors="pt")
encoded_input = encoded_input.to(device)

input_shape = encoded_input["input_ids"].shape
print(f"Input Shape: {input_shape}")

with torch.no_grad():
    output = model(**encoded_input)
# Only reached when the forward pass above did not raise.
print("Model Output Verified ✅")


In [None]:
# Reuse the end-of-sequence token as the padding token so the tokenizer can pad batches.
tokenizer.pad_token = tokenizer.eos_token  


In [None]:
from transformers import AutoTokenizer

# GPT-2 tokenizer, with EOS reused as the padding token (needed for padding=True below).
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Tiny in-memory training corpus.
train_texts = [
    "Hello, how are you?",
    "This is a test sentence.",
    "Training Kine-3M is fun!",
]

# Tokenize the whole corpus at once; padding/truncation make the result rectangular
# so it can be returned as PyTorch tensors.
tokenized_train = tokenizer(
    train_texts,
    return_tensors="pt",
    padding=True,
    truncation=True,
)

print("✅ Tokenization successful! Ready to train.")


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
from torch.utils.data import DataLoader

# Compute device as a plain string: "cuda" when PyTorch reports it available, else "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


In [None]:
# Make sure the existing tokenizer uses EOS as its padding token.
tokenizer.pad_token = tokenizer.eos_token

# Start from fresh pretrained GPT-2 weights and place the model on `device`.
model = AutoModelForCausalLM.from_pretrained("gpt2")
model = model.to(device)


In [None]:
# Convert tokenized dataset to PyTorch format
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset that turns a tokenized batch into causal-LM training examples.

    Args:
        tokenized_texts: Mapping with "input_ids" and "attention_mask" entries of
            equal length. Each entry may be a tensor or a (nested) list of ints.
    """

    def __init__(self, tokenized_texts):
        self.input_ids = tokenized_texts["input_ids"]
        self.attention_mask = tokenized_texts["attention_mask"]

    def __len__(self):
        """Return the number of examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of independent int64 tensors.

        ``labels`` is a copy of ``input_ids`` in which every position whose
        attention mask is 0 is replaced by -100, so padding does not contribute
        to the language-modeling loss.
        """
        # as_tensor + clone works for both list and tensor inputs and avoids the
        # copy-construct UserWarning that torch.tensor() emits for existing tensors.
        input_ids = torch.as_tensor(self.input_ids[idx], dtype=torch.long).clone()
        attention_mask = torch.as_tensor(self.attention_mask[idx], dtype=torch.long).clone()

        # Mask by attention_mask rather than by token id: the pad token may be the
        # same id as a real token (e.g. when EOS is reused for padding).
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

# Wrap the tokenized corpus in a dataset and serve it in shuffled batches of 8.
train_dataset = TextDataset(tokenized_texts=tokenized_train)
train_dataloader = DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=8,
)


In [None]:
from torch.optim import AdamW


# AdamW over all model parameters with a fixed learning rate.
learning_rate = 5e-5
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Gradient scaler used together with autocast in the training loop.
scaler = torch.amp.GradScaler()


In [None]:

class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset that turns a tokenized batch into causal-LM training examples.

    Args:
        tokenized_texts: Mapping with tensor-valued "input_ids" and
            "attention_mask" entries of equal length.
    """

    def __init__(self, tokenized_texts):
        self.input_ids = tokenized_texts["input_ids"]
        self.attention_mask = tokenized_texts["attention_mask"]

    def __len__(self):
        """Return the number of examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of detached tensor copies.

        ``labels`` is a copy of ``input_ids`` in which every position whose
        attention mask is 0 is replaced by -100, so padding does not contribute
        to the language-modeling loss.
        """
        input_ids = self.input_ids[idx].clone().detach()
        attention_mask = self.attention_mask[idx].clone().detach()

        # Mask by attention_mask rather than by token id: the pad token may be the
        # same id as a real token (e.g. when EOS is reused for padding).
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

# Rebuild the dataset and its shuffled, batch-size-8 loader from the tokenized corpus.
train_dataset = TextDataset(tokenized_texts=tokenized_train)
train_dataloader = DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=8,
)


In [None]:
# Mixed-precision fine-tuning loop with gradient accumulation.
epochs = 3
gradient_accumulation_steps = 4  # Adjust based on memory

# autocast needs the device *type*; `device` may be a str or a torch.device.
device_type = torch.device(device).type
use_amp = device_type == "cuda"  # mixed precision is only enabled on the GPU
num_batches = len(train_dataloader)

for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    # Gradients are cleared once per accumulation group (here and after each
    # optimizer step), not once per batch — otherwise nothing would accumulate.
    optimizer.zero_grad()

    for batch_idx, batch in enumerate(train_dataloader):
        inputs = {key: val.to(device) for key, val in batch.items()}

        # Size of the accumulation group this batch belongs to; the last group of
        # an epoch can be shorter than gradient_accumulation_steps.
        group_start = (batch_idx // gradient_accumulation_steps) * gradient_accumulation_steps
        group_size = min(gradient_accumulation_steps, num_batches - group_start)

        with torch.amp.autocast(device_type=device_type, enabled=use_amp):
            outputs = model(**inputs)
            # Average over the group so the accumulated gradient is a mean.
            loss = outputs.loss / group_size

        scaler.scale(loss).backward()

        is_group_end = (batch_idx + 1) % gradient_accumulation_steps == 0
        is_last_batch = (batch_idx + 1) == num_batches
        if is_group_end or is_last_batch:
            # Also step on the final batch so a trailing partial group is not dropped.
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        # Track the unscaled loss so the reported average is the true per-batch loss.
        total_loss += outputs.loss.item()

    # max(..., 1) avoids a ZeroDivisionError when the dataloader is empty.
    avg_loss = total_loss / max(num_batches, 1)
    print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f}")

print("✅ Training Complete!")


In [None]:
# Write the model weights/config and the tokenizer files into one directory.
save_dir = "kine-3m-model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)
print("✅ Model saved successfully!")


In [None]:
from transformers import pipeline

# Build a text-generation pipeline from the saved checkpoint directory.
generator = pipeline(
    task="text-generation",
    model="kine-3m-model",
    tokenizer="kine-3m-model",
)

# Sample one continuation of the prompt and print its text.
prompt = "Once upon a time"
generated_text = generator(prompt, max_length=50, do_sample=True)
first_result = generated_text[0]
print(first_result["generated_text"])


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Read the model and tokenizer back from the directory they were saved to.
checkpoint_dir = "kine-3m-model"
model = AutoModelForCausalLM.from_pretrained(checkpoint_dir)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

print("✅ Model loaded successfully!")


In [None]:
import os
# Show which files were written to the saved-model directory.
saved_files = os.listdir("/kaggle/working/kine-3m-model")
print(saved_files)


In [None]:
import shutil
# Zip the saved-model directory; as the cell's last expression, the archive path is displayed.
shutil.make_archive(
    base_name="/kaggle/working/kine-3m-model",
    format="zip",
    root_dir="/kaggle/working/kine-3m-model",
)


In [None]:
from IPython.display import FileLink
# Display a link to the zip archive as this cell's output.
# NOTE(review): an absolute path may not resolve to a downloadable URL in every
# notebook front end — confirm the link works, or use a path relative to the working directory.
FileLink("/kaggle/working/kine-3m-model.zip")


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM, LlamaTokenizer, LlamaConfig
import torch
import shutil


# Reload the fine-tuned GPT-2 checkpoint and its tokenizer from disk.
model = AutoModelForCausalLM.from_pretrained("kine-3m-model")
tokenizer = AutoTokenizer.from_pretrained("kine-3m-model")

# Create a new LLaMA model configuration that mirrors the GPT-2 dimensions.
llama_config = LlamaConfig(
    vocab_size=model.config.vocab_size,
    hidden_size=model.config.n_embd,
    intermediate_size=model.config.n_inner if model.config.n_inner else 4 * model.config.n_embd,
    num_hidden_layers=model.config.n_layer,
    num_attention_heads=model.config.n_head,
    max_position_embeddings=model.config.n_positions,
    rms_norm_eps=1e-6,
    tie_word_embeddings=False,
    # Take the special-token ids from the GPT-2 tokenizer that is saved alongside
    # the model; LlamaConfig's own defaults refer to the LLaMA vocabulary.
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# Initialize LLaMA model; every weight starts out freshly initialized.
llama_model = LlamaForCausalLM(llama_config)

# Resize token embeddings if vocab size is different
llama_model.resize_token_embeddings(len(tokenizer))

# GPT-2 tensors that are copied into the LLaMA model, keyed by GPT-2 name.
# Only these three tensors are transferred; no per-layer weights are mapped.
mapping = {
    "transformer.wte.weight": "model.embed_tokens.weight",
    "transformer.ln_f.weight": "model.norm.weight",
    "lm_head.weight": "lm_head.weight"
}

# Look tensors up in state_dict() rather than iterating named_parameters():
# GPT-2 ties lm_head.weight to the input embedding, and named_parameters()
# yields a shared tensor only once, so "lm_head.weight" would never be visited.
gpt2_state = model.state_dict()
# Built once; the returned tensors share storage with the model's parameters,
# so the in-place copy_ below updates llama_model itself.
llama_state = llama_model.state_dict()
copied_names = []

with torch.no_grad():
    for name, mapped_name in mapping.items():
        if name not in gpt2_state or mapped_name not in llama_state:
            print(f"⚠️ Missing tensor: {name} -> {mapped_name}, skipping...")
            continue

        param = gpt2_state[name]
        target = llama_state[mapped_name]
        if param.shape == target.shape:
            target.copy_(param)
            copied_names.append(mapped_name)
        else:
            print(f"⚠️ Shape mismatch: {name} -> {mapped_name}, skipping...")


# Save the LLaMA-format model together with the (GPT-2) tokenizer, then zip the directory.
output_dir = "kine-3m-llama-model"
llama_model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


shutil.make_archive(output_dir, 'zip', output_dir)

# Report what was actually transferred instead of claiming a full conversion.
print(
    f"✅ Saved LLaMA-format checkpoint to {output_dir} "
    f"({len(copied_names)}/{len(llama_state)} tensors copied from GPT-2: {copied_names})"
)
print("⚠️ Tensors that were not copied keep their fresh initialization; train the model before using it.")

