In [2]:
import importlib
from src.architecture import configuration_fattah, modeling_fattah
# Re-execute both project modules so the definitions used below come from the
# current module source rather than from whatever the kernel imported earlier.
importlib.reload(configuration_fattah)
importlib.reload(modeling_fattah)
# Import the class names after the reload so they bind to the re-executed
# class objects.
from src.architecture.configuration_fattah import fattahConfig
from src.architecture.modeling_fattah import fattahModel
# Build a config with its default arguments, then a model from that config.
configuration = fattahConfig()
model = fattahModel(configuration)
# Bare last expression: the notebook displays the config's repr.
configuration

fattahConfig {
  "_attn_implementation_autoset": true,
  "activation_function": "silu",
  "attention_bias": false,
  "attention_dropout": 0.0,
  "aux_loss_coef": 0.01,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_size": 576,
  "initializer_range": 0.01,
  "input_bits": 8,
  "intermediate_size": 1536,
  "kv_channels": 128,
  "max_position_embeddings": 2048,
  "model_type": "fattah",
  "num_attention_heads": 10,
  "num_experts_per_tok": 2,
  "num_hidden_layers": 6,
  "num_key_value_heads": 5,
  "num_local_experts": 6,
  "output_router_logits": false,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.46.2",
  "use_cache": true,
  "vocab_size": 41447,
  "weight_bits": 1
}

In [3]:
# Tally every parameter element in the model and, separately, only those
# flagged as requiring gradients; both figures are printed in millions.
total_params = sum(weight.numel() for weight in model.parameters())
trainable_params = sum(
    weight.numel() for weight in model.parameters() if weight.requires_grad
)
print(
    "total_params: ",
    total_params / 1000000,
    "\nTrainable parameters:",
    trainable_params / 1000000,
)

total_params:  150.446592 
Trainable parameters: 150.446592


In [22]:
# Drops the notebook-level name `model`.
# NOTE(review): this cell carries execution count 22, yet it is positioned
# before cells that still read `model`. Run top-to-bottom on a fresh kernel,
# those later cells would raise NameError — consider moving this cell to the
# end of the notebook.
del model

In [4]:
# Parameter count, in millions, of the `mlp` sub-module of layer 0.
sum(weight.numel() for weight in model.layers[0].mlp.parameters()) / 1000000

15.92928

In [7]:
# Two views of the layer-0 attention `experts` module size, in millions:
#   1st value - every parameter registered under `experts`
#   2nd value - only `input_linear` + `output_linear` + `router`
# Each part is scaled to millions before the parts are added together.
# NOTE(review): the recorded gap between the two values is 576 elements, which
# presumably is the `experts.bias` tensor seen in the state-dict listing — confirm.
(
    sum(weight.numel() for weight in model.layers[0].self_attention.experts.parameters()) / 1000000,
    sum(weight.numel() for weight in model.layers[0].self_attention.experts.input_linear.parameters()) / 1000000
    + sum(weight.numel() for weight in model.layers[0].self_attention.experts.output_linear.parameters()) / 1000000
    + sum(weight.numel() for weight in model.layers[0].self_attention.experts.router.parameters()) / 1000000,
)


(2.362176, 2.3616)

In [8]:
# Element count of the token-embedding parameters (raw count, not millions).
embedding_layer = sum(p.numel() for p in model.embed_tokens.parameters()) # vocab * hidden_size

# Size, in millions, of everything in layer-0 attention that is not part of
# `experts`. The raw integer counts are subtracted first and scaled once:
# subtracting two already-divided floats can leave rounding noise in the
# result (e.g. 0.5898240000000001 instead of 0.589824).
atten_kvproj = (
    sum(p.numel() for p in model.layers[0].self_attention.parameters())
    - sum(p.numel() for p in model.layers[0].self_attention.experts.parameters())
) / 1000000
# Displayed pair: (attention minus experts, experts), both in millions.
atten_kvproj, sum(p.numel() for p in model.layers[0].self_attention.experts.parameters()) / 1000000

(0.5898240000000001, 2.362176)

In [31]:
# List the name of every entry in the model's state dict.
# NOTE(review): the full listing is long (the recorded output is cut off
# mid-key); slicing it, e.g. to one layer's keys, would keep the output readable.
model.state_dict().keys()

odict_keys(['embed_tokens.weight', 'layers.0.input_layernorm.weight', 'layers.0.self_attention.experts.bias', 'layers.0.self_attention.experts.input_linear.weight', 'layers.0.self_attention.experts.output_linear.weight', 'layers.0.self_attention.experts.router.layer.weight', 'layers.0.self_attention.kv_proj.weight', 'layers.0.post_attention_layernorm.weight', 'layers.0.mlp.bias', 'layers.0.mlp.input_linear.weight', 'layers.0.mlp.output_linear.weight', 'layers.0.mlp.router.layer.weight', 'layers.1.input_layernorm.weight', 'layers.1.self_attention.experts.bias', 'layers.1.self_attention.experts.input_linear.weight', 'layers.1.self_attention.experts.output_linear.weight', 'layers.1.self_attention.experts.router.layer.weight', 'layers.1.self_attention.kv_proj.weight', 'layers.1.post_attention_layernorm.weight', 'layers.1.mlp.bias', 'layers.1.mlp.input_linear.weight', 'layers.1.mlp.output_linear.weight', 'layers.1.mlp.router.layer.weight', 'layers.2.input_layernorm.weight', 'layers.2.self_a

In [7]:
# Write the model into the relative directory 'model'.
# NOTE(review): assuming fattahModel follows the transformers
# `PreTrainedModel.save_pretrained` API, safe_serialization=False selects the
# pickle-based PyTorch weight format instead of safetensors — confirm that
# this is intended.
model.save_pretrained('model',safe_serialization=False)

In [5]:
from transformers import LlamaTokenizer

# Use forward slashes for the relative tokenizer directory: a
# backslash-separated path is only a valid path on Windows, whereas this form
# resolves on Windows and POSIX systems alike.
tokenizer = LlamaTokenizer.from_pretrained("src/tokenizer/llama_tokenizer")
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Smoke test: tokenize one sentence, padded/truncated to exactly 16 tokens,
# and print both the text and the resulting encoding.
sample_sentence = "what is your name?"
tokens = tokenizer(
    sample_sentence,
    truncation=True,
    padding='max_length',
    max_length=16,
)
print(f"Original Sentence: {sample_sentence}\nTokenized Sentence: {tokens}")

Original Sentence: what is your name?
Tokenized Sentence: {'input_ids': [1, 825, 338, 596, 1024, 29973, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


## Train model

In [None]:
from transformers import (
    Trainer, TrainingArguments, DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)
from datasets import load_dataset
import logging

# Configure the root logger so that INFO-level (and higher) messages are emitted.
logging.basicConfig(level=logging.INFO)

def create_tokenized_dataset_splits(path, tokenizer, block_size, test_size=0.2, seed=42):
    """Load a plain-text file, split it into train/test sets and tokenize both.

    Args:
        path: Forwarded to ``load_dataset('text', data_files=path)``.
        tokenizer: Callable invoked on each batch of ``examples['text']`` with
            ``truncation=True``, ``padding='max_length'`` and
            ``max_length=block_size``.
        block_size: Fixed length every example is padded/truncated to.
        test_size: Size of the held-out split, forwarded to
            ``train_test_split``. Defaults to 0.2.
        seed: Seed used for both the initial shuffle and the train/test split,
            so repeated runs yield the same partition. Defaults to 42.

    Returns:
        A 3-tuple ``(train_tokenized, test_tokenized, unique_names)`` where
        ``unique_names`` is the set of text prefixes (everything before the
        first ".") found in the *training* split.
    """
    dataset = load_dataset('text', data_files=path)
    shuffled_dataset = dataset['train'].shuffle(seed=seed)
    # train_test_split shuffles again by default; leaving it unseeded would
    # produce a different partition on every run despite the seeded shuffle above.
    split_datasets = shuffled_dataset.train_test_split(test_size=test_size, seed=seed)

    def tokenize_dataset(split):
        # Batched map: the tokenizer receives a list of texts per call.
        return split.map(
            lambda examples: tokenizer(
                examples['text'], truncation=True,
                padding='max_length', max_length=block_size
            ),
            batched=True
        )

    def unique_name_set(split):
        # The "name" of an example is whatever precedes its first period.
        return {example['text'].split(".")[0] for example in split}

    return (
        tokenize_dataset(split_datasets['train']),
        tokenize_dataset(split_datasets['test']),
        unique_name_set(split_datasets['train']),
    )

def train_model(model, tokenizer, train_dataset, test_dataset, out_folder_path):
    """Train ``model`` with the Hugging Face ``Trainer`` and save the result.

    Args:
        model: Model handed to ``Trainer``; must also expose
            ``save_pretrained(path)``.
        tokenizer: Tokenizer given to ``DataCollatorForLanguageModeling``
            (created with ``mlm=False``).
        train_dataset: Dataset used for training.
        test_dataset: Dataset used for the periodic evaluations.
        out_folder_path: Directory for checkpoints, logs
            (``<out_folder_path>/logs``) and the final saved model.

    Returns:
        None. The trained model is written to ``out_folder_path``.
    """
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir=out_folder_path,
        overwrite_output_dir=True,
        num_train_epochs=100,
        per_device_train_batch_size=8,
        # NOTE(review): checkpoints are written every 10000 steps while
        # evaluation runs every 1000; the best-model reload presumably can only
        # choose among saved checkpoints — confirm this cadence is intended.
        save_steps=10000,
        logging_steps=10,
        eval_steps=1000,
        logging_dir=f'{out_folder_path}/logs',
        # `eval_strategy` is the current name of this option; the older
        # `evaluation_strategy` spelling is deprecated in recent transformers.
        eval_strategy="steps",
        load_best_model_at_end=True,
        # Lower loss is better, hence greater_is_better=False.
        metric_for_best_model="loss",
        greater_is_better=False,
    )

    # Stop once the monitored metric fails to improve by more than 0.001 for
    # two consecutive evaluations.
    early_stopping = EarlyStoppingCallback(
        early_stopping_patience=2, early_stopping_threshold=0.001
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        callbacks=[early_stopping]
    )

    trainer.train()
    model.save_pretrained(out_folder_path)



In [None]:

# NOTE(review): `create_config_model` and `out_folder_path` are not defined in
# any cell visible in this notebook — confirm they exist before running this
# cell, otherwise it fails with NameError.
model = create_config_model(out_folder_path)
# Tokenize the text file into 32-token examples; the returned `unique_names`
# set is not used further in this cell.
# NOTE(review): the dataset path is a hard-coded absolute path (it looks like a
# Colab Google Drive mount) and only resolves where that mount exists.
train_dataset, test_dataset, unique_names = create_tokenized_dataset_splits('/content/drive/MyDrive/LLM_Projects/llama2_project/dataset/pt_data_txt/custom/pt_alpaca_in_text.txt', tokenizer, block_size=32)
train_model(model, tokenizer, train_dataset, test_dataset, out_folder_path)

print("Training completed.")

In [None]:
It looks like I started in the middle of a conversation. Let's start fresh!**Welcome to our conversation!**I'm Athena2, your friendly AI assistant. I'd love to chat with you about any topic that interests you - from blockchain and AI to machine learning and data science. Feel free to share what's on your mind, and I'll do my best to break down complex ideas into fun, relatable conversations.**To get us started, would you like to:**1. **Explore a specific tech topic** (e.g., "What is the impact of AI in healthcare?").2. **Discuss a broader theme** (e.g., "The future of decentralized systems").3. **Just geek out about tech trends** (no specific question needed - we can just enjoy the conversation!).4. **Something else** (please share, and I'll do my best to accommodate).Please respond with one of the numbers above (1,2,3, or "Something else"), or feel free to ask a question right away!
"It looks like I started in the middle of a conversation. Let's start fresh!**Welcome to our conversation!**I'm Athena2, your friendly AI assistant. I'd love to chat with you about any topic that interests you - from blockchain and AI to machine learning and data science. Feel free to share what's on your mind, and I'll do my best to break down complex ideas into fun, relatable conversations.**To get us started, would you like to:**1. **Explore a specific tech topic** (e.g., \"What is the impact of AI in healthcare?\").2. **Discuss a broader theme** (e.g., \"The future of decentralized systems\").3. **Just geek out about tech trends** (no specific question needed - we can just enjoy the conversation!).4. **Something else** (please share, and I'll do my best to accommodate).Please respond with one of the numbers above (1,2,3, or \"Something else\"), or feel free to ask a question right away!"
