<a href="https://colab.research.google.com/github/samsiroos/SLM-Architecture/blob/main/%D8%A2%D9%85%D9%88%D8%B2%D8%B4_%D9%88_%D8%AA%D9%88%D9%84%DB%8C%D8%AF_%D9%85%D8%AA%D9%86_%D8%A8%D8%A7_GRU_%D8%AF%D8%B1_Google_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
import numpy as np
import os # For file system operations

print("Libraries successfully imported.")

# --- 1. Prepare the Training Data File ---
# File names for the training corpus and for the saved model.
training_data_file = "training_data.txt"
model_filename = "my_gru_model.keras" # Using .keras extension for newer TensorFlow versions

# NOTE: This script does not create the training corpus itself.
# 'training_data.txt' must already exist in the working directory (for
# example, uploaded to the Colab session) and contain the training text.

# Read the whole corpus into memory.
# The script aborts with `raise SystemExit` instead of `exit()`: in an
# IPython/Jupyter kernel `exit()` requests a kernel shutdown without raising,
# so the remaining statements of the cell may still run (and would then fail
# with a NameError on `text`). Raising stops execution at this point both in
# a notebook and in a plain script.
try:
    # Keep the try body minimal: only the read itself can raise FileNotFoundError.
    with open(training_data_file, "r", encoding="utf-8") as f:
        text = f.read()
except FileNotFoundError:
    print(f"Error: The file '{training_data_file}' was not found. Please ensure it's in the correct path and contains training data.")
    print("You need to create this file manually or upload it to your Colab environment.")
    raise SystemExit(1) from None

print(f"Text successfully loaded from '{training_data_file}'.")
if not text.strip(): # A file holding only whitespace is treated as empty
    print(f"Warning: The file '{training_data_file}' is empty. Please add content to it for training.")
    raise SystemExit(1)

# Convert all text to lowercase for consistency
text = text.lower()
print(f"\nOriginal text for training (first part):\n{text[:500]}...\n") # Display a portion of the text

# --- Tokenization ---
# Tokenization converts text into a sequence of integers:
# each unique word receives a unique index (starting at 1).
# NOTE(review): the default Tokenizer filters appear to cover ASCII punctuation
# only, so Persian punctuation such as '،' seems to stay attached to words
# (see tokens like "فیزیک،" in generated output) - confirm before changing
# `filters`, because that would change the vocabulary of already saved models.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text]) # Build the vocabulary from the corpus
word_index = tokenizer.word_index # Dictionary mapping word -> index
print(f"Number of unique words in the vocabulary: {len(word_index)}\n")

# Vocabulary size + 1: index 0 is never assigned to a word and is used for padding.
total_words = len(word_index) + 1
print(f"Total unique words in vocabulary (including zero for padding): {total_words}\n")

# --- Create input and output sequences for training ---
# Every line contributes all of its prefixes of length >= 2 (n-grams):
# the last token of each prefix is the prediction target, the rest is the input.
input_sequences = []
for line in text.split('\n'):
    if not line.strip(): # Skip empty lines
        continue
    token_list = tokenizer.texts_to_sequences([line])[0] # Line -> list of word indices
    input_sequences.extend(token_list[:end] for end in range(2, len(token_list) + 1))

# Without this guard, max() below fails with an unhelpful
# "max() arg is an empty sequence" when no line has at least two known words.
if not input_sequences:
    raise ValueError(
        f"No training sequences could be built from '{training_data_file}': "
        "every line must contain at least two words."
    )

print(f"Number of input sequences for training: {len(input_sequences)}\n")
print(f"Example input sequences (numbers):\n{input_sequences[:5]}\n")

# --- Pad sequences and separate X and Y ---
# Neural networks require inputs of uniform size, so shorter sequences
# are padded with zeros at the beginning (pre-padding).
max_sequence_len = max(len(seq) for seq in input_sequences)
print(f"Maximum sequence length in training data: {max_sequence_len}\n")

# pad_sequences already returns a NumPy array, so no extra np.array() wrapper is needed.
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences,
                                                                 maxlen=max_sequence_len,
                                                                 padding='pre')

# X (input): all words in the sequence except the last word
X = padded_sequences[:, :-1]
# labels (output): only the last word of each sequence (the word to predict)
labels = padded_sequences[:, -1]

# One-hot encode the labels to match the 'categorical_crossentropy' loss.
y = to_categorical(labels, num_classes=total_words)

print(f"Input dimensions (X) for the model: {X.shape}") # (Number of samples, input sequence length)
print(f"Output dimensions (y) for the model (one-hot): {y.shape}") # (Number of samples, total number of words in vocabulary)

# --- Check for existing model and handle training/loading ---
model = None
train_model = True # Train unless a compatible saved model is loaded below

if os.path.exists(model_filename):
    print(f"\nFound existing model: '{model_filename}'.")
    # Using a simple input for user choice. In a real app, you'd use a UI element.
    # Strip whitespace and accept "n" so a stray space does not trigger a long retrain.
    user_choice = input("Do you want to retrain the model? (yes/no): ").strip().lower()
    if user_choice in ('no', 'n'):
        try:
            model = load_model(model_filename)
        except Exception as e:
            # Best-effort load: any failure falls back to training from scratch.
            print(f"Error loading model: {e}. Will proceed with training a new model.")
            model = None
        else:
            print(f"Model successfully loaded from '{model_filename}'.")
            # The tokenizer is rebuilt from the current training file on every run,
            # so a model saved for a different corpus would map its outputs to the
            # wrong words. Compare the saved dimensions with the current data.
            # NOTE(review): assumes the last layer is the Dense output layer and
            # that input_shape is (batch, sequence_length); unknown values are skipped.
            saved_vocab_size = getattr(model.layers[-1], "units", None)
            saved_input_shape = getattr(model, "input_shape", None)
            saved_input_len = saved_input_shape[1] if saved_input_shape else None
            vocab_mismatch = saved_vocab_size is not None and saved_vocab_size != total_words
            length_mismatch = saved_input_len is not None and saved_input_len != max_sequence_len - 1
            if vocab_mismatch or length_mismatch:
                print("Saved model does not match the current training data "
                      f"(vocabulary size {saved_vocab_size} vs {total_words}, "
                      f"input length {saved_input_len} vs {max_sequence_len - 1}). "
                      "Will proceed with training a new model.")
                model = None
        train_model = model is None
    else:
        print("Retraining requested by user.")
else:
    print(f"\nNo existing model found at '{model_filename}'. A new model will be trained.")

if train_model:
    # --- 2. Build the GRU Model ---
    model = Sequential([
        # Explicit input: sequences of (max_sequence_len - 1) word indices.
        # This replaces Embedding's `input_length` argument, which is
        # deprecated in newer Keras versions.
        tf.keras.Input(shape=(max_sequence_len - 1,)),
        # Embedding: vocabulary of total_words entries, 256-dimensional word vectors.
        Embedding(total_words, 256),
        # First GRU layer (512 units) returns the full sequence for the next GRU layer.
        GRU(512, return_sequences=True),
        # Second GRU layer (512 units) returns only the last timestep for the Dense layer.
        GRU(512, return_sequences=False),
        # Output layer: one softmax probability per vocabulary index.
        Dense(total_words, activation='softmax'),
    ])

    # Compile the model: define loss function, optimizer, and evaluation metrics
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.summary()

    # --- 3. Train the Model ---
    # 200 epochs; this step can be time-consuming, especially without a GPU.
    print("\nStarting GRU model training...")
    history = model.fit(X, y, epochs=200, verbose=1) # verbose=1 to display training details

    print("\nModel training finished.")
    print(f"Final training accuracy: {history.history['accuracy'][-1]:.4f}")
    print(f"Final training loss: {history.history['loss'][-1]:.4f}")

    # Save the trained model
    model.save(model_filename)
    print(f"Model successfully saved to '{model_filename}'.")
else:
    # train_model is only False when a compatible model was loaded above.
    print("\nUsing the pre-existing model.")
    model.summary()

# --- 4. Generate Text with the Trained Model ---
# Function to generate next words based on a seed text
def generate_text(seed_text, next_words, model, max_sequence_len, tokenizer):
    """Greedily extend *seed_text* by *next_words* predicted words.

    On every step the text generated so far is tokenized, pre-padded (or
    truncated from the front) to ``max_sequence_len - 1`` tokens, and fed
    to the model; the word with the highest predicted probability is appended.

    Args:
        seed_text: Initial text the generation starts from.
        next_words: Number of words to append; values <= 0 return the seed unchanged.
        model: Model whose ``predict`` returns one probability per vocabulary index.
        max_sequence_len: Maximum n-gram length used during training
            (the model input length is one less).
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and ``index_word``.

    Returns:
        The seed text followed by the generated words, separated by single spaces.
        Predicted indices without a vocabulary word (e.g. the padding index 0)
        are rendered as ``"<unk>"``.
    """
    input_len = max_sequence_len - 1
    # Reverse lookup table (index -> word) maintained by the tokenizer;
    # avoids scanning the whole vocabulary for every generated word.
    index_to_word = tokenizer.index_word

    generated_text = seed_text
    for _ in range(next_words):
        # Convert the text generated so far to a numerical sequence
        token_list = tokenizer.texts_to_sequences([generated_text])[0]
        # Pad the sequence to the model's input length
        model_input = tf.keras.preprocessing.sequence.pad_sequences([token_list],
                                                                    maxlen=input_len,
                                                                    padding='pre')

        # Predict the probability of each word as the next word
        predicted_probs = model.predict(model_input, verbose=0)[0]
        # Select the word with the highest probability (argmax)
        predicted_word_index = int(np.argmax(predicted_probs))

        # Unknown indices (e.g. 0, reserved for padding) get a placeholder
        output_word = index_to_word.get(predicted_word_index, "<unk>")

        generated_text += " " + output_word
    return generated_text

# --- Interactive Text Generation ---
# Repeatedly ask for a seed phrase, print the model's continuation, and
# stop as soon as the user answers anything other than "بله" / "yes".
print("\n--- Interactive Text Generation ---")
num_words_to_generate = 20 # Number of words the model generates per request
while True:
    # Strip surrounding whitespace: a trailing space in the seed would otherwise
    # produce a double space between the seed and the first generated word.
    seed_text_input = input("\nلطفاً یک عبارت اولیه برای تولید متن وارد کنید (به فارسی): ").strip()

    print(f"\nشروع تولید متن با عبارت اولیه: '{seed_text_input}'")
    generated_sentence = generate_text(seed_text_input.lower(), num_words_to_generate, model, max_sequence_len, tokenizer)
    print(f"\nمتن تولید شده توسط مدل:\n{generated_sentence}")

    # Normalize the answer so that "بله " (with stray whitespace) still continues.
    continue_generating = input("\nآیا می‌خواهید جمله جدیدی بسازید؟ (بله/خیر): ").strip().lower()
    if continue_generating not in ('بله', 'yes'):
        break

print("\nپایان تولید متن.")


Libraries successfully imported.
Text successfully loaded from 'training_data.txt'.

Original text for training (first part):

جهان هستی پر از شگفتی‌هاست. هر ستاره در آسمان شب، داستانی ناگفته دارد. کهکشان‌ها با میلیاردها ستاره، منظومه‌های شمسی بی‌شمار را در خود جای داده‌اند. زمین، سیاره آبی ما، تنها گوشه‌ای از این عظمت بی‌کران است. حیات در آن به شکل‌های گوناگون جریان دارد. از کوچکترین میکروارگانیسم‌ها تا بزرگترین نهنگ‌ها. همه و همه بخشی از این چرخه شگفت‌انگیز هستند.

علم به ما کمک می‌کند تا این رازها را کشف کنیم. فیزیک، شیمی، زیست‌شناسی، و نجوم. هر کدام دریچه‌ای نو به سوی درک عمیق‌تر جهان می‌گشایند. اکتشافات علمی، مرزه...

Number of unique words in the vocabulary: 292

Total unique words in vocabulary (including zero for padding): 293

Number of input sequences for training: 1848

Example input sequences (numbers):
[[26, 61], [26, 61, 62], [26, 61, 62, 2], [26, 61, 62, 2, 63], [26, 61, 62, 2, 63, 12]]

Maximum sequence length in training data: 59

Input dimensions (X) for the model: (1


--- Interactive Text Generation ---

لطفاً یک عبارت اولیه برای تولید متن وارد کنید (به فارسی): میکروارگانیسم‌ها 

شروع تولید متن با عبارت اولیه: 'میکروارگانیسم‌ها '

متن تولید شده توسط مدل:
میکروارگانیسم‌ها  به ما کمک می‌کند تا این رازها را کشف کنیم فیزیک، شیمی، زیست‌شناسی، و نجوم هر کدام دریچه‌ای نو به

آیا می‌خواهید جمله جدیدی بسازید؟ (بله/خیر): بله

لطفاً یک عبارت اولیه برای تولید متن وارد کنید (به فارسی): فیزیک

شروع تولید متن با عبارت اولیه: 'فیزیک'

متن تولید شده توسط مدل:
فیزیک زیست ما در خطر است تغییرات اقلیمی، آلودگی هوا و آب از بین رفتن تنوع زیستی این‌ها چالش‌های بزرگی هستند
