In [2]:
! pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Using cached click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting joblib (from nltk)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Using cached click-8.1.8-py3-none-any.whl (98 kB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Installing collected packages: joblib, click, nltk
Successfully installed click-8.1.8 joblib-1.4.2 nltk-3.9.1


In [2]:
! pip install datasets

Collecting datasets
  Using cached datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Using cached pyarrow-19.0.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.18-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Using cached aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets)
  Using cached aiosignal-1.3.2-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting attrs>=17.3.0

In [None]:
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, RepeatVector, Input
from tensorflow.keras.callbacks import EarlyStopping

# Download the nltk "punkt" resource (sentence segmentation data)
nltk.download('punkt')

# Load the CNN/Daily Mail dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Pick the training and test splits
train_data, test_data = dataset['train'], dataset['test']

# Pull articles (inputs) and highlights (target summaries) out as plain lists.
# Whole columns are read at once instead of building a dict for every row.
X_train = list(train_data['article'])
y_train = list(train_data['highlights'])
X_test = list(test_data['article'])
y_test = list(test_data['highlights'])

# Tokenization
#
# The decoder ends in a softmax over `vocab_size_summary` at each of
# `max_len_summary` time steps, so leaving the vocabulary and sequence lengths
# unbounded makes the output tensor (batch x max_len_summary x
# vocab_size_summary) intractable on the full dataset. The caps below bound
# both; they are starting points to tune, not tuned values.
MAX_VOCAB_ARTICLE = 20000   # article token ids are kept below this value
MAX_VOCAB_SUMMARY = 10000   # summary token ids are kept below this value
MAX_LEN_ARTICLE = 400       # upper bound on padded article length (tokens)
MAX_LEN_SUMMARY = 60        # upper bound on padded summary length (tokens)
OOV_TOKEN = "<unk>"         # replaces words that fall outside the capped vocabulary

tokenizer_article = Tokenizer(num_words=MAX_VOCAB_ARTICLE, oov_token=OOV_TOKEN)
tokenizer_article.fit_on_texts(X_train)
X_train_seq = tokenizer_article.texts_to_sequences(X_train)
X_test_seq = tokenizer_article.texts_to_sequences(X_test)

tokenizer_summary = Tokenizer(num_words=MAX_VOCAB_SUMMARY, oov_token=OOV_TOKEN)
tokenizer_summary.fit_on_texts(y_train)
y_train_seq = tokenizer_summary.texts_to_sequences(y_train)
y_test_seq = tokenizer_summary.texts_to_sequences(y_test)

# Sequence lengths: the longest training sequence, bounded by the caps above
max_len_article = min(MAX_LEN_ARTICLE, max(len(seq) for seq in X_train_seq))
max_len_summary = min(MAX_LEN_SUMMARY, max(len(seq) for seq in y_train_seq))

# Pad the sequences with trailing zeros. Longer sequences are truncated with
# pad_sequences' default truncation side, which is also what generate_summary
# uses at inference time, so training and inference inputs stay consistent.
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len_article, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len_article, padding='post')
y_train_pad = pad_sequences(y_train_seq, maxlen=max_len_summary, padding='post')
y_test_pad = pad_sequences(y_test_seq, maxlen=max_len_summary, padding='post')

# Vocabulary sizes used by the model's embedding and output layers.
# word_index always holds the full vocabulary, but texts_to_sequences only
# emits ids below num_words, so the effective size is the smaller of the two
# (+1 because id 0 is reserved for padding).
vocab_size_article = min(MAX_VOCAB_ARTICLE, len(tokenizer_article.word_index) + 1)
vocab_size_summary = min(MAX_VOCAB_SUMMARY, len(tokenizer_summary.word_index) + 1)

# Encoder-decoder stack: the article is embedded and compressed by an LSTM
# into a single vector, which is repeated once per summary position and
# decoded by a second LSTM into a per-token softmax over the summary vocabulary.
model = Sequential([
    Input(shape=(max_len_article,)),
    Embedding(input_dim=vocab_size_article, output_dim=100),
    LSTM(100),
    RepeatVector(max_len_summary),
    LSTM(100, return_sequences=True),
    TimeDistributed(Dense(vocab_size_summary, activation='softmax')),
])
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train the model
# Stop once validation loss has not improved for 3 epochs and roll back to the
# best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
history = model.fit(
    X_train_pad,
    # the targets get a trailing axis of size 1 for the sparse loss
    np.expand_dims(y_train_pad, -1),
    epochs=10,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Plot the accuracy and loss curves side by side
fig, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

# Accuracy panel (training vs. validation)
ax_accuracy.plot(history.history['accuracy'], label='دقت آموزش')
ax_accuracy.plot(history.history['val_accuracy'], label='دقت اعتبارسنجی')
ax_accuracy.set_title('دقت مدل')
ax_accuracy.set_xlabel('دوره')
ax_accuracy.set_ylabel('دقت')
ax_accuracy.legend()

# Loss panel (training vs. validation)
ax_loss.plot(history.history['loss'], label='از دست دادن آموزش')
ax_loss.plot(history.history['val_loss'], label='از دست دادن اعتبارسنجی')
ax_loss.set_title('از دست دادن مدل')
ax_loss.set_xlabel('دوره')
ax_loss.set_ylabel('از دست دادن')
ax_loss.legend()

fig.tight_layout()
plt.show()

# Summary generation
def generate_summary(article):
    """Generate a summary string for a single article.

    The article is tokenized with ``tokenizer_article``, padded to
    ``max_len_article`` and passed through ``model``. The most probable
    token id at each output position is mapped back to a word through
    ``tokenizer_summary.index_word``; positions predicted as id 0 are skipped.

    Args:
        article: Raw article text.

    Returns:
        The predicted words joined by single spaces.
    """
    token_ids = tokenizer_article.texts_to_sequences([article])
    model_input = pad_sequences(token_ids, maxlen=max_len_article, padding='post')
    token_probs = model.predict(model_input)
    # Only one article was submitted, so keep the first (and only) row.
    best_ids = np.argmax(token_probs, axis=-1)[0]

    words = []
    for token_id in best_ids:
        if token_id == 0:
            continue
        words.append(tokenizer_summary.index_word[token_id])
    return ' '.join(words)

# Try summary generation on the first 5 test articles only
for article in X_test[:5]:
    generated = generate_summary(article)
    print("Article:", article)
    print("Generated Summary:", generated)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from deap import base, creator, tools, algorithms
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Input

# Assumes the data and preprocessing results from the previous cell (padded arrays, sequence lengths, vocabulary sizes) are already defined

# Model factory used by the hyperparameter search
def create_model(embedding_dim, lstm_units):
    """Build and compile an encoder-decoder summarization model.

    The training targets are per-token sequences (one id for each of the
    ``max_len_summary`` positions), so the network must emit one softmax per
    output position: the encoder LSTM's final vector is repeated
    ``max_len_summary`` times and decoded by a second LSTM that returns the
    full sequence.

    Relies on ``max_len_article``, ``max_len_summary``, ``vocab_size_article``
    and ``vocab_size_summary`` from the preprocessing cell, and on
    ``RepeatVector`` / ``TimeDistributed`` imported in the first code cell.

    Args:
        embedding_dim: Embedding width. Non-integer values (e.g. genes produced
            by real-valued genetic operators) are rounded and clamped to >= 1.
        lstm_units: Units in both LSTM layers; coerced the same way.

    Returns:
        A compiled ``Sequential`` model.
    """
    # Layer sizes must be positive Python ints.
    embedding_dim = max(1, int(round(embedding_dim)))
    lstm_units = max(1, int(round(lstm_units)))

    model = Sequential()
    model.add(Input(shape=(max_len_article,)))
    model.add(Embedding(input_dim=vocab_size_article, output_dim=embedding_dim))
    model.add(LSTM(lstm_units))
    # One copy of the encoded article per summary position.
    model.add(RepeatVector(max_len_summary))
    model.add(LSTM(lstm_units, return_sequences=True))
    model.add(TimeDistributed(Dense(vocab_size_summary, activation='softmax')))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Fitness function for the genetic search
def evaluate_model(individual):
    """Train a candidate model for one epoch and return its accuracy as fitness.

    Accuracy comes from ``model.evaluate`` using the metric the model was
    compiled with, i.e. it is computed per target token over
    ``y_test_pad``. This streams over batches instead of materializing the
    full probability tensor with ``predict``, and avoids
    ``sklearn.metrics.accuracy_score``, which does not accept 2-D multiclass
    targets.

    NOTE(review): fitness is measured on the test split, so the selected
    hyperparameters are tuned against test data; consider a separate
    validation split.

    Args:
        individual: Sequence ``(embedding_dim, lstm_units)``. Values are
            rounded to ints and clamped to >= 1, because genetic operators may
            hand back non-integer genes.

    Returns:
        A one-element tuple ``(accuracy,)``, the shape DEAP expects for
        fitness values.
    """
    embedding_dim, lstm_units = (max(1, int(round(gene))) for gene in individual)
    model = create_model(embedding_dim, lstm_units)
    model.fit(X_train_pad, np.expand_dims(y_train_pad, -1), epochs=1, batch_size=64, verbose=0)
    # evaluate() returns [loss, accuracy] for a model compiled with metrics=['accuracy'].
    _, accuracy = model.evaluate(X_test_pad, np.expand_dims(y_test_pad, -1), batch_size=64, verbose=0)
    return (float(accuracy),)

# DEAP setup
# creator.create defines classes on the deap.creator module; guard the calls so
# re-running this cell does not redefine them.
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))  # maximize accuracy
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("embedding_dim", np.random.randint, 50, 200)  # Range for embedding dimension (upper bound exclusive)
toolbox.register("lstm_units", np.random.randint, 50, 200)  # Range for LSTM units (upper bound exclusive)
# An individual is one (embedding_dim, lstm_units) pair.
toolbox.register("individual", tools.initCycle, creator.Individual, (toolbox.embedding_dim, toolbox.lstm_units), n=1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evaluate_model)
# Both genes are integer layer sizes, so use operators that keep them integral
# and inside the sampling range: uniform crossover swaps genes between
# parents, and integer mutation redraws a gene from [50, 199]. Blend crossover
# and Gaussian mutation would produce unbounded floats instead.
toolbox.register("mate", tools.cxUniform, indpb=0.5)
toolbox.register("mutate", tools.mutUniformInt, low=50, up=199, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)

# Run the genetic algorithm
population = toolbox.population(n=10)
ngen = 5

# Mean fitness (accuracy) of the population after each generation
accuracy_per_generation = []

# The initial population must be evaluated before the first selection:
# tournament selection compares fitness values, and reading
# fitness.values[0] from an unevaluated individual raises IndexError.
for ind in population:
    ind.fitness.values = toolbox.evaluate(ind)

for gen in range(ngen):
    # Selection: work on clones so the parents are not modified in place
    offspring = [toolbox.clone(ind) for ind in toolbox.select(population, len(population))]

    # Crossover on neighbouring pairs, each with probability 0.5
    for child1, child2 in zip(offspring[::2], offspring[1::2]):
        if np.random.rand() < 0.5:
            toolbox.mate(child1, child2)
            # The genes changed, so the inherited fitness is stale
            del child1.fitness.values
            del child2.fitness.values

    # Mutation, each individual with probability 0.2
    for mutant in offspring:
        if np.random.rand() < 0.2:
            toolbox.mutate(mutant)
            del mutant.fitness.values

    # Re-evaluate only the individuals whose fitness was invalidated
    invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
    for ind in invalid_ind:
        ind.fitness.values = toolbox.evaluate(ind)

    # Replace the population, then record this generation's mean accuracy
    # from the new, fully evaluated population
    population[:] = offspring
    fits = [ind.fitness.values[0] for ind in population]
    accuracy_per_generation.append(np.mean(fits))

# Best individual of the final population
fits = [ind.fitness.values[0] for ind in population]
best_ind = population[np.argmax(fits)]
print("Best individual is: ", best_ind)

# Plot the mean accuracy recorded for each generation
generations = list(range(1, ngen + 1))

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(generations, accuracy_per_generation, marker='o')
ax.set_title('Average Accuracy per Generation')
ax.set_xlabel('Generation')
ax.set_ylabel('Average Accuracy')
ax.grid(True)
ax.set_xticks(generations)
plt.show()