In [1]:
import csv
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urljoin
import time
import random

# Function to parse the HTML content using BeautifulSoup and extract links
def extract_links(html):
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a', class_='u-clickable-card__link', href=True)
    return [link['href'] for link in links]

# Function to click the "See More" button using Selenium
def click_see_more(driver, max_clicks=5):
    click_count = 0
    try:
        while click_count < max_clicks:
            show_more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//button[@class="show-more-button big-margin"]'))
            )
            driver.execute_script("arguments[0].click();", show_more_button)
            print("Clicked 'See More' button successfully")
            click_count += 1
            time.sleep(5)  # Wait for some time after clicking the button
    except Exception as e:
        print(f"Error: Could not find or click the 'See More' button: {e}")

# Function to fetch content and extract text from <p> tags
def fetch_content(base_url, url):
    try:
        full_url = urljoin(base_url, url)
        response = requests.get(full_url)
        response.raise_for_status()  # Raise exception for HTTP errors
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        paragraphs = soup.find_all('p')
        text = '\n'.join([p.get_text() for p in paragraphs])
        return text
    except Exception as e:
        print(f"Skipping {url}: {e}")
        return None

# Main function to scrape links
def scrape_links(url, max_clicks=5, min_score=7, max_score=10, append=False):
    # Initialize WebDriver
    driver = webdriver.Chrome()  # You may need to adjust this based on your WebDriver setup

    try:
        driver.get(url)
        time.sleep(2)  # Wait for page to load

        # Extract initial links
        html = driver.page_source
        links = extract_links(html)

        # Click "See More" button to get additional links
        click_see_more(driver, max_clicks)

        # Extract links again after clicking "See More"
        html = driver.page_source
        additional_links = extract_links(html)

        # Combine initial links with additional links
        links.extend(additional_links)

        # Open the CSV file in append mode if specified, otherwise in write mode
        file_mode = 'a' if append else 'w'
        with open(r'C:\Users\hp i7 11jin\Desktop\text_data.csv', file_mode, newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Text', 'Score']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            if not append:
                writer.writeheader()  # Write header only if not appending

            for link in links:
                content = fetch_content(url, link)
                if content is not None:
                    # Generate a random score within the specified range
                    score = round(random.uniform(min_score, max_score), 2)
                    print(f"Text Score: {score}")
                    print("Text from", link)
                    print(content)
                    print("-" * 50)
                    writer.writerow({'Text': content, 'Score': score})

    finally:
        driver.quit()  # Close the WebDriver once done

# Example usage
url1 = "https://www.aljazeera.net/health/"
max_clicks1 = 7
min_score1 = 7
max_score1 = 10
scrape_links(url1, max_clicks1, min_score1, max_score1, append=False)  # First call, do not append

url2 = "https://www.aljazeera.net/"
max_clicks2 = 5
min_score2 = 1
max_score2 = 4
scrape_links(url2, max_clicks2, min_score2, max_score2, append=True)  # Second call, append to file


Clicked 'See More' button successfully
Clicked 'See More' button successfully
Clicked 'See More' button successfully
Clicked 'See More' button successfully
Clicked 'See More' button successfully
Clicked 'See More' button successfully
Clicked 'See More' button successfully
Text Score: 7.56
Text from /health/2024/5/23/%d8%a5%d8%a8%d8%b1%d8%a9-%d8%a7%d9%84%d8%b8%d9%87%d8%b1-%d8%ae%d9%84%d8%a7%d9%84-%d8%b9%d9%85%d9%84%d9%8a%d8%a9-%d8%a7%d9%84%d9%85%d8%ae%d8%a7%d8%b6-%d8%aa%d9%82%d9%84%d9%84-%d8%ae%d8%b7%d8%b1
أفادت دراسة بأن النساء اللائي خضعن لتخدير فوق الجافية -وهو ما يعرف بإبرة الظهر أو أبيديورال- أثناء عملية المخاض يواجهن خطرا أقل لحدوث مضاعفات شديدة وقت الولادة.
وأشار الباحثون إلى أهمية أن تكون إبرة الظهر متاحة على نطاق أوسع، مع تقديم المزيد من المعلومات لهؤلاء اللاتي يمكنهن الاستفادة منها، بحسب وكالة الأنباء البريطانية (بي إيه ميديا).
وشملت الدراسة -التي أجرتها جامعتا غلاسكو وبريستول- 567 ألفا و216 امرأة كن في المخاض بمستشفيات جهاز الصحة الوطني الأسكتلندي خلال الفترة من عام 2007 إلى 

In [2]:
import pandas as pd

# Define the file path
file_path = r'C:\Users\hp i7 11jin\Desktop\text_data.csv'

# Read the CSV file
df = pd.read_csv(file_path)

# Print the DataFrame
print(df.tail())


                                                  Text  Score
219  قال الخبير العسكري والإستراتيجي العقيد الركن ح...   1.83
220  "الجزائر هناك، اتبعوا الضوء"، يصرخ مسؤول تونسي...   1.90
221  كشفت وثيقة سرية لوزارة الخارجية الأميركية أن ا...   1.07
222  أعلن الجيش الإسرائيلي اليوم الجمعة أن قواته ان...   2.32
223  تخيل أن معلم شاورما (مع الاحترام والتقدير لإصح...   2.01


In [27]:
df

Unnamed: 0,Text,Score,Lemmatized_Text
0,أفادت دراسة بأن النساء اللائي خضعن لتخدير فوق ...,7.56,أفاد دراس نساء الاءي خضع تخدير جافيه عرف بابر ...
1,أكدت الولايات المتحدة رصد حالة ثانية من إنفلون...,8.69,أكد ولاية متحده رصد حال إنفلونزا طير بشر ثبت أ...
2,أقدم مجموعة من الأطباء على استخدام تقنية مبتكر...,7.84,أقدم مجموع طبيب استخدام تقني مبتكر عرف علاج نب...
3,"فاز برنامج ""دكتور معلومة"" للقطاع الرقمي في شبك...",7.83,إز برنامج دكتور معلوم قطاع رقمي شبك جزير إعلام...
4,"""تسمم الحمل"" (Eclampsia) هو حالة خطيرة تتعلق ب...",7.90,سم حمل إc lampsia حال خطير ضغط دم تطور أثناء ح...
...,...,...,...
219,قال الخبير العسكري والإستراتيجي العقيد الركن ح...,1.83,خبير عسكري إستراتيجي عقيد الركن حاتم كريم الفل...
220,"""الجزائر هناك، اتبعوا الضوء""، يصرخ مسؤول تونسي...",1.90,جزاءر اتبع ضوء صرخ مسءول تونسي مهاجر سودي مضيف...
221,كشفت وثيقة سرية لوزارة الخارجية الأميركية أن ا...,1.07,كشف أيق سري وزار خارجي أميركي ولاية متحده استع...
222,أعلن الجيش الإسرائيلي اليوم الجمعة أن قواته ان...,2.32,أعلن جيش إسراءيلي جمعه انتشل جث 3 أسري محتجز ق...


In [8]:
import csv
import re
import string
from nltk.tokenize import word_tokenize
import stanza
import pandas as pd

# Initialize Stanza pipeline for lemmatization
stanza.download('ar')
nlp = stanza.Pipeline(lang='ar', processors='tokenize,mwt,pos,lemma')

# Read the stop words file
with open(r'C:\Users\hp i7 11jin\Downloads\list.txt', 'r', encoding='utf-8') as file:
    stop_words = set(file.read().splitlines())

# Functions for text preprocessing
def normalize_arabic(text):
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)
    return text

def remove_diacritics(text):
    arabic_diacritics = re.compile("""
                                     ّ    | # Tashdid
                                     َ    | # Fatha
                                     ً    | # Tanwin Fath
                                     ُ    | # Damma
                                     ٌ    | # Tanwin Damm
                                     ِ    | # Kasra
                                     ٍ    | # Tanwin Kasr
                                     ْ    | # Sukun
                                     ـ     # Tatwil/Kashida
                                 """, re.VERBOSE)
    return re.sub(arabic_diacritics, '', text)

def remove_punctuations(text):
    arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
    english_punctuations = string.punctuation
    punctuations_list = arabic_punctuations + english_punctuations
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

def remove_repeating_char(text):
    return re.sub(r'(.)\1+', r'\1', text)

# Load the CSV file into a DataFrame
df = pd.read_csv(r'C:\Users\hp i7 11jin\Desktop\text_data.csv')

# Process each document in the DataFrame
lemmatized_texts = []
for index, row in df.iterrows():
    cleaned_text = row['Text']
    cleaned_text = normalize_arabic(cleaned_text)
    cleaned_text = remove_punctuations(cleaned_text)
    cleaned_text = remove_repeating_char(cleaned_text)
    
    # Lemmatization
    doc = nlp(cleaned_text)
    lemmatized_tokens = [word.lemma for sentence in doc.sentences for word in sentence.words]

    # Remove diacritics from lemmatized tokens
    lemmatized_tokens_without_diacritics = [remove_diacritics(token) for token in lemmatized_tokens]

    # Remove stop words
    lemmatized_tokens_without_stopwords = [word for word in lemmatized_tokens_without_diacritics if word not in stop_words]

    # Join tokens back to a single string
    lemmatized_text = ' '.join(lemmatized_tokens_without_stopwords)
    lemmatized_texts.append(lemmatized_text)
print("hi")
# Add the lemmatized text to the DataFrame
df['Lemmatized_Text'] = lemmatized_texts


print("Text preprocessing completed and updated in text_data.csv")


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-24 13:38:19 INFO: Downloaded file to C:\Users\hp i7 11jin\stanza_resources\resources.json
2024-05-24 13:38:19 INFO: Downloading default packages for language: ar (Arabic) ...
2024-05-24 13:38:21 INFO: File exists: C:\Users\hp i7 11jin\stanza_resources\ar\default.zip
2024-05-24 13:38:30 INFO: Finished downloading models and saved to C:\Users\hp i7 11jin\stanza_resources
2024-05-24 13:38:30 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-24 13:38:31 INFO: Downloaded file to C:\Users\hp i7 11jin\stanza_resources\resources.json
2024-05-24 13:38:32 INFO: Loading these models for language: ar (Arabic):
| Processor | Package       |
-----------------------------
| tokenize  | padt          |
| mwt       | padt          |
| pos       | padt_charlm   |
| lemma     | padt_nocharlm |

2024-05-24 13:38:32 INFO: Using device: cpu
2024-05-24 13:38:32 INFO: Loading: tokenize
2024-05-24 13:38:32 INFO: Loading: mwt
2024-05-24 13:38:32 INFO: Loading: pos
2024-05-24 13:38:33 INFO: Loading: lemma
2024-05-24 13:38:33 INFO: Done loading processors!


hi
Text preprocessing completed and updated in text_data.csv


In [13]:
# Prepare input and output data
texts = df['Lemmatized_Text'].astype(str).tolist()  # Convert to list of strings
labels = df['Score'].values  # Use the score as the label for regression

# Tokenization and padding
max_words = 10000
max_len = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=max_len)

# Split data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

Data preprocessing completed.


In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, GRU, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Define a function to build models
def build_model(model_type, input_length, vocab_size, embedding_dim, units):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length))
    if model_type == 'RNN':
        model.add(SimpleRNN(units))
    elif model_type == 'Bidirectional RNN':
        model.add(Bidirectional(SimpleRNN(units)))
    elif model_type == 'GRU':
        model.add(GRU(units))
    elif model_type == 'Bidirectional GRU':
        model.add(Bidirectional(GRU(units)))
    elif model_type == 'LSTM':
        model.add(LSTM(units))
    elif model_type == 'Bidirectional LSTM':
        model.add(Bidirectional(LSTM(units)))
    model.add(Dense(units, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1))  # Output layer for regression
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])
    return model

# Hyperparameters
embedding_dim = 100
units = 128
batch_size = 32
epochs = 10

# Create a dictionary to store models and their histories
models = {}
histories = {}

# List of model types to train
model_types = ['RNN', 'Bidirectional RNN', 'GRU', 'Bidirectional GRU', 'LSTM', 'Bidirectional LSTM']

# Train and evaluate each model
for model_type in model_types:
    print(f'Training {model_type} model...')
    model = build_model(model_type, max_len, max_words, embedding_dim, units)
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    history = model.fit(x_train, y_train, validation_split=0.2, epochs=epochs, batch_size=batch_size, callbacks=[early_stopping])
    
    # Store the model and its history
    models[model_type] = model
    histories[model_type] = history

    # Evaluate the model
    print(f'Evaluating {model_type} model...')
    y_pred = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    print(f'MSE for {model_type} model: {mse}')
    print(f'MAE for {model_type} model: {mae}')

Training RNN model...




Epoch 1/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 333ms/step - loss: 35.1757 - mean_absolute_error: 5.0966 - val_loss: 19.0686 - val_mean_absolute_error: 3.3005
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step - loss: 19.0297 - mean_absolute_error: 3.4137 - val_loss: 8.8297 - val_mean_absolute_error: 2.8357
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 98ms/step - loss: 10.4276 - mean_absolute_error: 2.8303 - val_loss: 11.2055 - val_mean_absolute_error: 3.1087
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 100ms/step - loss: 11.5227 - mean_absolute_error: 3.0695 - val_loss: 8.7345 - val_mean_absolute_error: 2.7138
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 91ms/step - loss: 11.0191 - mean_absolute_error: 2.9199 - val_loss: 8.7568 - val_mean_absolute_error: 2.8109
Epoch 6/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step - l



Epoch 1/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 453ms/step - loss: 35.0085 - mean_absolute_error: 5.0757 - val_loss: 12.7073 - val_mean_absolute_error: 2.5843
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 107ms/step - loss: 12.2069 - mean_absolute_error: 2.9070 - val_loss: 11.1910 - val_mean_absolute_error: 3.1093
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 98ms/step - loss: 11.6170 - mean_absolute_error: 3.0514 - val_loss: 8.5545 - val_mean_absolute_error: 2.6547
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 95ms/step - loss: 9.0514 - mean_absolute_error: 2.5705 - val_loss: 7.9340 - val_mean_absolute_error: 2.6179
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 114ms/step - loss: 7.6512 - mean_absolute_error: 2.3943 - val_loss: 9.2123 - val_mean_absolute_error: 2.6704
Epoch 6/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 114ms/step - 



Epoch 1/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 461ms/step - loss: 34.4795 - mean_absolute_error: 5.0057 - val_loss: 29.4353 - val_mean_absolute_error: 4.5481
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 191ms/step - loss: 35.0763 - mean_absolute_error: 5.0591 - val_loss: 22.7040 - val_mean_absolute_error: 3.7278
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 170ms/step - loss: 24.8192 - mean_absolute_error: 4.0082 - val_loss: 10.2045 - val_mean_absolute_error: 2.7963
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 189ms/step - loss: 10.5451 - mean_absolute_error: 2.8185 - val_loss: 9.3156 - val_mean_absolute_error: 2.8924
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 189ms/step - loss: 9.4921 - mean_absolute_error: 2.8084 - val_loss: 8.3957 - val_mean_absolute_error: 2.6805
Epoch 6/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 186ms/ste



Epoch 1/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 687ms/step - loss: 38.4155 - mean_absolute_error: 5.3626 - val_loss: 30.1634 - val_mean_absolute_error: 4.6279
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 187ms/step - loss: 35.6823 - mean_absolute_error: 5.1713 - val_loss: 24.5792 - val_mean_absolute_error: 3.9672
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 196ms/step - loss: 27.5366 - mean_absolute_error: 4.2382 - val_loss: 12.5482 - val_mean_absolute_error: 2.6359
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 216ms/step - loss: 12.6896 - mean_absolute_error: 2.9528 - val_loss: 10.9393 - val_mean_absolute_error: 3.1156
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 244ms/step - loss: 10.8018 - mean_absolute_error: 2.9416 - val_loss: 8.5853 - val_mean_absolute_error: 2.6098
Epoch 6/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 248ms/s



[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 487ms/step - loss: 33.8352 - mean_absolute_error: 5.0417 - val_loss: 30.4957 - val_mean_absolute_error: 4.6616
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 193ms/step - loss: 34.8638 - mean_absolute_error: 5.0713 - val_loss: 18.8822 - val_mean_absolute_error: 3.2359
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 186ms/step - loss: 16.7335 - mean_absolute_error: 3.3712 - val_loss: 11.8541 - val_mean_absolute_error: 3.1557
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 190ms/step - loss: 11.6229 - mean_absolute_error: 2.9767 - val_loss: 8.7349 - val_mean_absolute_error: 2.7358
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 179ms/step - loss: 10.5300 - mean_absolute_error: 2.9213 - val_loss: 8.7065 - val_mean_absolute_error: 2.7520
Epoch 6/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 175ms/step - loss: 



Epoch 1/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 632ms/step - loss: 37.5467 - mean_absolute_error: 5.3245 - val_loss: 29.7208 - val_mean_absolute_error: 4.5738
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 165ms/step - loss: 34.0367 - mean_absolute_error: 4.9500 - val_loss: 9.8998 - val_mean_absolute_error: 2.6957
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 197ms/step - loss: 11.4014 - mean_absolute_error: 2.7534 - val_loss: 8.6750 - val_mean_absolute_error: 2.7666
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 187ms/step - loss: 10.9288 - mean_absolute_error: 2.9180 - val_loss: 8.6682 - val_mean_absolute_error: 2.6343
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 188ms/step - loss: 9.0390 - mean_absolute_error: 2.6285 - val_loss: 8.5634 - val_mean_absolute_error: 2.7960
Epoch 6/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 210ms/step 

In [25]:
def build_model(hp):
    model = Sequential()
    model.add(Embedding(input_dim=max_words, 
                        output_dim=hp.Int('embedding_dim', min_value=50, max_value=300, step=50), 
                        input_length=max_len))
    
    model_type = hp.Choice('model_type', ['RNN', 'Bidirectional RNN', 'GRU', 'Bidirectional GRU', 'LSTM', 'Bidirectional LSTM'])
    units = hp.Int('units', min_value=32, max_value=256, step=32)
    
    if model_type == 'RNN':
        model.add(SimpleRNN(units))
    elif model_type == 'Bidirectional RNN':
        model.add(Bidirectional(SimpleRNN(units)))
    elif model_type == 'GRU':
        model.add(GRU(units))
    elif model_type == 'Bidirectional GRU':
        model.add(Bidirectional(GRU(units)))
    elif model_type == 'LSTM':
        model.add(LSTM(units))
    elif model_type == 'Bidirectional LSTM':
        model.add(Bidirectional(LSTM(units)))
    
    model.add(Dense(units, activation='relu'))
    model.add(Dropout(hp.Float('dropout', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(1))  # Output layer for regression
    
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])
    return model

# Initialize the Keras Tuner
tuner = kt.Hyperband(
    build_model,
    objective='val_loss',
    max_epochs=10,
    factor=3,
    directory='my_dir',
    project_name='text_hyperparam_tuning'
)

# Set an early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Search for the best hyperparameters
tuner.search(x_train, y_train, validation_split=0.2, epochs=10, batch_size=32, callbacks=[early_stopping])

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Build the best model and train it
model = tuner.hypermodel.build(best_hps)
history = model.fit(x_train, y_train, validation_split=0.2, epochs=10, batch_size=32, callbacks=[early_stopping])

# Evaluate the model
y_pred = model.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f'Best Hyperparameters: {best_hps.values}')
print(f'MSE: {mse}')
print(f'MAE: {mae}')

Trial 30 Complete [00h 00m 09s]
val_loss: 7.8122992515563965

Best val_loss So Far: 1.000115156173706
Total elapsed time: 00h 04m 59s
Epoch 1/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 285ms/step - loss: 38.0875 - mean_absolute_error: 5.3644 - val_loss: 26.4047 - val_mean_absolute_error: 4.1834
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 128ms/step - loss: 22.0689 - mean_absolute_error: 3.8094 - val_loss: 14.7338 - val_mean_absolute_error: 3.3302
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 140ms/step - loss: 12.6338 - mean_absolute_error: 3.0639 - val_loss: 9.1600 - val_mean_absolute_error: 2.6126
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 133ms/step - loss: 10.0143 - mean_absolute_error: 2.6756 - val_loss: 8.6975 - val_mean_absolute_error: 2.8023
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 139ms/step - loss: 8.8982 - mean_absolute_error: 2.7612 - v

In [35]:
from nltk.translate.bleu_score import sentence_bleu
import nltk
nltk.download('punkt')

# Tokenize the texts (you might need to adjust this to fit your exact text structure)
generated_texts_tokenized = [nltk.word_tokenize(text) for text in generated_texts]
reference_texts_tokenized = [[nltk.word_tokenize(text[0])] for text in reference_texts]

# Calculate BLEU score for each generated text
bleu_scores = []
for gen_text_tokens, ref_text_tokens in zip(generated_texts_tokenized, reference_texts_tokenized):
    bleu_score = sentence_bleu([ref_text_tokens], gen_text_tokens)
    bleu_scores.append(bleu_score)

# Calculate average BLEU score
avg_bleu_score = sum(bleu_scores) / len(bleu_scores)
print("Average BLEU Score:", avg_bleu_score)


[nltk_data] Downloading package punkt to C:\Users\hp i7
[nltk_data]     11jin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


TypeError: cannot use a string pattern on a bytes-like object