In [None]:
from Libraries_Used import *
from Shared_Functions import *

In [None]:
# Load the Kokotaru article URL list from a local text file via the shared helper.
# NOTE(review): the 'r_b_line' mode presumably means "read line by line" and
# yields one URL per entry -- confirm against Shared_Functions.read_from_file.
kokotaru_article_links = read_from_file("kokotaru_article_urls_first_20.txt", 'r_b_line')

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed

Dạng số 1

In [None]:
def fetch_ingredients_type1(url):
    """Fetch a page and read its ingredients from a WPRM ingredient group.

    Args:
        url: Address of the article to download.

    Returns:
        ``None`` when the request fails or the status code is not 200.
        Otherwise ``(title, ingredients)`` where ``title`` is the text of the
        ``h1.entry-title`` element (or ``"Unknown Title"``) and ``ingredients``
        is the newline-separated text of the first
        ``div.wprm-recipe-ingredient-group`` (or ``None`` if there is none).
    """
    try:
        page = requests.get(url, timeout=10)

        # Guard clause: anything but HTTP 200 is reported and skipped.
        if page.status_code != 200:
            print(f"Failed to retrieve {url}. Status code: {page.status_code}")
            return None

        document = BeautifulSoup(page.content, "html.parser")

        heading = document.find("h1", class_="entry-title")
        title = heading.get_text().strip() if heading else "Unknown Title"

        group = document.find("div", class_="wprm-recipe-ingredient-group")
        if not group:
            return title, None

        return title, group.get_text(separator="\n").strip()
    except requests.exceptions.RequestException as err:
        print(f"Error fetching {url}: {err}")
        return None

def get_all_ingredients_from_urls_type1(url_list):
    """Run ``fetch_ingredients_type1`` over ``url_list`` on a thread pool.

    Args:
        url_list: URLs to fetch.

    Returns:
        ``(ingredients_dict, not_found_urls)``. ``ingredients_dict`` maps page
        title to ingredient text. ``not_found_urls`` holds every URL that
        yielded no ingredients: failed request, no matching markup, or an
        exception while processing. It can be passed to another parser.
    """
    ingredients_dict = {}
    not_found_urls = []

    with ThreadPoolExecutor(max_workers=8) as executor:
        with tqdm(total=len(url_list), desc="Page Loading", unit="page") as progress_bar:
            future_to_url = {executor.submit(fetch_ingredients_type1, url): url for url in url_list}

            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    result = future.result()
                    title, ingredients = result if result else (None, None)
                except Exception as e:
                    # A URL that raised used to be logged and then dropped from
                    # both results; treat it as "not found" so it is not lost.
                    print(f"Error processing {url}: {e}")
                    title, ingredients = None, None

                if ingredients:
                    # Pages sharing a title (e.g. "Unknown Title") overwrite each other.
                    ingredients_dict[title] = ingredients
                else:
                    not_found_urls.append(url)

                progress_bar.update(1)
                # All tasks are already submitted at this point, so this pause
                # slows result collection; it does not space out the requests.
                time.sleep(2)

    return ingredients_dict, not_found_urls

In [None]:
# all_ingredients_type1, type_1_not_found = get_all_ingredients_from_urls_type1(kokotaru_article_links)
# print("Ingredients Dictionary:", all_ingredients_type1)
# print("Not Found URLs:", type_1_not_found)

Dạng số 2

In [None]:
def fetch_ingredients_type2(url):
    """Fetch a page and read ingredients from the list after a keyword ``<h2>``.

    The ingredient section is the first ``<h2>`` carrying the exact class string
    ``"has-vivid-red-color has-text-color wp-block-heading"`` whose lower-cased
    text contains "tỉ lệ", "thành phần" or "nguyên liệu". The ingredients are
    the ``<li>`` items of the next ``<ul>`` after that heading.

    Args:
        url: Address of the article to download.

    Returns:
        ``None`` when the request fails or the status code is not 200.
        Otherwise ``(title, ingredients)`` where ``ingredients`` is the
        newline-joined item text, or ``None`` when nothing usable was found.
    """
    try:
        page = requests.get(url, timeout=10)

        if page.status_code != 200:
            print(f"Failed to retrieve {url}. Status code: {page.status_code}")
            return None

        document = BeautifulSoup(page.content, "html.parser")

        # Title: <h1 class="entry-title">
        heading = document.find("h1", class_="entry-title")
        title = heading.get_text().strip() if heading else "Unknown Title"

        # Section heading: first matching <h2> mentioning one of the keywords
        # (text is lower-cased before comparison, so the match ignores case).
        keywords = ["tỉ lệ", "thành phần", "nguyên liệu"]
        candidates = document.find_all("h2", class_="has-vivid-red-color has-text-color wp-block-heading")
        section_heading = next(
            (
                h2 for h2 in candidates
                if any(word in h2.get_text().strip().lower() for word in keywords)
            ),
            None,
        )
        if not section_heading:
            return title, None

        # Ingredient lines: the <li> items of the next <ul> in the document.
        bullet_list = section_heading.find_next("ul")
        items = [li.get_text().strip() for li in bullet_list.find_all("li")] if bullet_list else []

        joined = "\n".join(items)
        return title, (joined if joined else None)
    except requests.exceptions.RequestException as err:
        print(f"Error fetching {url}: {err}")
        return None

def get_all_ingredients_from_urls_type2(url_list):
    """Run ``fetch_ingredients_type2`` over ``url_list`` on a thread pool.

    Args:
        url_list: URLs to fetch.

    Returns:
        ``(ingredients_dict, not_found_urls)``. ``ingredients_dict`` maps page
        title to ingredient text. ``not_found_urls`` holds every URL that
        yielded no ingredients: failed request, no matching markup, or an
        exception while processing. It can be passed to another parser.
    """
    ingredients_dict = {}  # {"Title": "Ingredients"}
    not_found_urls = []    # URLs for which no ingredients were obtained

    with ThreadPoolExecutor(max_workers=8) as executor:
        with tqdm(total=len(url_list), desc="Page Loading", unit="page") as progress_bar:
            future_to_url = {executor.submit(fetch_ingredients_type2, url): url for url in url_list}

            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    result = future.result()
                    title, ingredients = result if result else (None, None)
                except Exception as e:
                    # A URL that raised used to be logged and then dropped from
                    # both results; treat it as "not found" so it is not lost.
                    print(f"Error processing {url}: {e}")
                    title, ingredients = None, None

                if ingredients:
                    # Pages sharing a title (e.g. "Unknown Title") overwrite each other.
                    ingredients_dict[title] = ingredients
                else:
                    not_found_urls.append(url)

                progress_bar.update(1)
                # All tasks are already submitted at this point, so this pause
                # slows result collection; it does not space out the requests.
                time.sleep(2)
    return ingredients_dict, not_found_urls

In [None]:
# all_ingredients_type2, not_found_type2 = get_all_ingredients_from_urls_type2(type_1_not_found)
# print("Ingredients Dictionary:", all_ingredients_type2) 
# print("Not Found URLs:", not_found_type2)

Dạng số 3

In [None]:
def fetch_ingredients_type3(url):
    """Fetch a page and read ingredients from styled paragraphs after an ``<h3>``.

    The section starts at the first ``<h3>`` whose string contains
    "Nguyên liệu" (case-sensitive). Following siblings are scanned until the
    next ``<h3>``. Only ``<p>`` elements whose ``style`` attribute contains
    "text-align" are collected.

    Args:
        url: Address of the article to download.

    Returns:
        ``None`` when the request fails or the status code is not 200.
        Otherwise ``(title, ingredients)`` where ``ingredients`` is the
        newline-joined paragraph text, or ``None`` when nothing was collected.
    """
    def _mentions_ingredients(text):
        # The string filter may receive None for tags without a single string.
        return "Nguyên liệu" in text if text else False

    try:
        page = requests.get(url, timeout=10)

        if page.status_code != 200:
            print(f"Failed to retrieve {url}. Status code: {page.status_code}")
            return None

        document = BeautifulSoup(page.content, "html.parser")

        heading = document.find("h1", class_="entry-title")
        title = heading.get_text().strip() if heading else "Unknown Title"

        section_heading = document.find("h3", string=_mentions_ingredients)
        if not section_heading:
            return title, None

        lines = []
        for node in section_heading.find_next_siblings():
            tag_name = node.name
            if tag_name == "h3":
                # The next <h3> marks the start of the following section.
                break
            if tag_name != "p":
                continue
            if "text-align" in node.get("style", ""):
                lines.append(node.get_text().strip())

        return title, ("\n".join(lines) if lines else None)
    except requests.exceptions.RequestException as err:
        print(f"Error fetching {url}: {err}")
        return None

def get_all_ingredients_from_urls_type3(url_list):
    """Run ``fetch_ingredients_type3`` over ``url_list`` on a thread pool.

    Args:
        url_list: URLs to fetch.

    Returns:
        ``(ingredients_dict, not_found_urls)``. ``ingredients_dict`` maps page
        title to ingredient text. ``not_found_urls`` holds every URL that
        yielded no ingredients: failed request, no matching markup, or an
        exception while processing. It can be passed to another parser.
    """
    ingredients_dict = {}
    not_found_urls = []

    with ThreadPoolExecutor(max_workers=8) as executor:
        with tqdm(total=len(url_list), desc="Page Loading", unit="page") as progress_bar:
            future_to_url = {executor.submit(fetch_ingredients_type3, url): url for url in url_list}

            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    result = future.result()
                    title, ingredients = result if result else (None, None)
                except Exception as e:
                    # A URL that raised used to be logged and then dropped from
                    # both results; treat it as "not found" so it is not lost.
                    print(f"Error processing {url}: {e}")
                    title, ingredients = None, None

                if ingredients:
                    # Pages sharing a title (e.g. "Unknown Title") overwrite each other.
                    ingredients_dict[title] = ingredients
                else:
                    not_found_urls.append(url)

                progress_bar.update(1)
                # All tasks are already submitted at this point, so this pause
                # slows result collection; it does not space out the requests.
                time.sleep(2)

    return ingredients_dict, not_found_urls

In [None]:
# all_ingredients_type3, not_found_type3 = get_all_ingredients_from_urls_type3(not_found_type2)
# print("Ingredients Dictionary:", all_ingredients_type3)
# print("Not Found URLs:", not_found_type3)

Dạng số 4

In [None]:
def fetch_ingredients_type4(url):
    """Fetch a page and read ingredients after an inline "nguyên liệu" label.

    The label is the first ``<strong>``, ``<span>`` or ``<h3>`` whose
    lower-cased text contains "nguyên liệu". The ``<li>`` items of the next
    ``<ul>`` are tried first. If that yields nothing, the ``<p>`` siblings
    following the label are collected until an ``<h3>`` or ``<strong>``
    sibling is reached.

    Args:
        url: Address of the article to download.

    Returns:
        ``None`` when the request fails or the status code is not 200.
        Otherwise ``(title, ingredients)`` where ``ingredients`` is the
        newline-joined text, or ``None`` when nothing was collected.
    """
    def _is_ingredient_label(tag):
        return tag.name in ["strong", "span", "h3"] and "nguyên liệu" in tag.get_text().lower()

    try:
        page = requests.get(url, timeout=10)

        if page.status_code != 200:
            print(f"Failed to retrieve {url}. Status code: {page.status_code}")
            return None

        document = BeautifulSoup(page.content, "html.parser")

        heading = document.find("h1", class_="entry-title")
        title = heading.get_text().strip() if heading else "Unknown Title"

        label = document.find(_is_ingredient_label)
        if not label:
            return title, None

        # First choice: a bullet list somewhere after the label.
        bullet_list = label.find_next("ul")
        lines = [li.get_text().strip() for li in bullet_list.find_all("li")] if bullet_list else []

        # Fallback: plain paragraphs that are siblings of the label.
        if not lines:
            for node in label.find_next_siblings():
                if node.name in ("h3", "strong"):
                    break
                if node.name == "p":
                    lines.append(node.get_text().strip())

        return title, ("\n".join(lines) if lines else None)
    except requests.exceptions.RequestException as err:
        print(f"Error fetching {url}: {err}")
        return None

def get_all_ingredients_from_urls_type4(url_list):
    """Run ``fetch_ingredients_type4`` over ``url_list`` on a thread pool.

    Args:
        url_list: URLs to fetch.

    Returns:
        ``(ingredients_dict, not_found_urls)``. ``ingredients_dict`` maps page
        title to ingredient text. ``not_found_urls`` holds every URL that
        yielded no ingredients: failed request, no matching markup, or an
        exception while processing. It can be passed to another parser.
    """
    ingredients_dict = {}
    not_found_urls = []

    with ThreadPoolExecutor(max_workers=8) as executor:
        with tqdm(total=len(url_list), desc="Page Loading", unit="page") as progress_bar:
            future_to_url = {executor.submit(fetch_ingredients_type4, url): url for url in url_list}

            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    result = future.result()
                    title, ingredients = result if result else (None, None)
                except Exception as e:
                    # A URL that raised used to be logged and then dropped from
                    # both results; treat it as "not found" so it is not lost.
                    print(f"Error processing {url}: {e}")
                    title, ingredients = None, None

                if ingredients:
                    # Pages sharing a title (e.g. "Unknown Title") overwrite each other.
                    ingredients_dict[title] = ingredients
                else:
                    not_found_urls.append(url)

                progress_bar.update(1)
                # All tasks are already submitted at this point, so this pause
                # slows result collection; it does not space out the requests.
                time.sleep(2)

    return ingredients_dict, not_found_urls

In [None]:
# all_ingredients_type4, not_found_type4 = get_all_ingredients_from_urls_type4(not_found_type3)
# print("Ingredients Dictionary:", all_ingredients_type4)  # Print the title -> ingredients dictionary
# print("Not Found URLs:", not_found_type4)  # Print the URLs for which no ingredients were found

In [None]:
def fetch_ingredients_type5(url):
    """Fetch a page and read ingredients from paragraphs after an ``<h4>``.

    The section starts at the first ``<h4>`` whose lower-cased text contains
    "nguyên liệu". Every following ``<p>`` sibling is collected until an
    ``<h3>`` sibling is reached.

    Args:
        url: Address of the article to download.

    Returns:
        ``None`` when the request fails or the status code is not 200.
        Otherwise ``(title, ingredients)`` where ``ingredients`` is the
        newline-joined paragraph text, or ``None`` when nothing was collected.
    """
    def _is_ingredient_heading(tag):
        return tag.name == "h4" and "nguyên liệu" in tag.get_text().lower()

    try:
        page = requests.get(url, timeout=10)

        if page.status_code != 200:
            print(f"Failed to retrieve {url}. Status code: {page.status_code}")
            return None

        document = BeautifulSoup(page.content, "html.parser")

        heading = document.find("h1", class_="entry-title")
        title = heading.get_text().strip() if heading else "Unknown Title"

        section_heading = document.find(_is_ingredient_heading)
        if not section_heading:
            return title, None

        lines = []
        for node in section_heading.find_next_siblings():
            tag_name = node.name
            if tag_name == "h3":
                # An <h3> sibling ends the ingredient section.
                break
            if tag_name == "p":
                lines.append(node.get_text().strip())

        # One ingredient line per paragraph, joined into a single string.
        return title, ("\n".join(lines) if lines else None)
    except requests.exceptions.RequestException as err:
        print(f"Error fetching {url}: {err}")
        return None

def get_all_ingredients_from_urls_type5(url_list):
    """Run ``fetch_ingredients_type5`` over ``url_list`` on a thread pool.

    Args:
        url_list: URLs to fetch.

    Returns:
        ``(ingredients_dict, not_found_urls)``. ``ingredients_dict`` maps page
        title to ingredient text. ``not_found_urls`` holds every URL that
        yielded no ingredients: failed request, no matching markup, or an
        exception while processing. It can be passed to another parser.
    """
    ingredients_dict = {}
    not_found_urls = []

    with ThreadPoolExecutor(max_workers=8) as executor:
        with tqdm(total=len(url_list), desc="Page Loading", unit="page") as progress_bar:
            future_to_url = {executor.submit(fetch_ingredients_type5, url): url for url in url_list}

            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    result = future.result()
                    title, ingredients = result if result else (None, None)
                except Exception as e:
                    # A URL that raised used to be logged and then dropped from
                    # both results; treat it as "not found" so it is not lost.
                    print(f"Error processing {url}: {e}")
                    title, ingredients = None, None

                if ingredients:
                    # Pages sharing a title (e.g. "Unknown Title") overwrite each other.
                    ingredients_dict[title] = ingredients
                else:
                    not_found_urls.append(url)

                progress_bar.update(1)
                # All tasks are already submitted at this point, so this pause
                # slows result collection; it does not space out the requests.
                time.sleep(2)

    return ingredients_dict, not_found_urls


In [None]:
# all_ingredients_type5, not_found_type5 = get_all_ingredients_from_urls_type5(not_found_type4)
# print("Ingredients Dictionary:", all_ingredients_type5)
# print("Not Found URLs:", not_found_type5)

In [None]:
def fetch_ingredients_type6(url):
    """Fetch a page and read ingredients from paragraphs between two marker ``<p>``s.

    The section starts after the first ``<p>`` whose lower-cased text contains
    "nguyên liệu" and ends at the first following ``<p>`` sibling whose
    lower-cased text contains "cách làm". Non-empty ``<p>`` siblings in between
    are collected.

    Args:
        url: Address of the article to download.

    Returns:
        ``None`` when the request fails or the status code is not 200.
        Otherwise ``(title, ingredients)`` where ``ingredients`` is the
        newline-joined paragraph text, or ``None`` when nothing was collected.
    """
    def _is_start_paragraph(tag):
        return tag.name == "p" and "nguyên liệu" in tag.get_text().lower()

    try:
        page = requests.get(url, timeout=10)

        if page.status_code != 200:
            print(f"Failed to retrieve {url}. Status code: {page.status_code}")
            return None

        document = BeautifulSoup(page.content, "html.parser")

        heading = document.find("h1", class_="entry-title")
        title = heading.get_text().strip() if heading else "Unknown Title"

        start = document.find(_is_start_paragraph)
        if not start:
            return title, None

        lines = []
        for node in start.find_next_siblings():
            # Only paragraphs matter, both as content and as the end marker.
            if node.name != "p":
                continue
            raw_text = node.get_text()
            if "cách làm" in raw_text.lower():
                break
            cleaned = raw_text.strip()
            if cleaned:
                lines.append(cleaned)

        return title, ("\n".join(lines) if lines else None)
    except requests.exceptions.RequestException as err:
        print(f"Error fetching {url}: {err}")
        return None

def get_all_ingredients_from_urls_type6(url_list):
    """Run ``fetch_ingredients_type6`` over ``url_list`` on a thread pool.

    Args:
        url_list: URLs to fetch.

    Returns:
        ``(ingredients_dict, not_found_urls)``. ``ingredients_dict`` maps page
        title to ingredient text. ``not_found_urls`` holds every URL that
        yielded no ingredients: failed request, no matching markup, or an
        exception while processing. It can be passed to another parser.
    """
    ingredients_dict = {}
    not_found_urls = []

    with ThreadPoolExecutor(max_workers=8) as executor:
        with tqdm(total=len(url_list), desc="Page Loading", unit="page") as progress_bar:
            future_to_url = {executor.submit(fetch_ingredients_type6, url): url for url in url_list}

            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    result = future.result()
                    title, ingredients = result if result else (None, None)
                except Exception as e:
                    # A URL that raised used to be logged and then dropped from
                    # both results; treat it as "not found" so it is not lost.
                    print(f"Error processing {url}: {e}")
                    title, ingredients = None, None

                if ingredients:
                    # Pages sharing a title (e.g. "Unknown Title") overwrite each other.
                    ingredients_dict[title] = ingredients
                else:
                    not_found_urls.append(url)

                progress_bar.update(1)
                # All tasks are already submitted at this point, so this pause
                # slows result collection; it does not space out the requests.
                time.sleep(2)

    return ingredients_dict, not_found_urls

In [None]:
def fetch_ingredients_type7(url):
    """Fetch a page and read ingredients from paragraphs after an ``<h3>``.

    The section starts at the first ``<h3>`` whose lower-cased text contains
    "nguyên liệu" and ends at the first following ``<h3>`` sibling whose
    lower-cased text contains "cách làm". Non-empty ``<p>`` siblings in between
    are collected.

    Args:
        url: Address of the article to download.

    Returns:
        ``None`` when the request fails or the status code is not 200.
        Otherwise ``(title, ingredients)`` where ``ingredients`` is the
        newline-joined paragraph text, or ``None`` when nothing was collected.
    """
    def _is_ingredient_heading(tag):
        return tag.name == "h3" and "nguyên liệu" in tag.get_text().lower()

    try:
        page = requests.get(url, timeout=10)

        if page.status_code != 200:
            print(f"Failed to retrieve {url}. Status code: {page.status_code}")
            return None

        document = BeautifulSoup(page.content, "html.parser")

        heading = document.find("h1", class_="entry-title")
        title = heading.get_text().strip() if heading else "Unknown Title"

        section_heading = document.find(_is_ingredient_heading)
        if not section_heading:
            return title, None

        lines = []
        for node in section_heading.find_next_siblings():
            tag_name = node.name
            if tag_name == "h3":
                # Only an <h3> mentioning "cách làm" closes the section;
                # other <h3> siblings are skipped over.
                if "cách làm" in node.get_text().lower():
                    break
            elif tag_name == "p":
                cleaned = node.get_text().strip()
                if cleaned:
                    lines.append(cleaned)

        return title, ("\n".join(lines) if lines else None)
    except requests.exceptions.RequestException as err:
        print(f"Error fetching {url}: {err}")
        return None

def get_all_ingredients_from_urls_type7(url_list):
    """Run ``fetch_ingredients_type7`` over ``url_list`` on a thread pool.

    Args:
        url_list: URLs to fetch.

    Returns:
        ``(ingredients_dict, not_found_urls)``. ``ingredients_dict`` maps page
        title to ingredient text. ``not_found_urls`` holds every URL that
        yielded no ingredients: failed request, no matching markup, or an
        exception while processing. It can be passed to another parser.
    """
    ingredients_dict = {}
    not_found_urls = []

    with ThreadPoolExecutor(max_workers=8) as executor:
        with tqdm(total=len(url_list), desc="Page Loading", unit="page") as progress_bar:
            future_to_url = {executor.submit(fetch_ingredients_type7, url): url for url in url_list}

            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    result = future.result()
                    title, ingredients = result if result else (None, None)
                except Exception as e:
                    # A URL that raised used to be logged and then dropped from
                    # both results; treat it as "not found" so it is not lost.
                    print(f"Error processing {url}: {e}")
                    title, ingredients = None, None

                if ingredients:
                    # Pages sharing a title (e.g. "Unknown Title") overwrite each other.
                    ingredients_dict[title] = ingredients
                else:
                    not_found_urls.append(url)

                progress_bar.update(1)
                # All tasks are already submitted at this point, so this pause
                # slows result collection; it does not space out the requests.
                time.sleep(2)

    return ingredients_dict, not_found_urls

In [None]:
# u_list = ['https://kokotaru.com/blackberry-crumble-cheesecake-bars/']
# all_ingredients, not_found = get_all_ingredients_from_urls_type1(u_list)
# print("Ingredients Dictionary:", all_ingredients)
# print("Not Found URLs:", not_found)

---

In [None]:
def fetch_ingredients_with_selenium(url):
    """Load a page in headless Chrome and read ingredients between marker ``<p>``s.

    The parsing rule matches ``fetch_ingredients_type6``: start after the first
    ``<p>`` whose lower-cased text contains "nguyên liệu", stop at the first
    following ``<p>`` sibling containing "cách làm", and keep the non-empty
    ``<p>`` siblings in between.

    Args:
        url: Address of the article to open.

    Returns:
        ``(title, ingredients)`` where ``title`` is the ``h1.entry-title`` text
        (or ``"Unknown Title"``) and ``ingredients`` is the newline-joined
        paragraph text, or ``None`` when nothing was collected.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_options)

    try:
        browser.get(url)
        # Fixed pause before reading the DOM.
        # NOTE(review): presumably lets client-side scripts finish rendering -- confirm.
        time.sleep(3)

        document = BeautifulSoup(browser.page_source, "html.parser")

        heading = document.find("h1", class_="entry-title")
        title = heading.get_text().strip() if heading else "Unknown Title"

        start = document.find(lambda tag: tag.name == "p" and "nguyên liệu" in tag.get_text().lower())
        if not start:
            return title, None

        lines = []
        for node in start.find_next_siblings():
            if node.name != "p":
                continue
            raw_text = node.get_text()
            if "cách làm" in raw_text.lower():
                break
            cleaned = raw_text.strip()
            if cleaned:
                lines.append(cleaned)

        return title, ("\n".join(lines) if lines else None)
    finally:
        # Always shut the browser down, including when parsing raises.
        browser.quit()

In [None]:
# url = "https://kokotaru.com/cach-lam-banh-kem-tra-xanh/"
# title, ingredients = fetch_ingredients_with_selenium(url)
# print("Title:", title)
# print("Ingredients:", ingredients)

In [None]:
def save_html_for_diagnostics(url, output_path="diagnostics.html"):
    """Download ``url`` and dump its HTML to a file for manual inspection.

    Args:
        url: Address of the page to download.
        output_path: File to write the HTML to, encoded as UTF-8. Defaults to
            ``"diagnostics.html"``, the file name that was previously
            hard-coded, so existing calls behave as before.

    Returns:
        None. The outcome is reported with ``print``. Request errors and
        non-200 responses are reported and nothing is written.
    """
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            with open(output_path, "w", encoding="utf-8") as file:
                file.write(response.text)
            # With the default path this prints the same message as before.
            print(f"Đã lưu HTML vào {output_path} để kiểm tra.")
        else:
            print(f"Failed to retrieve {url}. Status code: {response.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")


In [None]:
# save_html_for_diagnostics("https://kokotaru.com/cach-lam-banh-kem-tra-xanh/")

---

In [None]:
def fetch_ingredients_and_title(url):
    """Fetch a page and return its title and ``***``-joined ingredient names.

    Args:
        url: Address of the article to download.

    Returns:
        ``(None, None)`` when the request fails or the status code is not 200.
        Otherwise ``(title, ingredients)``. ``title`` is the text of the
        ``<h1>`` inside ``div.post-header``, or ``"Unknown Title"`` when either
        element is missing. ``ingredients`` is the text of every
        ``span.ingredient-item__title`` joined with ``"***"``, which is an
        empty string when there are none.
    """
    try:
        response = requests.get(url, timeout=10)

        if response.status_code != 200:
            return None, None

        soup = BeautifulSoup(response.content, "html.parser")

        # find() returns None when the header div is absent. Chaining .find("h1")
        # on it raised an AttributeError that nothing here catches.
        header = soup.find("div", class_="post-header")
        title_tag = header.find("h1") if header else None
        title = title_tag.get_text().strip() if title_tag else "Unknown Title"

        ingredient_tags = soup.find_all("span", class_="ingredient-item__title")
        ingredients = '***'.join(tag.get_text().strip() for tag in ingredient_tags)

        return title, ingredients
    except requests.exceptions.RequestException:
        return None, None

def fetch_ingredients_from_multiple_urls(url_list):
    """Fetch ingredients for many URLs, then retry the failures up to twice.

    The first pass runs ``fetch_ingredients_and_title`` on a thread pool. URLs
    that return no title or no ingredients are retried sequentially in up to
    two further passes. Progress and a final summary are printed.

    Args:
        url_list: URLs to fetch.

    Returns:
        ``(all_ingredients, retry_2_failed_urls)``. ``all_ingredients`` maps
        page title to the ingredient string. ``retry_2_failed_urls`` lists the
        URLs that still failed after every attempt, and is empty when nothing
        needed retrying.
    """
    all_ingredients = {}
    failed_urls = []

    def _safe_fetch(url):
        # One page that raises must not abort the whole batch; count it as failed.
        try:
            return fetch_ingredients_and_title(url)
        except Exception as e:
            print(f"Error processing {url}: {e}")
            return None, None

    def _record(url, title, ingredients, still_failing):
        # Store a successful result, otherwise remember the URL for the next pass.
        if title and ingredients:
            all_ingredients[title] = ingredients
        else:
            still_failing.append(url)

    def _retry_pass(urls, ordinal):
        # Sequentially re-fetch `urls`; return the ones that failed again.
        still_failing = []
        with tqdm(total=len(urls), desc=f"Retrying Failed URLs ({ordinal} attempt)", unit="article") as retry_progress_bar:
            for url in urls:
                title, ingredients = _safe_fetch(url)
                _record(url, title, ingredients, still_failing)
                retry_progress_bar.update(1)
                time.sleep(3)
        return still_failing

    with ThreadPoolExecutor(max_workers=8) as executor:
        with tqdm(total=len(url_list), desc="Fetching Ingredients", unit="article") as progress_bar:
            future_to_url = {executor.submit(_safe_fetch, url): url for url in url_list}

            for future in as_completed(future_to_url):
                url = future_to_url[future]
                title, ingredients = future.result()
                _record(url, title, ingredients, failed_urls)

                progress_bar.update(1)
                time.sleep(3)

    print(f"\nInitial fetch completed. Failed URLs: {len(failed_urls)}")

    # Both retry lists start empty so the summary below works when no retry
    # was needed. Before, retry_1_failed_urls was undefined in that case and
    # the function died with a NameError after a fully successful first pass.
    retry_1_failed_urls = []
    retry_2_failed_urls = []

    if failed_urls:
        print("\nRetrying failed URLs (1st attempt)...")
        retry_1_failed_urls = _retry_pass(failed_urls, "1st")
        print(f"\n1st retry completed. Failed URLs after 1st retry: {len(retry_1_failed_urls)}")

    if retry_1_failed_urls:
        print("\nRetrying failed URLs (2nd attempt)...")
        retry_2_failed_urls = _retry_pass(retry_1_failed_urls, "2nd")
        print(f"\n2nd retry completed. Final failed URLs: {len(retry_2_failed_urls)}")

    print("\nFetching Completed")
    print(f"Total successful pages: {len(all_ingredients)}")
    print(f"Failed URLs after all retries: {len(retry_2_failed_urls)}")

    if retry_2_failed_urls:
        print("\nFinal Failed URL List:")
        for failed_url in retry_2_failed_urls:
            print(f" - {failed_url}")

    return all_ingredients, retry_2_failed_urls

In [None]:
# Load the KitchenArt article URLs and scrape them (thread-pooled first pass,
# then up to two sequential retry passes). This performs live HTTP requests.
# NOTE(review): 'r_b_line' presumably yields one URL per line -- confirm
# against Shared_Functions.read_from_file.
url_list = read_from_file('kitchenart_article_urls_first_25.txt', 'r_b_line')
all_ingredients, failed_urls = fetch_ingredients_from_multiple_urls(url_list)

In [None]:
# Keys of `all_ingredients` are recipe titles (fetch_ingredients_from_multiple_urls
# stores results under the page title), so the loop variable is named accordingly.
for title, ingredients in all_ingredients.items():
    print(f"\nIngredients for {title}:")
    print(ingredients)