In [None]:
from newspaper import Article
import requests
import os
from PIL import Image
from io import BytesIO

# URL of the page to download images from.
# NOTE(review): this points at an eBay search-results page rather than a news
# article — confirm that newspaper's image extraction is what is wanted here.
article_url = "https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2334524.m570.l1313&_nkw=laptop&_sacat=0&LH_TitleDesc=0&_osacat=0&_odkw=laptop"

# An image is kept when at least one of its sides reaches this many pixels.
MIN_IMAGE_SIDE = 200
# Lower-cased Pillow format names that are saved to disk.
SUPPORTED_FORMATS = ("jpeg", "jpg", "png")
# Seconds to wait for each image request before giving up, so that a stalled
# server cannot hang the whole run.
REQUEST_TIMEOUT = 10

# Download and parse the page so that `article.images` is populated.
article = Article(article_url)
article.download()
article.parse()

# Create a directory to store images.
os.makedirs("article_images", exist_ok=True)

# Download and save each image found on the page. Failures are reported and
# skipped so that one bad image does not stop the remaining downloads.
for i, img_url in enumerate(article.images):
    try:
        response = requests.get(img_url, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f"Error downloading image {i}: HTTP status code {response.status_code}")
            continue

        # Inspect the payload with Pillow to learn its real format and size;
        # the context manager releases the decoder once the checks are done.
        with Image.open(BytesIO(response.content)) as img:
            img_format = (img.format or "").lower()
            large_enough = img.width >= MIN_IMAGE_SIDE or img.height >= MIN_IMAGE_SIDE

        if img_format not in SUPPORTED_FORMATS or not large_enough:
            print(
                f"Skipping image {i}: Unsupported format or smaller than "
                f"{MIN_IMAGE_SIDE}px in both dimensions."
            )
            continue

        # Write the original bytes unchanged; the extension follows the
        # format Pillow detected rather than whatever the URL claims.
        with open(f"article_images/image_{i}.{img_format}", "wb") as img_file:
            img_file.write(response.content)
        print(f"Image {i} downloaded successfully.")
    except Exception as e:
        # Best-effort loop: report the problem and move on to the next image.
        print(f"Error downloading image {i}: {e}")



In [None]:
import hashlib
import io
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import Image
from selenium import webdriver
from selenium.webdriver import ChromeOptions


def get_content_from_url(url):
    """Return the rendered HTML of *url* using headless Chrome.

    Args:
        url: Address of the page to load.

    Returns:
        The page source reported by the browser after loading the page.

    Raises:
        Whatever the Selenium driver raises when the page cannot be loaded;
        the browser is shut down before the exception propagates.
    """
    options = ChromeOptions()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)
        return driver.page_source
    finally:
        # Always shut the browser down, even when loading fails, so that no
        # Chrome process is left running.
        driver.quit()


def parse_image_urls(content, classes, location, source):
    """Extract unique attribute values from elements nested in matching containers.

    Args:
        content: HTML markup to parse.
        classes: Value of the ``class`` attribute that selects the containers.
        location: Tag name searched for inside each container (e.g. ``"img"``).
        source: Attribute read from the located tag (e.g. ``"src"``).

    Returns:
        A list of attribute values in document order, without duplicates.
        Containers with no matching tag, and tags lacking the attribute or
        carrying an empty value, are skipped.
    """
    soup = BeautifulSoup(content, "html.parser")
    results = []
    for container in soup.find_all(attrs={"class": classes}):
        element = container.find(location)
        if element is None:
            # The container holds no tag of the requested kind.
            continue
        value = element.get(source)
        # Compare the extracted value (not the tag) against what was already
        # collected. List membership is used instead of a set because
        # BeautifulSoup returns lists for multi-valued attributes, which are
        # not hashable.
        if not value or value in results:
            continue
        results.append(value)
    return results


def save_urls_to_csv(image_urls):
    """Write *image_urls* to ``links.csv`` as a single ``links`` column.

    The file is created in the current working directory, encoded as UTF-8,
    and written without the DataFrame index.
    """
    pd.DataFrame(data={"links": image_urls}).to_csv(
        "links.csv",
        encoding="utf-8",
        index=False,
    )


def get_and_save_image_to_file(image_url, output_dir):
    """Download one image and store it as a PNG named after its content hash.

    Args:
        image_url: Address of the image to fetch.
        output_dir: Existing directory (``str`` or ``Path``) that receives
            the file.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an HTTP error status.
        OSError: The payload is not an image Pillow can read, or the file
            could not be written.
    """
    # A timeout keeps a stalled server from blocking forever, and the status
    # check stops error pages from being handed to the image decoder.
    response = requests.get(image_url, timeout=30)
    response.raise_for_status()
    image_content = response.content

    # The first 10 hex digits of the SHA-1 digest give a stable file name, so
    # identical payloads map to the same file. This is not used for security.
    filename = hashlib.sha1(image_content).hexdigest()[:10] + ".png"
    file_path = Path(output_dir) / filename

    with Image.open(io.BytesIO(image_content)) as image:
        # Convert to RGB so every source mode is written the same way.
        # No `quality` argument is passed: it is not one of the PNG writer's
        # save options.
        image.convert("RGB").save(file_path, "PNG")


def main():
    """Scrape image URLs from an eBay search page, record them, and download them.

    The URLs are written to ``links.csv`` and the images are saved into the
    ``article_images`` directory. A failure on one image is reported and the
    remaining images are still processed.
    """
    url = "https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2334524.m570.l1313&_nkw=laptop&_sacat=0&LH_TitleDesc=0&_osacat=0&_odkw=laptop"
    content = get_content_from_url(url)
    image_urls = parse_image_urls(
        content=content, classes="s-item__image-wrapper image-treatment", location="img", source="src"
    )
    save_urls_to_csv(image_urls)

    # Create the destination once, before the loop, instead of per image.
    output_dir = Path("article_images")
    output_dir.mkdir(parents=True, exist_ok=True)

    for image_url in image_urls:
        if not image_url:
            # Nothing to fetch for tags that carried no usable URL.
            continue
        try:
            get_and_save_image_to_file(image_url, output_dir=output_dir)
        except (requests.RequestException, OSError) as e:
            # Network errors and unreadable or unwritable images are reported
            # and skipped so that one bad URL does not abort the whole run.
            print(f"Error downloading image {image_url}: {e}")


# Script entry point: run the scrape-and-download pipeline only when this
# code is executed directly, then print a completion message.
if __name__ == "__main__":
    main()
    print("Done!")