In [2]:
from pathlib import Path
import requests

In [3]:
class BingImageCrawler:
    """
    A simple web crawler to download images from Bing Image Search.
    """

    def __init__(self, output_dir="images", max_images=50, delay=1.0):
        """
        Initialize the crawler.

        Args:
            output_dir: Directory to save downloaded images
            max_images: Maximum number of images to download
            delay: Delay between requests in seconds (to be respectful)
        """
        self.output_dir = Path(output_dir)
        self.max_images = max_images
        self.delay = delay
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": (
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/91.0.4472.124 Safari/537.36"
                )
            }
        )

    def search_bing_images(self, query):
        """
        Search Bing for images and return image URLs.

        Args:
            query: Search query string

        Returns:
            List of image URLs
        """
        search_url = (
            f"https://www.bing.com/images/search?" f"q={query}&form=HDRSC3&first=1"
        )

        try:
            response = self.session.get(search_url, timeout=10)
            response.raise_for_status()
            with open("response.html", "w", encoding="utf-8") as f:
                f.write(response.text)
        except Exception as e:
            print(f"Error searching Bing: {e}")
            return []

In [4]:
cralwer = BingImageCrawler()

In [None]:
# cralwer.search_bing_images("elon musk")

In [5]:
from bs4 import BeautifulSoup
with open("response.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

In [6]:
image_urls = []

In [7]:
# Try to find JSON data with image URLs
scripts = soup.find_all("script")
scripts

[<script nonce="CZILAxNck2OhDPiJPq78Mm+KckEg07o1OjzdS2cGNUA=" type="text/javascript">//<![CDATA[
 
 si_ST=new Date
 
 //]]></script>,
 <script nonce="CZILAxNck2OhDPiJPq78Mm+KckEg07o1OjzdS2cGNUA=" type="text/javascript">//<![CDATA[
 _G={Region:"HK",Lang:"zh-HK",ST:(typeof si_ST!=='undefined'?si_ST:new Date),Mkt:"zh-HK",RevIpCC:"hk",RTL:false,Ver:"43",IG:"A463BA497B4F44C7819699A1CF9F80D0",EventID:"68fb8361f6304198b8d461bc837d3010",V:"images",P:"images",DA:"PUSE01",CID:"0CBE8DDCDA656E8A08E19B53DB6D6FF6",SUIH:"DRltaK-Fu2EARYdhXAJeeg",adc:"b_ad",EF:{cookss:1,bmcov:1,crossdomainfix:1,bmasynctrigger:1,bmasynctrigger3:1,getslctspt:1,newtabsloppyclick:1,chevroncheckmousemove:1,sharepreview:1,shareoutimage:1,sharefixreadnum:1,clickbackRSFlare:1,clickbackRSAfterOnP1:1,clickbackRSonTopW:1,clickbackRSonAdAlgo:1,clickbackAjaxRsFlare:1,sharepreviewthumbnailid:1,shareencodefix:1,chatskip2content:1,fablogfix:1,uaclickbackas:1,uaasnodisappear:1,clearuspreo:1,fixTypeToSearchIssueFlare:1},gpUrl:"\/fd\/ls\

In [11]:
for script in scripts:
    print(script)
    print('--------------------------------')
    print(script.string)
    print("murl" in script.string)
    break

<script nonce="CZILAxNck2OhDPiJPq78Mm+KckEg07o1OjzdS2cGNUA=" type="text/javascript">//<![CDATA[

si_ST=new Date

//]]></script>
--------------------------------
//<![CDATA[

si_ST=new Date

//]]>
False


In [12]:
import re
for script in scripts:
    if script.string and "murl" in script.string:
        # Extract URLs from JSON-like structures
        matches = re.findall(r'"murl":"([^"]+)"', script.string)
        print(matches)
        break

In [13]:
for script in scripts:
    if script.string and "mediaurl" in script.string.lower():
        matches = re.findall(
            r'"mediaurl":"([^"]+)"', script.string, re.IGNORECASE
        )
        print(matches)
        break

[]


In [14]:
for script in scripts:
    if script.string and "murl" in script.string:
        # Extract URLs from JSON-like structures
        matches = re.findall(r'"murl":"([^"]+)"', script.string)
        image_urls.extend(matches)

    if script.string and "mediaurl" in script.string.lower():
        matches = re.findall(
            r'"mediaurl":"([^"]+)"', script.string, re.IGNORECASE
        )
        image_urls.extend(matches)

In [15]:
image_urls

[]

In [16]:
soup.find_all("img", class_="mimg")

[<img alt="elon musk 的圖片結果" class="cimg mimg" height="212" src="https://tse4.mm.bing.net/th/id/OIP.5Ep_ATiFsxwY9uqiRXxIHwHaFj?w=282&amp;h=212&amp;c=7&amp;r=0&amp;o=7&amp;cb=12&amp;pid=1.7&amp;rm=3" style="background-color:#334998;color:#334998" width="282"/>,
 <img alt="elon musk 的圖片結果" class="mimg" height="212" src="https://tse4.mm.bing.net/th/id/OIP.W1mppOQtJdEn3w34SpCKcwHaFj?w=283&amp;h=212&amp;c=7&amp;r=0&amp;o=7&amp;cb=12&amp;pid=1.7&amp;rm=3" style="background-color:#360abe;color:#360abe" width="283"/>,
 <img alt="elon musk 的圖片結果" class="cimg mimg rms_img" height="212" id="emb44FCEB74" loading="lazy" src="https://tse3.mm.bing.net/th/id/OIP.tmmxsctQdTCbVCX4LGDIuQHaFj?w=281&amp;h=212&amp;c=7&amp;r=0&amp;o=7&amp;cb=12&amp;pid=1.7&amp;rm=3" style="background-color:#0c1670;color:#0c1670" width="281"/>,
 <img alt="elon musk 的圖片結果" class="mimg rms_img" height="190" id="emb462DF4ACD" loading="lazy" src="https://tse1.mm.bing.net/th/id/OIP.COO2XQb0DDrvq7HTxCC0PQHaLT?w=125&amp;h=190&amp;c=7

In [18]:
for img in soup.find_all("img", class_="mimg"):
    if img.get("src"):
        image_urls.append(img["src"])
        print(img["src"])
        break

https://tse4.mm.bing.net/th/id/OIP.5Ep_ATiFsxwY9uqiRXxIHwHaFj?w=282&h=212&c=7&r=0&o=7&cb=12&pid=1.7&rm=3


In [19]:
image_urls

['https://tse4.mm.bing.net/th/id/OIP.5Ep_ATiFsxwY9uqiRXxIHwHaFj?w=282&h=212&c=7&r=0&o=7&cb=12&pid=1.7&rm=3',
 'https://tse4.mm.bing.net/th/id/OIP.5Ep_ATiFsxwY9uqiRXxIHwHaFj?w=282&h=212&c=7&r=0&o=7&cb=12&pid=1.7&rm=3']

In [20]:
for img in soup.find_all("img", class_="mimg"):
    if img.get("data-src"):
        image_urls.append(img["data-src"])
        print(img["data-src"])
        break

https://tse3.mm.bing.net/th/id/OIP.uNR-U4_yv95Y9LQl_9ZrXgHaEK?w=272&h=180&c=7&r=0&o=7&cb=12&pid=1.7&rm=3


In [21]:
for img in soup.find_all("img", class_="mimg"):
    if img.get("src"):
        image_urls.append(img["src"])
    if img.get("data-src"):
        image_urls.append(img["data-src"])

In [23]:
len(image_urls)

35

In [24]:
soup.find_all("img")

[<img class="rms_img" height="18" id="sbi_b" role="none" src="https://r.bing.com/rp/f21jlSMmEDN43OaavcdaB-7Phq0.svg" width="18"/>,
 <img alt="關閉" class="clsbtnimg rms_img" height="16" src="https://r.bing.com/rp/fdVZU4ttbw8NDRm6H3I5BW3_vCo.svg" width="16"/>,
 <img alt="關閉" class="clsoptbtnimg rms_img" height="20" src="https://r.bing.com/rp/4L4QdyjTv0HYE2Ig2ol9eYoqxg8.svg" width="20"/>,
 <img class="infbtnimg rms_img" height="16" loading="lazy" role="presentation" src="https://r.bing.com/rp/Fsa_OI0AplCnVoXGca8ALOo0S0s.svg" width="16"/>,
 <img alt="拖放圖片到這裡" class="icon1 rms_img" flex-shrink="0" src="https://r.bing.com/rp/UusW4P7f5q9mXctgEuZhyIziufA.svg"/>,
 <img alt="拖放圖片到這裡" class="icon2 rms_img" flex-shrink="0" src="https://r.bing.com/rp/4A9D07RlW6M7gPmPQQydX3khDJc.svg"/>,
 <img class="rms_img" height="50" id="loadingimg" loading="lazy" role="presentation" src="https://r.bing.com/rp/OHA5u4HW70mLbGkOE41WRDXcYTY.svg" width="50"/>,
 <img alt="貼上影像連結以搜尋" class="psticon rms_img" height="16" 

In [25]:
for img in soup.find_all("img"):
    src = img.get("src") or img.get("data-src")
    if src and src.startswith("http"):
        if "logo" not in src.lower():
            exts = [".jpg", ".jpeg", ".png", ".webp"]
            if any(ext in src.lower() for ext in exts):
                print(src)
                image_urls.append(src)

https://r.bing.com/rp/ytiieusXgM2K8bLkEDP-AS1ePds.png


In [31]:
image_urls = list(set(image_urls))

In [32]:
len(image_urls)

33

In [33]:
from urllib.parse import urlparse
import os

In [35]:
url = image_urls[0]
url

'https://tse4.mm.bing.net/th/id/OIP.D_eaxuvNG3OmFZqek_1V2gHaE7?w=300&h=200&c=7&r=0&o=7&cb=12&pid=1.7&rm=3'

In [36]:
parsed = urlparse(url)

In [37]:
parsed

ParseResult(scheme='https', netloc='tse4.mm.bing.net', path='/th/id/OIP.D_eaxuvNG3OmFZqek_1V2gHaE7', params='', query='w=300&h=200&c=7&r=0&o=7&cb=12&pid=1.7&rm=3', fragment='')

In [38]:
parsed.path

'/th/id/OIP.D_eaxuvNG3OmFZqek_1V2gHaE7'

In [39]:
os.path.splitext(parsed.path)

('/th/id/OIP', '.D_eaxuvNG3OmFZqek_1V2gHaE7')

In [40]:
ext = os.path.splitext(parsed.path)[1]
ext

'.D_eaxuvNG3OmFZqek_1V2gHaE7'

In [41]:
valid_exts = [".jpg", ".jpeg", ".png", ".webp", ".bmp"]
if not ext or ext not in valid_exts:
    ext = ".jpg"
ext 

'.jpg'

In [42]:
query = "elon musk"

In [43]:
query.replace(' ', '_')

'elon_musk'

In [44]:
i  = 0

In [45]:
f"{query.replace(' ', '_')}_{i+1:03d}{ext}"

'elon_musk_001.jpg'

In [None]:
 for i, url in enumerate(image_urls[: 20]):
    # Determine file extension
    parsed = urlparse(url)
    ext = os.path.splitext(parsed.path)[1]
    valid_exts = [".jpg", ".jpeg", ".png", ".webp", ".bmp"]
    if not ext or ext not in valid_exts:
        ext = ".jpg"

    filename = f"{query.replace(' ', '_')}_{i+1:03d}{ext}"
    save_path = output_path / filename

    print(f"Downloading {i+1}/{len(image_urls)}: {filename}")