## Import libraries

In [1]:
import io
from pathlib import Path
import hashlib
import pytube, os, re
import numpy as np
import pandas as pd
import requests
import sys
import time

from PIL import Image
from bs4 import BeautifulSoup
from tqdm import tqdm

from selenium import webdriver
from selenium.webdriver.edge.service import Service as EdgeService
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver.chrome import service
from webdriver_manager.opera import OperaDriverManager

from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.common.action_chains import ActionChains

## Functions

In [2]:
def get_content_from_url(url, scroll_steps=500, scroll_pause=7):
    """Open ``url`` in Microsoft Edge, scroll down repeatedly, and return the HTML.

    The page is scrolled 500 pixels at a time with a sleep after every step,
    presumably so that content loaded on scroll has time to appear before the
    page source is captured. With the defaults this sleeps for
    500 * 7 = 3500 seconds in total.

    Args:
        url: Address of the page to load.
        scroll_steps: Number of 500-pixel scroll steps to perform.
        scroll_pause: Seconds to sleep after each scroll step.

    Returns:
        ``driver.page_source`` as read after the final scroll step.
    """
    # NOTE(review): ``executable_path`` is deprecated in Selenium 4 in favour of
    # ``service=EdgeService(...)``. Left unchanged so the cell keeps working on
    # the Selenium version this notebook was run with; confirm before upgrading.
    driver = webdriver.Edge(executable_path="msedgedriver.exe")
    try:
        driver.get(url)
        for _ in range(scroll_steps):
            driver.execute_script("window.scrollBy(0, 500);")
            time.sleep(scroll_pause)
        return driver.page_source
    finally:
        # Shut the browser down even when loading or scrolling raises, so no
        # Edge / msedgedriver process is left behind.
        driver.quit()

In [3]:
def parse_image_urls(content, classes, location, source):
    """Extract one attribute value per matching element of an HTML document.

    For every element whose ``class`` matches ``classes``, the first descendant
    ``location`` tag is located and the value of its ``source`` attribute is
    collected.

    Args:
        content: HTML markup to parse.
        classes: Value used as the ``class`` attribute filter.
        location: Tag name searched for inside each matching element
            (for example ``"img"``).
        source: Attribute read from that tag (for example ``"src"``).

    Returns:
        The non-empty attribute values in document order, without duplicates.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(content, "html.parser")
    results = []
    for container in soup.find_all(attrs={"class": classes}):
        tag = container.find(location)
        if tag is None:
            # No such descendant in this element; skip it instead of failing
            # with AttributeError on ``None.get``.
            continue
        value = tag.get(source)
        # De-duplicate on the extracted value itself; comparing the Tag object
        # against the stored strings never matches.
        if value and value not in results:
            results.append(value)
    return results

In [4]:
def save_urls_to_csv(image_urls, csv_path="links.csv"):
    """Write the unique, non-missing URLs to a single-column CSV file.

    Args:
        image_urls: List of URL strings. Duplicate and missing entries are
            dropped before writing.
        csv_path: Destination file. Defaults to ``"links.csv"`` in the current
            working directory.
    """
    # Build a fresh frame with chained calls rather than mutating in place.
    links = (
        pd.DataFrame({"links": image_urls})
        .drop_duplicates()
        .dropna(how="all")
    )
    links.to_csv(csv_path, index=False, encoding="utf-8")

In [5]:
def get_and_save_image_to_file(image_url, output_dir, i, prefix="Category1",
                               size=(300, 300), timeout=30):
    """Download one image, resize it, and save it as a JPEG file.

    The image is converted to RGB, resized to exactly ``size`` (the aspect
    ratio is not preserved) and written to ``output_dir`` as
    ``{prefix}_{i+1}.jpg``.

    Args:
        image_url: URL fetched with ``requests.get``.
        output_dir: Destination directory; created if it does not exist.
        i: Zero-based index of the image; the file name uses ``i + 1``.
        prefix: File-name prefix.
        size: ``(width, height)`` passed to ``Image.resize``.
        timeout: Timeout in seconds passed to ``requests.get``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the response body cannot be opened as an image or the
            file cannot be written.
    """
    # Without a timeout a stalled server would block the notebook forever.
    response = requests.get(image_url, timeout=timeout)
    # Fail on 4xx/5xx here rather than handing an error page to PIL.
    response.raise_for_status()

    # ``Image.save`` does not create missing directories.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    file_path = output_dir / f"{prefix}_{i+1}.jpg"

    # The context manager releases the decoded source image even if the
    # conversion, resize, or save step raises.
    with Image.open(io.BytesIO(response.content)) as source_image:
        resized_image = source_image.convert("RGB").resize(size)
        resized_image.save(file_path)

## Category 1: Unsplash wallpapers

In [6]:
# Scrape image URLs from the Unsplash wallpapers page.
# NOTE(review): "MorZF" looks like a generated CSS class name and may change
# when the site is redeployed; confirm it still matches before re-running.
url = "https://unsplash.com/t/wallpapers"
content = get_content_from_url(url)
image_urls = parse_image_urls(content=content,
                              classes="MorZF",
                              location="img",
                              source="srcset")

# Keep at most the first 1000 URLs; the page may yield fewer than that.
selected_urls = image_urls[:1000]
save_urls_to_csv(selected_urls)

output_dir = Path("Category1")
# Make sure the target directory exists before the first image is saved.
output_dir.mkdir(parents=True, exist_ok=True)

# Report progress against the real number of URLs, not an assumed 1000.
total = len(selected_urls)
for i, image_url in enumerate(selected_urls):
    get_and_save_image_to_file(image_url, output_dir, i)
    print(f"Downloaded image {i + 1} / {total}")

print("All images downloaded successfully!")

Downloaded image 1 / 1000
Downloaded image 2 / 1000
Downloaded image 3 / 1000
Downloaded image 4 / 1000
Downloaded image 5 / 1000
Downloaded image 6 / 1000
Downloaded image 7 / 1000
Downloaded image 8 / 1000
Downloaded image 9 / 1000
Downloaded image 10 / 1000
Downloaded image 11 / 1000
Downloaded image 12 / 1000
Downloaded image 13 / 1000
Downloaded image 14 / 1000
Downloaded image 15 / 1000
Downloaded image 16 / 1000
Downloaded image 17 / 1000
Downloaded image 18 / 1000
Downloaded image 19 / 1000
Downloaded image 20 / 1000
Downloaded image 21 / 1000
Downloaded image 22 / 1000
Downloaded image 23 / 1000
Downloaded image 24 / 1000
Downloaded image 25 / 1000
Downloaded image 26 / 1000
Downloaded image 27 / 1000
Downloaded image 28 / 1000
Downloaded image 29 / 1000
Downloaded image 30 / 1000
Downloaded image 31 / 1000
Downloaded image 32 / 1000
Downloaded image 33 / 1000
Downloaded image 34 / 1000
Downloaded image 35 / 1000
Downloaded image 36 / 1000
Downloaded image 37 / 1000
Downloaded

## Category 2: impossibleimages.ai

In [9]:
def save_urls_to_csv_2(image_urls, csv_path="links_2.csv"):
    """Write the unique, non-missing URLs to a single-column CSV file.

    Args:
        image_urls: List of URL strings. Duplicate and missing entries are
            dropped before writing.
        csv_path: Destination file. Defaults to ``"links_2.csv"`` in the
            current working directory.
    """
    # Build a fresh frame with chained calls rather than mutating in place.
    links = (
        pd.DataFrame({"links": image_urls})
        .drop_duplicates()
        .dropna(how="all")
    )
    links.to_csv(csv_path, index=False, encoding="utf-8")

In [7]:
def get_and_save_image_to_file_2(image_url, output_dir, i, prefix="Category2",
                                 size=(300, 300), timeout=30):
    """Download one image, resize it, and save it as a JPEG file.

    The image is converted to RGB, resized to exactly ``size`` (the aspect
    ratio is not preserved) and written to ``output_dir`` as
    ``{prefix}_{i+1}.jpg``.

    Args:
        image_url: URL fetched with ``requests.get``.
        output_dir: Destination directory; created if it does not exist.
        i: Zero-based index of the image; the file name uses ``i + 1``.
        prefix: File-name prefix.
        size: ``(width, height)`` passed to ``Image.resize``.
        timeout: Timeout in seconds passed to ``requests.get``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the response body cannot be opened as an image or the
            file cannot be written.
    """
    # Without a timeout a stalled server would block the notebook forever.
    response = requests.get(image_url, timeout=timeout)
    # Fail on 4xx/5xx here rather than handing an error page to PIL.
    response.raise_for_status()

    # ``Image.save`` does not create missing directories.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    file_path = output_dir / f"{prefix}_{i+1}.jpg"

    # The context manager releases the decoded source image even if the
    # conversion, resize, or save step raises.
    with Image.open(io.BytesIO(response.content)) as source_image:
        resized_image = source_image.convert("RGB").resize(size)
        resized_image.save(file_path)

In [8]:
# Scrape image URLs from the impossibleimages.ai listing page.
url = "https://impossibleimages.ai/images/"
content = get_content_from_url(url)
image_urls = parse_image_urls(content=content,
                              classes="image-listing-area",
                              location="img",
                              source="src")

# Keep at most the first 1000 URLs; the page may yield fewer than that.
selected_urls = image_urls[:1000]
save_urls_to_csv_2(selected_urls)

output_dir = Path("Category2")
# Make sure the target directory exists before the first image is saved.
output_dir.mkdir(parents=True, exist_ok=True)

# Report progress against the real number of URLs, not an assumed 1000.
total = len(selected_urls)
for i, image_url in enumerate(selected_urls):
    get_and_save_image_to_file_2(image_url, output_dir, i)
    print(f"Downloaded image {i + 1} / {total}")

print("All images downloaded successfully!")

Downloaded image 1 / 1000
Downloaded image 2 / 1000
Downloaded image 3 / 1000
Downloaded image 4 / 1000
Downloaded image 5 / 1000
Downloaded image 6 / 1000
Downloaded image 7 / 1000
Downloaded image 8 / 1000
Downloaded image 9 / 1000
Downloaded image 10 / 1000
Downloaded image 11 / 1000
Downloaded image 12 / 1000
Downloaded image 13 / 1000
Downloaded image 14 / 1000
Downloaded image 15 / 1000
Downloaded image 16 / 1000
Downloaded image 17 / 1000
Downloaded image 18 / 1000
Downloaded image 19 / 1000
Downloaded image 20 / 1000
Downloaded image 21 / 1000
Downloaded image 22 / 1000
Downloaded image 23 / 1000
Downloaded image 24 / 1000
Downloaded image 25 / 1000
Downloaded image 26 / 1000
Downloaded image 27 / 1000
Downloaded image 28 / 1000
Downloaded image 29 / 1000
Downloaded image 30 / 1000
Downloaded image 31 / 1000
Downloaded image 32 / 1000
Downloaded image 33 / 1000
Downloaded image 34 / 1000
Downloaded image 35 / 1000
Downloaded image 36 / 1000
Downloaded image 37 / 1000
Downloaded