In [4]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# this notebook is a simple script to scrape images
# first we will get images from the website of US Army
# then we will get images from adobe
def get_soup(url, timeout=30):
    """
    this is a helper function that takes an url and returns a soup object
    :param url: the url of the page we want to scrape
    :param timeout: seconds to wait for the server before giving up (default 30),
        passed straight to requests.get
    :return: a soup object
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status
    :raises requests.Timeout: if the server does not answer within `timeout` seconds
    """
    # without a timeout requests waits forever on an unresponsive server
    res = requests.get(url, timeout=timeout) # get the html of the page
    res.raise_for_status() # stop on 4xx/5xx instead of parsing an error page
    return BeautifulSoup(res.text, "html.parser") # parse the html


In [48]:
# now we will scrape images from https://www.army.mil/yearinphotos/
# this is the official website of the US Army; it has a gallery of images for each year, and the images are in the public domain
# we will save these images in a folder called army_year_in_photos under images folder
url = "https://www.army.mil/yearinphotos/"
# we will get pictures from years 2016 to 2023
years = [str(year) for year in range(2016, 2024)]
import time
print("Scraping images from https://www.army.mil/yearinphotos/...")
os.makedirs("images/army_year_in_photos", exist_ok=True)
for year in years:
    soup = get_soup(url + year)
    images = soup.select("img[src]") # select all img tags that have src attribute
    for i, image in enumerate(images): # loop through all images and save them
        image_url = urljoin(url, image["src"]) # src may be relative, so resolve it against the gallery url
        # the timeout keeps one stalled download from hanging the whole cell
        image_res = requests.get(image_url, timeout=30)
        image_res.raise_for_status() # raise an exception if the image url is not valid
        # every download gets a .jpg name, whatever the real format of the file is;
        # the with-block closes the file even if a write fails half-way
        with open(f"images/army_year_in_photos/army_year_in_photos{year}{i}.jpg", "wb") as image_file:
            for chunk in image_res.iter_content(100000):
                image_file.write(chunk)
    print(f"Done scraping images from {year}!")

print("Done scraping images!")

Scraping images from https://www.army.mil/yearinphotos/...
Done scraping images from 2016!
Done scraping images from 2017!
Done scraping images from 2018!
Done scraping images from 2019!
Done scraping images from 2020!
Done scraping images from 2021!
Done scraping images from 2022!
Done scraping images from 2023!
Done scraping images!


In [1]:

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# we will scrape images about military, war, army, guns, tanks, aircraft, soldiers
# from the stock photo website stock.adobe.com;
# the cells below download up to 30 images per keyword and save them all in images/adobe_stock

def get_soup(url, timeout=30):
    """
    this is a helper function that takes an url and returns a soup object
    :param url: the url of the page we want to scrape
    :param timeout: seconds to wait for the server before giving up (default 30),
        passed straight to requests.get
    :return: a soup object
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status
    :raises requests.Timeout: if the server does not answer within `timeout` seconds
    """
    # without a timeout requests waits forever on an unresponsive server
    res = requests.get(url, timeout=timeout)
    res.raise_for_status() # stop on 4xx/5xx instead of parsing an error page
    return BeautifulSoup(res.text, "html.parser")

In [7]:
# let's scrape images from adobe stock images
# https://stock.adobe.com/search?k={keyword}
# we will save these images in a folder called adobe_stock under images folder
import os

# these keywords were derived with the help of WordsAPI
# they are hot (related) words for military and war
# "russian army" used to be listed twice, which downloaded it twice and
# overwrote the same files; it is kept once, at its first position
keywords = ["military", "war", "army", "gun", "tank", "aircraft", "soldier", "russian army",
             "us navy", "us army", "us air force", "us marines", "us coast guard", 
             "us special forces", "us navy seals", "chinese army", "chinese navy", "chinese air force",
             "chinese marines", "chinese special forces", "russian navy", 
             "russian air force", "russian marines", "russian special forces",
             "turkish army", "turkish navy", "turkish air force", "turkish marines", "turkish special forces"]
print("Scraping images from https://stock.adobe.com...")
os.makedirs("images/adobe_stock", exist_ok=True)
# for every keyword we will scrape at most 30 images
for keyword in keywords:
    print(f"Scraping images about {keyword}...")
    url = f"https://stock.adobe.com/search?k={keyword}"
    soup = get_soup(url)
    images = soup.select("img[src]") # select all img tags that have src attribute
    for i, image in enumerate(images[:30]): # only the first 30 images of the page are saved
        image_url = urljoin(url, image["src"]) # src may be relative, so resolve it against the search url
        # the timeout keeps one stalled download from hanging the whole cell
        image_res = requests.get(image_url, timeout=30)
        image_res.raise_for_status() # raise an exception if the image url is not valid
        # the with-block closes the file even if a write fails half-way
        with open(f"images/adobe_stock/{keyword}{i}.jpg", "wb") as image_file:
            for chunk in image_res.iter_content(100000):
                image_file.write(chunk)
    print(f"Done scraping images about {keyword}!")

Scraping images from https://stock.adobe.com...
Scraping images about military...
Done scraping images about military!
Scraping images about war...
Done scraping images about war!
Scraping images about army...
Done scraping images about army!
Scraping images about gun...
Done scraping images about gun!
Scraping images about tank...
Done scraping images about tank!
Scraping images about aircraft...
Done scraping images about aircraft!
Scraping images about soldier...
Done scraping images about soldier!
Scraping images about russian army...
Done scraping images about russian army!
Scraping images about us navy...
Done scraping images about us navy!
Scraping images about us army...
Done scraping images about us army!
Scraping images about us air force...
Done scraping images about us air force!
Scraping images about us marines...
Done scraping images about us marines!
Scraping images about us coast guard...
Done scraping images about us coast guard!
Scraping images about us special forces

In [8]:
# same approach with different keywords
# we added more keywords to get more images
# NOTE: "tank" is also in the first keyword list and both cells write
# images/adobe_stock/tank{i}.jpg, so this cell overwrites those files
keywords2 = ["rifle", "machine gun", "pistol", "submachine gun", "sniper rifle", "shotgun", "grenade", "rocket launcher",
            "missile", "tank", "fighter jet", "helicopter", "bomber", "aircraft carrier", "warship", "submarine",
            "general", "british army", "british navy", "british air force", "british marines", "british special forces",
            "german army", "german navy", "german air force", "german marines", "german special forces"]
# make sure the output folder exists even when this cell is run on its own
os.makedirs("images/adobe_stock", exist_ok=True)
for keyword in keywords2:
    print(f"Scraping images about {keyword}...")
    url = f"https://stock.adobe.com/search?k={keyword}"
    soup = get_soup(url)
    images = soup.select("img[src]") # select all img tags that have src attribute
    for i, image in enumerate(images[:30]): # only the first 30 images of the page are saved
        image_url = urljoin(url, image["src"]) # src may be relative, so resolve it against the search url
        # the timeout keeps one stalled download from hanging the whole cell
        image_res = requests.get(image_url, timeout=30)
        image_res.raise_for_status() # raise an exception if the image url is not valid
        # the with-block closes the file even if a write fails half-way
        with open(f"images/adobe_stock/{keyword}{i}.jpg", "wb") as image_file:
            for chunk in image_res.iter_content(100000):
                image_file.write(chunk)
    print(f"Done scraping images about {keyword}!")

print("Done scraping images!")

Scraping images about rifle...
Done scraping images about rifle!
Scraping images about machine gun...
Done scraping images about machine gun!
Scraping images about pistol...
Done scraping images about pistol!
Scraping images about submachine gun...
Done scraping images about submachine gun!
Scraping images about sniper rifle...
Done scraping images about sniper rifle!
Scraping images about shotgun...
Done scraping images about shotgun!
Scraping images about grenade...
Done scraping images about grenade!
Scraping images about rocket launcher...
Done scraping images about rocket launcher!
Scraping images about missile...
Done scraping images about missile!
Scraping images about tank...
Done scraping images about tank!
Scraping images about fighter jet...
Done scraping images about fighter jet!
Scraping images about helicopter...
Done scraping images about helicopter!
Scraping images about bomber...
Done scraping images about bomber!
Scraping images about aircraft carrier...
Done scraping

### After scraping these images
#### We need to prepare them for our model
#### To do this, we first resize them to 160x160 and rename them for convenience
#### After that, we create grayscale versions of these images and put them in another folder


In [None]:
import sys
import shutil
# these are helper functions to resize, rename and grayscale images to prepare them for the model
def resize_images(source_path, target_path):
    """
    this function resizes all images in source_path to 160x160 and saves them in target_path
    under the same file names; files that cannot be opened, resized or saved are
    reported and skipped
    args:
        source_path: the path of the folder that contains the images
            (with or without a trailing slash)
        target_path: the path of the folder that will contain the resized images;
            it is created, together with missing parent folders, if it doesn't exist
    """
    if not os.path.exists(source_path):
        print(f'No {source_path} folder found.')
        return
    if not os.path.exists(target_path): # create the target folder if it doesn't exist
        os.makedirs(target_path)
    for f in os.listdir(source_path):
        try:
            # os.path.join works whether or not the folder path ends with a slash;
            # the with-block closes the source file once the image has been saved
            with Image.open(os.path.join(source_path, f)) as img:
                resized = img.resize((160, 160), Image.LANCZOS)
                resized.save(os.path.join(target_path, f))
        except Exception as exc: # not a bare except, so Ctrl+C still stops the loop
            print('Error resizing image: ' + f + f' ({exc})')
    print('Done.')

def rename_images(path):
    """ 
    this function renames all images in path to 00001.jpg, 00002.jpg, etc.
    every entry of the folder is renamed, numbered in sorted order of the current names;
    the renaming is done in two passes so a file that already has a numbered name
    (for example after an earlier run) is never overwritten
    args:
        path: the path of the folder that contains the images
            (with or without a trailing slash)
    """
    if not os.path.exists(path):
        print('No folder found.')
        return
    import uuid # local import: only needed for the temporary names below

    # os.listdir returns entries in arbitrary order; sorting makes the numbering reproducible
    names = sorted(os.listdir(path))
    token = uuid.uuid4().hex # random marker so temporary names cannot clash with real files
    staged = []
    # pass 1: park every entry under a unique temporary name
    for i, f in enumerate(names):
        tmp = os.path.join(path, f'.{token}_{i}.tmp')
        shutil.move(os.path.join(path, f), tmp)
        staged.append(tmp)
    # pass 2: no final name is in use any more, so nothing can be overwritten
    for i, tmp in enumerate(staged):
        shutil.move(tmp, os.path.join(path, str(i + 1).zfill(5) + '.jpg')) # rename the image
    print('Done.')

from PIL import Image
def grayscale_images(source_path, target_path):
    """ 
    this function converts all images in source_path to grayscale and saves them in target_path
    under the same file names; files that cannot be opened, converted or saved are
    reported and skipped
    args:
        source_path: the path of the folder that contains the images
            (with or without a trailing slash)
        target_path: the path of the folder that will contain the grayscale images;
            it is created, together with missing parent folders, if it doesn't exist
    """
    if not os.path.exists(source_path):
        print(f'No {source_path} folder found.')
        return
    if not os.path.exists(target_path): # create the target folder if it doesn't exist
        os.makedirs(target_path)
    for f in os.listdir(source_path):
        try:
            # os.path.join works whether or not the folder path ends with a slash;
            # the with-block closes the source file once the image has been saved
            with Image.open(os.path.join(source_path, f)) as img:
                gray = img.convert('L') # 'L' is the 8-bit grayscale mode
                gray.save(os.path.join(target_path, f))
        except Exception as exc: # not a bare except, so Ctrl+C still stops the loop
            print('Error converting image: ' + f + f' ({exc})')
    print('Done.')

##### resize_images() -> rename_images() -> grayscale_images
##### That was our process