In [25]:
from urllib.request import Request, urlopen, urlretrieve
from bs4 import BeautifulSoup
import time

# Scraping Website for Photos

In [91]:
"""Returns a list of image URLs from a given shot type url

Args:
    baseurl (str): The URL of the shot type 
    numPages (int): Number of listing pages to scrape, starting at page 1

Returns:
    url_links: a list of URLs to all images of the desired shot type
"""
def scrapeFSR(baseurl, numPages):
    """Collect image URLs from the paginated listing of one shot type.

    Pages ``baseurl + "1"`` through ``baseurl + str(numPages)`` are fetched in
    order, pausing one second after each page.

    Args:
        baseurl (str): Listing URL prefix; the page number is appended to it.
        numPages (int): Number of pages to scrape, starting at page 1.

    Returns:
        list of str: The ``src`` of the first ``<img>`` inside every
        ``<article>``, skipping any ``src`` that contains ``.gif``, with the
        ``-260x170`` substring stripped (apparently the listing's
        thumbnail-size marker). Articles without an image, or whose image has
        no ``src``, are skipped.

    Raises:
        urllib.error.URLError: If a page cannot be fetched.
    """
    url_links = []
    for page_number in range(1, numPages + 1):
        # The site is requested with a browser-like User-Agent header.
        req = Request(baseurl + str(page_number), headers={'User-Agent': 'Mozilla/5.0'})
        # Close the HTTP response as soon as the body has been read.
        with urlopen(req) as response:
            page = response.read()
        page_soup = BeautifulSoup(page, "html.parser")
        for article in page_soup.find_all('article'):
            # find() returns None when the article has no <img>, and get()
            # returns None when the tag has no src; skip both instead of crashing.
            img = article.find('img')
            imglink = img.get('src') if img is not None else None
            if not imglink or ".gif" in imglink:
                continue
            url_links.append(imglink.replace('-260x170', ''))
        # Pause between page requests.
        time.sleep(1)
    return url_links

# Saving Images to File System

In [92]:
import requests # to get image from the web
import shutil # to save it locally

# Download every image in a URL list and save it under its original file name
def saveImages(urlArray, filepath):
    """Download each image URL and write it to disk under its own file name.

    The file name is the text after the last ``/`` of the URL, and the target
    path is ``filepath + filename``, so ``filepath`` must already end with a
    path separator (or be a deliberate file-name prefix). URLs that do not
    answer with HTTP 200 are printed and skipped. The function pauses half a
    second after every URL.

    Args:
        urlArray (iterable of str): Image URLs to download.
        filepath (str): Prefix prepended to every file name.

    Returns:
        None
    """
    for image_url in urlArray:
        filename = image_url.split("/")[-1]

        # stream=True defers the body download. The with-block guarantees the
        # connection is released even when the status is not 200 (body never
        # read) or writing the file fails.
        with requests.get(image_url, headers={'User-Agent': 'Mozilla/5.0'}, stream=True) as r:
            if r.status_code == 200:
                # Have the raw stream undo any Content-Encoding so the bytes
                # written to disk are the image itself.
                r.raw.decode_content = True

                with open(filepath + filename, 'wb') as f:
                    shutil.copyfileobj(r.raw, f)
            else:
                # Report the URL that could not be retrieved.
                print(image_url)
        time.sleep(0.5)

# Close Up Shot Scraping

In [93]:
# scrapeFSR appends the page number to this prefix to build each listing URL.
# baseurl and numPages are reassigned by the later long-shot and medium-shot cells.
baseurl = "https://shots.filmschoolrejects.com/shot-types/close-up/page/"
# Listing pages 1..numPages are scraped.
numPages = 28
closeup_url_links = scrapeFSR(baseurl, numPages)

In [94]:
# Download every close-up image. The folder string ends with a separator
# because saveImages builds each target path as filepath + filename.
saveImages(closeup_url_links, "E:\\Training Data\\Closeups\\")

In [95]:
# Number of close-up image URLs currently in the list
len(closeup_url_links)

257

# Long Shot Scraping

In [96]:
# Rebinds the shared baseurl/numPages globals for the long-shot listing;
# scrapeFSR appends the page number to the prefix.
baseurl = "https://shots.filmschoolrejects.com/shot-types/long-shot/page/"
# Listing pages 1..numPages are scraped.
numPages = 65
long_shot_url_links = scrapeFSR(baseurl, numPages)

In [97]:
# Download every long-shot image. The folder string ends with a separator
# because saveImages builds each target path as filepath + filename.
saveImages(long_shot_url_links, "E:\\Training Data\\Long Shots\\")

In [98]:
# Number of long-shot image URLs currently in the list
len(long_shot_url_links)

629

In [90]:
# Display the long-shot URLs with any '-260x170' substring stripped.
# The source list itself is left unmodified.
[link.replace('-260x170', '') for link in long_shot_url_links]

['https://shots.filmschoolrejects.com/wp-content/uploads/2015/05/branded-to-kill.jpg',
 'https://shots.filmschoolrejects.com/wp-content/uploads/2015/06/manhattan1.jpg',
 'https://shots.filmschoolrejects.com/wp-content/uploads/2015/06/an-autumn-afternoon.jpg',
 'https://shots.filmschoolrejects.com/wp-content/uploads/2015/06/halloween2.jpg',
 'https://shots.filmschoolrejects.com/wp-content/uploads/2015/06/twinpeaks.png',
 'https://shots.filmschoolrejects.com/wp-content/uploads/2015/06/once-upon-a-time-in-china.jpg',
 'https://shots.filmschoolrejects.com/wp-content/uploads/2015/05/repulsion.png',
 'https://shots.filmschoolrejects.com/wp-content/uploads/2019/04/black-panther.png',
 'https://shots.filmschoolrejects.com/wp-content/uploads/2015/05/metropolis1.jpg',
 'https://shots.filmschoolrejects.com/wp-content/uploads/2019/04/hunt-for-the-wilderpeople.png',
 'https://shots.filmschoolrejects.com/wp-content/uploads/2019/04/swamp-thing-movie.jpg',
 'https://shots.filmschoolrejects.com/wp-cont

# Medium Shot Scraping

In [99]:
# Rebinds the shared baseurl/numPages globals for the medium-shot listing;
# scrapeFSR appends the page number to the prefix.
baseurl = "https://shots.filmschoolrejects.com/shot-types/medium-shot/page/"
# Listing pages 1..numPages are scraped.
numPages = 28
medium_shot_url_links = scrapeFSR(baseurl, numPages)

In [100]:
# Download every medium-shot image. The folder string ends with a separator
# because saveImages builds each target path as filepath + filename.
saveImages(medium_shot_url_links, "E:\\Training Data\\Medium Shots\\")

In [101]:
# Number of medium-shot image URLs currently in the list
len(medium_shot_url_links)

256

In [62]:
def common_member(a, b):
    """Return the set of elements that occur in both iterables.

    Args:
        a (iterable): First collection of hashable items.
        b (iterable): Second collection of hashable items.

    Returns:
        set: Items present in both ``a`` and ``b``; empty when they share none.
    """
    return set(a) & set(b)

In [59]:
# Look for image URLs that appear in both the close-up and medium-shot lists
common_member(closeup_url_links, medium_shot_url_links)

{'https://shots.filmschoolrejects.com/wp-content/uploads/2016/01/maniac-260x170.jpg'}

In [60]:
# Drop from the close-up list every URL that also appears in the medium-shot
# list. The overlap is computed rather than hard-coded: a literal URL carrying
# the '-260x170' substring can never match what scrapeFSR returns (it strips
# that substring), so list.remove() would raise ValueError on a fresh run.
# Slice assignment keeps the update in place, and the cell is safe to re-run.
shared_urls = common_member(closeup_url_links, medium_shot_url_links)
closeup_url_links[:] = [url for url in closeup_url_links if url not in shared_urls]

In [61]:
# Re-check the overlap after the removal; an empty set means the two lists
# no longer share any URL.
common_member(medium_shot_url_links, closeup_url_links)

set()