In [1]:
import csv
import json
import html
import time

import requests
import pandas as pd
from bs4 import BeautifulSoup
from os import mkdir

## 4. IMDB Gender images:

Suppose we want to build a dataset for a computer vision task that involves gender-labelled images.
Your tasks are the following:
- Collect 10k male/female images from:
https://www.imdb.com
- Make sure to render the whole page using Selenium, and then use BeautifulSoup to scrape the images
- Create one folder for male images and one for female images
- Name each image after the person in the picture


In [26]:
def process_page_fullscale_imdb(gender, start=1):
    """Collect full-size image URLs for one IMDb name-search result page.

    The search page for ``gender`` starting at result ``start`` is fetched,
    the first link of every ``lister-item mode-detail`` entry is followed to
    the person's page, and from there the overlay link is followed to the
    image page, where the ``src`` of the image is read.

    Args:
        gender: Value placed in the ``gender=`` query parameter
            (the notebook uses ``'male'`` / ``'female'``).
        start: 1-based index of the first search result on the page.

    Returns:
        ``{'failed': str(start)}`` when the search page itself could not be
        fetched; otherwise ``{'actors': {name: image_url}, 'errors':
        {link: 'error'}}`` where ``name`` is the first content node of the
        image page's ``<title>`` and ``link`` is the person-page href that
        could not be processed.
    """
    url = 'https://www.imdb.com/search/name/?gender=' + gender + '&count=100&start=' + str(start)
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=5)
        # Treat HTTP error pages as a failed page instead of parsing them as results.
        response.raise_for_status()
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still works.
        return {'failed': str(start)}

    soup_main = BeautifulSoup(response.content, 'html.parser')

    list_of_links = []
    for item in soup_main.find_all('div', class_='lister-item mode-detail'):
        anchor = item.find('a')
        # Skip entries without a usable link rather than crashing the whole page.
        if anchor is not None and anchor.get('href'):
            list_of_links.append(anchor['href'])

    dict_actors = {}
    dict_errors = {}
    for link in list_of_links:
        # Reset per iteration: otherwise a failure would print the previous
        # person's initial, or raise NameError if the very first link fails.
        name = None
        try:
            response = requests.get('https://www.imdb.com' + link, headers=headers, timeout=5)
            response.raise_for_status()
            soup_actor = BeautifulSoup(response.content, 'html.parser')
            link_final = soup_actor.find('a', class_='ipc-lockup-overlay ipc-focusable')['href']

            response = requests.get('https://www.imdb.com' + link_final, headers=headers, timeout=5)
            response.raise_for_status()
            soup_image = BeautifulSoup(response.content, 'html.parser')
            # NOTE(review): this class string looks auto-generated and may change
            # between site builds -- confirm it still matches before a long run.
            link_image = soup_image.find('div', class_='sc-7c0a9e7c-2 kEDMKk').find('img')['src']
            name = soup_image.title.contents[0]
            dict_actors[name] = link_image
        except Exception:
            # Any missing element or request failure marks this person as an error.
            dict_errors[link] = 'error'
            name = None

        # Progress marker: first character of the name, or 'error'.
        print(name[0] if name else 'error', end=',')

    return {'actors': dict_actors, 'errors': dict_errors}


def process_page_thumbs_imdb(gender, start):
    """Collect thumbnail URLs from one IMDb name-search result page.

    Args:
        gender: Value placed in the ``gender=`` query parameter
            (the notebook uses ``'male'`` / ``'female'``).
        start: 1-based index of the first search result on the page.

    Returns:
        A ``(dict_actors, list_errors)`` tuple. ``dict_actors`` maps each
        thumbnail's ``alt`` text to its ``src`` URL. ``list_errors`` is
        ``[start]`` if fetching or parsing the page raised (entries parsed
        before the failure are still returned) and ``[]`` otherwise.
    """
    url = 'https://www.imdb.com/search/name/?gender=' + gender + '&count=100&start=' + str(start)
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

    dict_actors = {}
    list_errors = []
    try:
        response = requests.get(url, headers=headers, timeout=5)
        # Without this an HTTP error page parses to zero entries and the page
        # would be silently lost instead of landing in the retry list.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        for item in soup.find_all('div', class_='lister-item-image'):
            img = item.find('img')
            dict_actors[img['alt']] = img['src']
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still works.
        list_errors.append(start)

    return dict_actors, list_errors
    

In [30]:
# Scrape the first 10,000 search results (100 pages of 100) for each gender.
# dict_imdb_links[gender] collects the per-page dicts returned by
# process_page_thumbs_imdb; dict_errors[gender] collects the start offsets
# of pages that failed.
dict_imdb_links = {'male': {}, 'female': {}}
dict_errors = {'male': [], 'female': []}

# One loop over both genders instead of two copy-pasted loops
# (dict order is insertion order: male first, then female).
for gender in dict_imdb_links:
    for i in range(1, 10001, 100):
        actors, errors = process_page_thumbs_imdb(gender=gender, start=i)
        dict_imdb_links[gender].update(actors)
        dict_errors[gender].extend(errors)
        print('{}: from {} to {} has been processed'.format(gender, i, i + 99))


male: from 1 to 100 has been processed
male: from 101 to 200 has been processed
male: from 201 to 300 has been processed
male: from 301 to 400 has been processed
male: from 401 to 500 has been processed
male: from 501 to 600 has been processed
male: from 601 to 700 has been processed
male: from 701 to 800 has been processed
male: from 801 to 900 has been processed
male: from 901 to 1000 has been processed
male: from 1001 to 1100 has been processed
male: from 1101 to 1200 has been processed
male: from 1201 to 1300 has been processed
male: from 1301 to 1400 has been processed
male: from 1401 to 1500 has been processed
male: from 1501 to 1600 has been processed
male: from 1601 to 1700 has been processed
male: from 1701 to 1800 has been processed
male: from 1801 to 1900 has been processed
male: from 1901 to 2000 has been processed
male: from 2001 to 2100 has been processed
male: from 2101 to 2200 has been processed
male: from 2201 to 2300 has been processed
male: from 2301 to 2400 has been

female: from 8801 to 8900 has been processed
female: from 8901 to 9000 has been processed
female: from 9001 to 9100 has been processed
female: from 9101 to 9200 has been processed
female: from 9201 to 9300 has been processed
female: from 9301 to 9400 has been processed
female: from 9401 to 9500 has been processed
female: from 9501 to 9600 has been processed
female: from 9601 to 9700 has been processed
female: from 9701 to 9800 has been processed
female: from 9801 to 9900 has been processed
female: from 9901 to 10000 has been processed


In [34]:
# Number of distinct keys collected for 'male' (entries with the same key overwrite each other).
len(dict_imdb_links['male'])

9991

In [32]:
# Per-gender start offsets of the result pages that failed during scraping.
dict_errors

{'male': [4901, 5101], 'female': []}

In [36]:
# Retry the male result pages that failed during the first pass.
# Iterate over a snapshot and collect new failures separately: extending
# dict_errors['male'] while looping over it would never terminate if a page
# keeps failing, and pages that now succeed would stay listed as errors.
still_failing = []
for i in list(dict_errors['male']):
    actors, errors = process_page_thumbs_imdb(gender='male', start=i)
    dict_imdb_links['male'].update(actors)
    still_failing.extend(errors)
    print('male: from {} to {} has been processed'.format(i, i + 99))

# After the retry only the pages that failed again remain recorded.
dict_errors['male'] = still_failing

male: from 4901 to 5000 has been processed
male: from 5101 to 5200 has been processed


In [39]:
# Show how many entries were collected per gender, one count per line (male, then female).
print(*(len(dict_imdb_links[key]) for key in ('male', 'female')), sep='\n')


9977
9991


In [40]:
# Persist the collected {gender: {key: thumbnail URL}} mapping as a JSON file.

file_json = 'assignment4_IMDB_thumbs.json'
with open(file_json, mode="w") as outfile:
    # Serialize to a string first, then write it out in one call.
    outfile.write(json.dumps(dict_imdb_links))
    print(file_json, 'is written succesfully!')

assignment4_IMDB_thumbs.json is written succesfully!


In [44]:
# Create the output folders (parent first). Re-running the cell must not fail
# just because the folders already exist from an earlier run.
for folder in ('imdb', 'imdb/male', 'imdb/female'):
    try:
        mkdir(folder)
    except FileExistsError:
        pass  # already created earlier -- nothing to do

In [49]:
# Download every male thumbnail into imdb/male/<name>.jpg.
# imdb_download_report['success'] lists the names that were saved;
# imdb_download_report['error'] lists (name, reason) tuples, where reason is
# an HTTP status code, 'timeout', or the name of the exception that occurred.
imdb_download_report = {'success': [], 'error': []}

# Loop-invariant, so built once instead of on every iteration.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

for name, url in dict_imdb_links['male'].items():
    # Progress marker; slicing avoids an IndexError on an empty name.
    print(name[:1], end=',')

    # The name comes from scraped page content: replace path separators and
    # NUL so it cannot point outside imdb/male/ or make open() fail.
    safe_name = name.replace('/', '_').replace('\\', '_').replace('\0', '_') or 'unnamed'

    try:
        response = requests.get(url, headers=headers, timeout=6)
    except requests.exceptions.Timeout:
        imdb_download_report['error'].append((name, 'timeout'))
        continue
    except requests.exceptions.RequestException as exc:
        # Connection errors, invalid URLs, etc. must not abort the whole run.
        imdb_download_report['error'].append((name, type(exc).__name__))
        continue

    if response.status_code != 200:
        imdb_download_report['error'].append((name, response.status_code))
        continue

    try:
        # `with` guarantees the file is closed even if the write fails.
        with open('imdb/male/' + safe_name + '.jpg', 'wb') as picture:
            picture.write(response.content)
    except OSError as exc:
        imdb_download_report['error'].append((name, type(exc).__name__))
    else:
        imdb_download_report['success'].append(name)

print('success :', len(imdb_download_report['success']))
print('errors :', len(imdb_download_report['error']))

J,R,C,R,P,A,T,N,A,N,J,W,G,C,H,N,W,C,K,D,K,P,T,T,B,A,C,S,K,C,J,J,B,P,S,J,B,D,A,R,A,N,G,C,L,S,J,J,G,J,T,W,K,D,C,A,N,H,M,R,V,J,A,C,Y,J,C,J,B,T,D,B,J,N,B,M,B,C,M,C,Q,B,C,S,M,A,R,S,M,C,B,J,J,R,K,B,A,R,A,G,A,R,T,J,L,P,A,T,P,T,A,J,A,A,R,M,J,S,S,R,C,J,R,L,M,R,K,T,M,A,J,D,D,M,A,B,J,A,D,T,M,J,J,F,I,M,M,B,S,J,W,M,S,B,M,R,R,S,E,E,M,V,B,H,D,G,J,L,G,T,J,D,C,J,T,S,R,M,M,J,Z,N,B,M,E,K,J,T,J,J,D,R,O,R,E,J,D,J,M,M,H,T,M,R,R,J,W,S,B,J,R,E,J,J,O,K,L,C,M,S,J,R,C,P,W,I,S,V,S,G,C,J,S,T,D,C,J,J,D,P,E,J,R,I,R,S,D,G,V,J,A,J,J,L,L,A,H,H,H,D,K,N,S,S,J,J,J,B,M,J,W,M,J,D,F,T,A,S,L,D,C,A,B,M,M,N,C,J,B,N,S,J,J,J,K,B,J,Z,E,M,J,M,S,H,M,J,A,J,M,S,D,T,R,E,M,D,D,A,L,W,P,W,A,J,B,J,S,L,J,M,P,E,D,J,K,P,B,S,M,T,B,A,N,B,E,W,B,W,J,B,A,C,J,E,A,S,S,T,A,J,R,S,J,K,R,P,M,P,D,D,P,I,A,Z,S,J,M,J,J,R,N,V,D,J,J,K,J,E,M,D,P,L,N,T,R,R,M,G,L,M,J,W,M,M,J,S,P,I,K,G,O,B,D,W,S,G,F,A,P,J,W,P,T,O,A,B,A,T,H,H,E,F,J,G,N,M,F,G,S,B,J,R,P,J,D,R,C,D,J,J,H,J,K,A,G,C,P,H,C,T,J,M,S,T,J,T,T,J,P,G,N,D,R,J,D,A,H,G,B,T,C,B,A,J,C,J,P,G,A,C,T,B,T,A,C,L,B,J,H,S,

K,C,J,B,Z,I,J,M,P,',C,M,R,B,J,J,J,R,R,L,C,J,D,A,J,B,N,J,J,N,G,A,A,H,A,G,Z,S,C,A,C,D,R,P,T,W,C,S,R,K,C,G,T,F,P,B,L,T,Z,D,R,F,G,R,A,D,F,S,J,P,B,M,J,T,R,J,J,S,R,P,H,G,C,R,J,A,D,C,T,P,M,L,D,J,M,B,M,T,L,W,L,J,E,L,S,M,I,T,G,B,K,S,J,A,D,C,E,B,J,R,R,R,R,N,B,K,K,L,R,P,C,M,A,T,C,J,E,D,A,F,J,A,R,P,S,B,C,S,H,R,J,J,J,M,B,L,M,D,J,G,A,J,K,D,J,T,J,G,M,B,N,J,L,S,S,R,L,P,R,X,L,J,S,N,G,O,I,D,J,J,O,H,M,J,H,C,M,I,N,J,A,R,R,W,P,G,J,P,F,D,F,S,S,N,K,M,A,B,L,S,W,M,R,P,D,C,R,J,R,C,D,R,L,M,M,A,G,C,T,L,B,G,P,G,R,E,G,D,J,R,J,P,A,R,D,K,D,T,K,H,R,A,M,D,J,C,B,J,D,M,K,C,J,R,A,J,A,J,I,P,M,M,N,M,G,R,H,E,E,J,S,R,J,D,A,P,N,O,J,J,W,E,P,M,K,B,L,R,M,D,C,B,A,J,D,J,J,B,E,F,N,G,I,J,T,W,B,J,R,J,M,L,J,J,D,D,G,A,B,M,K,C,D,D,S,E,A,M,K,J,D,R,A,J,S,D,D,K,S,N,N,E,I,D,T,T,J,J,D,B,K,T,A,J,W,D,G,R,D,M,S,J,G,C,E,K,F,N,J,C,S,A,D,C,M,E,R,K,B,B,J,E,R,H,Y,L,D,M,J,C,B,T,M,N,N,C,T,H,A,W,I,D,T,B,P,D,D,K,R,D,M,U,K,N,B,R,M,B,A,B,B,F,C,R,D,C,S,J,J,P,T,R,O,B,I,D,T,R,K,M,C,A,P,J,C,M,T,G,P,T,G,B,H,A,E,S,E,D,B,K,N,J,C,C,G,N,D,T,R,J,K,D,L,D,H,P,M,B,S,P,

P,S,E,D,C,J,C,W,C,B,R,B,M,E,T,S,P,E,I,M,D,B,M,J,B,Z,K,B,A,J,K,J,G,M,J,S,J,F,D,C,P,R,J,L,G,W,H,R,T,S,A,G,C,Z,A,J,C,G,L,R,P,Y,J,M,N,M,W,D,R,D,D,J,R,L,H,B,A,D,G,J,G,J,N,D,J,B,G,D,A,E,S,M,P,P,L,D,R,A,F,A,J,M,A,E,B,N,J,L,A,I,B,T,W,M,I,G,N,S,D,S,S,J,M,C,H,G,I,J,P,F,B,M,S,V,M,L,L,Y,B,C,K,J,C,C,R,G,M,J,J,J,C,A,R,T,M,T,B,S,J,K,B,H,C,D,Z,P,J,A,J,M,C,M,M,R,D,H,B,D,A,A,M,P,N,M,L,K,J,J,R,J,K,S,M,M,T,T,P,B,D,G,A,G,A,S,A,W,J,S,J,M,E,J,J,K,M,N,J,S,A,F,K,G,S,K,T,J,R,J,A,R,T,M,G,E,S,M,M,J,W,S,V,D,K,F,T,L,N,Z,R,J,D,R,M,T,M,E,D,A,E,D,R,D,J,P,A,W,D,S,T,C,D,K,C,L,L,F,M,J,O,J,T,D,J,A,J,D,M,W,B,K,M,M,L,M,J,G,D,R,G,T,R,N,R,J,L,D,B,G,J,L,B,B,D,S,D,G,C,E,M,J,P,L,D,S,N,Y,A,J,T,M,R,W,A,L,M,M,D,P,C,M,A,J,J,B,J,A,J,D,T,M,M,M,T,N,H,A,J,D,J,T,M,R,M,P,D,J,A,D,C,R,P,J,P,V,S,J,V,R,J,A,P,J,S,G,C,R,D,R,T,S,J,B,M,R,A,M,N,M,M,W,G,M,B,E,F,K,A,K,J,C,S,M,M,T,F,G,B,R,M,A,M,P,A,A,G,J,L,M,J,P,N,M,M,T,J,M,A,D,J,T,M,J,I,N,D,D,C,J,D,G,T,G,A,C,R,S,A,B,B,C,M,T,A,W,V,J,T,T,P,B,M,B,R,T,E,P,K,S,G,P,K,B,A,L,J,A,M,J,S,A,D,H,C,G,S,P,M,S,B,N,

In [12]:
# Download every female thumbnail into imdb/female/<name>.jpg.
# imdb_download_report['success'] lists the names that were saved;
# imdb_download_report['error'] lists (name, reason) tuples, where reason is
# an HTTP status code, 'timeout', or the name of the exception that occurred.
# Note: this rebinds imdb_download_report, replacing any earlier report.
imdb_download_report = {'success': [], 'error': []}

# Loop-invariant, so built once instead of on every iteration.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

for name, url in dict_imdb_links['female'].items():
    # Progress marker; slicing avoids an IndexError on an empty name.
    print(name[:1], end=',')

    # The name comes from scraped page content: replace path separators and
    # NUL so it cannot point outside imdb/female/ or make open() fail.
    safe_name = name.replace('/', '_').replace('\\', '_').replace('\0', '_') or 'unnamed'

    try:
        response = requests.get(url, headers=headers, timeout=6)
    except requests.exceptions.Timeout:
        imdb_download_report['error'].append((name, 'timeout'))
        continue
    except requests.exceptions.RequestException as exc:
        # Connection errors, invalid URLs, etc. must not abort the whole run.
        imdb_download_report['error'].append((name, type(exc).__name__))
        continue

    if response.status_code != 200:
        imdb_download_report['error'].append((name, response.status_code))
        continue

    try:
        # `with` guarantees the file is closed even if the write fails.
        with open('imdb/female/' + safe_name + '.jpg', 'wb') as picture:
            picture.write(response.content)
    except OSError as exc:
        imdb_download_report['error'].append((name, type(exc).__name__))
    else:
        imdb_download_report['success'].append(name)

print('success :', len(imdb_download_report['success']))
print('errors :', len(imdb_download_report['error']))

A,K,P,P,R,E,S,E,R,K,F,K,D,J,H,L,B,J,L,T,A,A,A,S,H,S,A,C,B,E,L,R,A,M,M,A,I,G,A,E,D,J,S,K,C,M,B,D,Z,K,H,C,S,K,S,J,E,D,H,Z,A,Y,M,M,S,A,J,B,E,S,K,M,M,A,D,S,R,A,R,K,C,C,A,R,A,G,K,A,J,L,E,R,N,M,S,M,L,R,L,M,M,L,M,J,R,A,M,D,E,L,C,C,H,D,A,K,A,E,D,D,B,T,A,K,E,M,B,C,L,L,I,M,M,J,C,A,A,J,C,R,M,A,A,B,S,A,A,J,J,A,A,E,J,A,K,S,J,A,M,U,N,V,A,L,J,K,T,A,A,B,M,E,E,J,M,C,C,D,M,A,V,M,S,C,A,E,M,E,K,H,J,V,G,N,P,S,M,M,H,H,E,D,D,K,J,A,J,C,G,S,K,D,A,R,M,A,C,J,S,S,S,E,B,J,G,R,O,L,A,E,E,S,K,R,A,M,M,T,B,T,O,S,A,Z,J,K,Y,K,G,O,R,J,S,A,L,E,G,A,N,N,Z,E,K,E,A,O,D,E,M,C,A,K,M,S,J,A,A,A,A,M,K,E,E,O,A,W,M,J,B,C,N,K,H,S,R,M,M,C,C,G,R,M,J,S,M,D,T,N,D,S,C,J,J,P,J,I,J,N,K,N,B,H,Z,J,G,C,K,F,O,B,T,A,L,E,A,K,S,C,J,J,R,N,E,E,C,M,R,M,M,L,A,R,A,N,E,K,T,J,M,G,A,W,S,G,O,L,M,R,R,H,J,C,J,S,D,J,H,K,S,F,S,J,O,L,I,D,S,Z,M,A,S,S,M,L,S,A,A,P,B,J,A,J,J,K,J,J,R,M,M,A,E,E,P,F,S,G,E,S,P,K,F,H,H,W,M,P,C,R,E,M,H,J,A,D,I,G,M,A,K,H,M,D,J,A,T,E,L,D,A,J,P,T,A,E,V,G,J,J,C,O,Z,D,M,R,I,E,C,S,J,C,D,T,T,V,B,S,S,J,A,L,P,R,S,B,A,E,C,S,E,R,V,J,P,S,D,G,C,R,R,G,

M,L,O,L,P,P,S,K,L,M,S,C,S,P,K,L,D,L,A,K,S,B,H,W,P,R,S,D,S,L,S,J,K,N,K,K,J,V,P,H,L,Z,V,J,M,F,M,A,S,A,J,V,M,L,K,C,M,A,L,E,R,C,A,S,M,J,R,E,R,M,M,N,J,M,M,S,K,J,T,L,M,J,W,T,K,L,Q,S,M,J,E,J,S,A,A,K,A,A,C,A,C,V,W,M,L,S,A,V,S,D,L,V,J,A,A,O,A,N,P,L,C,K,J,M,H,K,J,H,S,M,B,M,A,K,K,D,K,S,C,M,N,D,K,P,A,S,V,E,A,L,N,M,V,L,C,I,S,C,L,C,M,C,P,Z,A,B,M,J,A,G,H,J,B,S,P,S,Y,L,R,E,A,V,S,K,D,J,K,F,K,D,F,M,S,A,M,A,J,O,K,C,A,T,D,A,F,R,B,S,S,T,N,S,J,D,J,A,D,B,S,J,G,J,F,K,T,M,J,D,M,H,K,R,S,J,L,D,K,E,I,K,S,J,B,Z,B,I,M,J,T,D,S,J,J,A,M,M,N,A,E,L,M,L,E,J,D,D,A,J,S,S,A,V,C,C,A,N,G,C,K,A,A,G,K,K,T,F,A,K,F,D,A,H,E,K,S,K,S,G,T,T,K,A,L,B,M,L,D,R,A,T,L,S,S,F,M,A,A,L,A,T,J,E,R,C,E,L,N,G,B,L,B,A,L,N,V,H,J,L,K,S,A,W,R,C,I,R,E,S,S,G,J,J,A,L,J,K,A,J,A,A,A,K,K,S,J,L,V,T,L,M,D,L,S,G,J,J,S,T,L,T,C,N,R,A,B,A,J,J,C,G,A,O,B,V,S,T,J,M,A,J,S,J,M,C,M,S,J,N,J,S,A,A,K,M,J,C,L,C,K,D,G,P,K,L,R,M,S,K,J,K,S,S,A,K,G,J,K,V,E,J,S,L,M,M,L,C,A,A,K,L,S,B,Z,M,E,N,E,K,L,R,J,S,T,K,S,K,B,R,C,A,C,H,J,E,R,P,G,K,T,A,N,C,R,P,E,M,H,M,J,F,L,J,E,R,D,H,B,M,M,A,

E,M,L,B,C,T,J,H,A,K,T,A,G,S,J,S,H,I,C,H,M,D,M,C,N,S,S,M,L,E,L,S,J,A,F,A,K,D,D,M,A,D,K,F,A,I,B,M,J,S,M,C,V,L,P,J,A,E,L,C,G,S,S,K,S,M,J,V,S,C,M,A,A,L,B,V,M,N,A,A,R,R,R,C,B,A,V,C,G,H,C,A,P,J,T,E,S,A,R,C,A,L,C,E,L,H,S,S,J,P,C,A,C,V,O,K,R,O,A,H,A,K,B,J,P,D,L,L,M,K,J,R,E,M,G,C,M,B,N,L,A,A,P,O,D,A,A,M,H,S,S,K,E,T,M,A,S,M,J,K,T,E,L,A,H,P,K,S,E,C,A,M,N,M,C,E,Y,K,I,L,P,K,Y,R,M,T,L,V,A,E,G,V,H,B,A,K,B,S,M,K,B,L,M,K,B,C,V,S,E,L,J,F,A,T,L,J,D,C,A,R,G,G,T,M,F,I,A,J,L,C,S,O,P,E,C,P,G,C,J,J,R,A,D,C,U,J,P,D,M,D,H,E,D,S,J,D,K,K,A,J,N,M,M,J,K,N,S,C,A,A,C,K,M,J,A,J,J,D,S,M,J,N,S,E,C,G,F,J,L,C,R,C,M,A,P,S,N,T,A,B,P,R,T,I,M,A,A,S,J,O,S,C,Y,T,T,S,C,J,A,A,D,H,L,R,C,D,S,B,M,L,J,A,L,T,J,A,E,J,G,L,K,V,S,L,M,S,C,J,A,J,L,M,T,B,Ö,A,L,K,J,M,T,L,M,T,M,C,K,M,W,M,P,K,J,F,K,B,K,C,K,S,M,R,M,R,D,J,H,T,S,M,M,S,L,F,A,L,M,J,M,J,J,H,A,R,A,A,J,M,A,K,A,D,J,B,R,N,R,M,Z,M,E,P,J,S,R,M,E,S,C,B,H,A,C,A,K,K,C,B,T,A,E,N,C,V,A,H,J,P,M,B,J,L,L,B,L,J,M,E,A,C,Z,M,B,C,M,M,P,R,A,I,E,J,K,A,C,A,A,J,V,M,L,L,H,V,A,R,L,K,Y,A,J,J,D,L,R,A,M,M,P,V,

In [18]:
# Summary of imdb_download_report. Each download cell rebinds that name,
# so these counts describe whichever download cell was run most recently.
print(len(imdb_download_report['success']))
print('errors :', len(imdb_download_report['error']))

9991
errors : 0
