In [1]:
import csv
import json
import html
import time

import requests
import pandas as pd
from bs4 import BeautifulSoup
from os import mkdir

## 4. IMDB Gender images:

Suppose we want to build a dataset for a computer-vision task that involves images labelled by gender.
Your tasks are the following:
- Collect 10k male/female images from:
https://www.imdb.com
- Make sure to render the whole page using Selenium and then use BeautifulSoup to scrape the images
- Create a folder for male/female
- Each image will be named after the person in the picture


In [26]:
def process_page_fullscale_imdb(gender, start=1):
    '''Collect full-scale image links for one 100-item IMDb name-search page.

    For every profile link found on the search page the function follows two
    more requests: the profile page, then the page referenced by the
    profile's overlay anchor, from which the ``<img>`` source and the page
    title are read.

    Parameters
    ----------
    gender : str
        Value placed in the ``gender=`` query parameter of the search URL.
    start : int, optional
        Value placed in the ``start=`` query parameter (default 1).

    Returns
    -------
    dict
        ``{'failed': str(start)}`` if the search page itself could not be
        fetched; otherwise ``{'actors': {title: image_url},
        'errors': {profile_link: 'error'}}``.
    '''

    url = 'https://www.imdb.com/search/name/?gender=' + gender + '&count=100&start=' + str(start)
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=5)
        # Treat HTTP error statuses as a failed page instead of parsing an error document.
        response.raise_for_status()
    except requests.RequestException:
        return {'failed': str(start)}

    soup_main = BeautifulSoup(response.content, 'html.parser')

    # First anchor of every list entry is taken as the profile link.
    list_of_links = [
        item.find('a')['href']
        for item in soup_main.find_all('div', class_='lister-item mode-detail')
    ]

    dict_actors = {}
    dict_errors = {}
    for link in list_of_links:
        # Reset per iteration so the progress print below never sees an
        # unbound name (first link failing) or a stale one (later failures).
        name = None
        try:
            response = requests.get('https://www.imdb.com' + link, headers=headers, timeout=5)
            response.raise_for_status()
            soup_actor = BeautifulSoup(response.content, 'html.parser')
            link_final = soup_actor.find('a', class_='ipc-lockup-overlay ipc-focusable')['href']

            response = requests.get('https://www.imdb.com' + link_final, headers=headers, timeout=5)
            response.raise_for_status()
            soup_image = BeautifulSoup(response.content, 'html.parser')
            # NOTE(review): this class string looks auto-generated and may
            # change between site builds -- confirm it still matches.
            link_image = soup_image.find('div', class_='sc-7c0a9e7c-2 kEDMKk').find('img')['src']
            # Convert to a plain str so the dict key does not keep the whole
            # parse tree alive.
            name = str(soup_image.title.contents[0])
            dict_actors[name] = link_image
        except Exception:
            # Best effort: any network or parsing problem marks this profile
            # as failed. ``Exception`` (not a bare except) keeps
            # KeyboardInterrupt usable for stopping the cell.
            dict_errors[link] = 'error'

        # Progress indicator: first character of the title, or 'error'.
        print(name[0] if name else 'error', end=',')

    return {'actors': dict_actors, 'errors': dict_errors}


def process_page_thumbs_imdb(gender, start=1):
    '''Collect thumbnail links from one 100-item IMDb name-search page.

    Parameters
    ----------
    gender : str
        Value placed in the ``gender=`` query parameter of the search URL.
    start : int, optional
        Value placed in the ``start=`` query parameter (default 1).

    Returns
    -------
    tuple (dict, list)
        ``dict_actors`` maps each image's ``alt`` text to its ``src`` URL;
        entries with the same ``alt`` text overwrite one another.
        ``list_errors`` is ``[start]`` if the request or the parsing failed
        and ``[]`` otherwise. Entries parsed before a failure are kept.
    '''

    url = 'https://www.imdb.com/search/name/?gender=' + gender + '&count=100&start=' + str(start)
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

    dict_actors = {}
    list_errors = []
    try:
        response = requests.get(url, headers=headers, timeout=5)
        # Without this an HTTP error page would parse to zero actors and the
        # page would never be recorded as failed.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        for item in soup.find_all('div', class_='lister-item-image'):
            img = item.find('img')
            dict_actors[img['alt']] = img['src']
    except Exception:
        # Best effort: record the page offset so the caller can retry it.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt through.
        list_errors.append(start)

    return dict_actors, list_errors
    

In [30]:
# Mining of the thumbnail lists for male and female (10,000 each):
# 100 search pages of 100 entries per gender, all male pages first.

dict_imdb_links = {'male': {}, 'female': {}}
dict_errors = {'male': [], 'female': []}

for gender in ('male', 'female'):
    for page_start in range(1, 10001, 100):
        actors, errors = process_page_thumbs_imdb(gender=gender, start=page_start)
        dict_imdb_links[gender].update(actors)
        # Failed page offsets are kept so they can be retried later.
        dict_errors[gender].extend(errors)
        print('{}: from {} to {} has been processed'.format(gender, page_start, page_start + 99))


male: from 1 to 100 has been processed
male: from 101 to 200 has been processed
male: from 201 to 300 has been processed
male: from 301 to 400 has been processed
male: from 401 to 500 has been processed
male: from 501 to 600 has been processed
male: from 601 to 700 has been processed
male: from 701 to 800 has been processed
male: from 801 to 900 has been processed
male: from 901 to 1000 has been processed
male: from 1001 to 1100 has been processed
male: from 1101 to 1200 has been processed
male: from 1201 to 1300 has been processed
male: from 1301 to 1400 has been processed
male: from 1401 to 1500 has been processed
male: from 1501 to 1600 has been processed
male: from 1601 to 1700 has been processed
male: from 1701 to 1800 has been processed
male: from 1801 to 1900 has been processed
male: from 1901 to 2000 has been processed
male: from 2001 to 2100 has been processed
male: from 2101 to 2200 has been processed
male: from 2201 to 2300 has been processed
male: from 2301 to 2400 has been

female: from 8801 to 8900 has been processed
female: from 8901 to 9000 has been processed
female: from 9001 to 9100 has been processed
female: from 9101 to 9200 has been processed
female: from 9201 to 9300 has been processed
female: from 9301 to 9400 has been processed
female: from 9401 to 9500 has been processed
female: from 9501 to 9600 has been processed
female: from 9601 to 9700 has been processed
female: from 9701 to 9800 has been processed
female: from 9801 to 9900 has been processed
female: from 9901 to 10000 has been processed


In [34]:
# Number of distinct names collected for 'male' (dict keys, so repeated names count once).
len(dict_imdb_links['male'])

9991

In [32]:
# Per gender: the `start` offsets of the search pages that were recorded as failed.
dict_errors

{'male': [4901, 5101], 'female': []}

In [36]:
# Retry the search pages that failed during the first pass.
# The error list is rebuilt from the retry results rather than extended
# while it is being iterated, so a page that keeps failing is retried only
# once per run of this cell and recovered pages drop out of the list.

for gender in dict_errors:
    still_failing = []
    for page_start in list(dict_errors[gender]):
        actors, errors = process_page_thumbs_imdb(gender=gender, start=page_start)
        dict_imdb_links[gender].update(actors)
        still_failing.extend(errors)
        print('{}: from {} to {} has been processed'.format(gender, page_start, page_start + 99))
    dict_errors[gender] = still_failing

male: from 4901 to 5000 has been processed
male: from 5101 to 5200 has been processed


In [39]:
# Number of collected names per gender, male first.
for gender in ('male', 'female'):
    print(len(dict_imdb_links[gender]))


9977
9991


In [40]:
# Writing the json file:

file_json = 'assignment4_IMDB_thumbs.json'
with open(file_json, "w", encoding="utf-8") as outfile:
    json.dump(dict_imdb_links, outfile)
# Report success only after the `with` block has flushed and closed the file.
print(file_json, 'is written succesfully!')

assignment4_IMDB_thumbs.json is written succesfully!
