# 4. Coding Task: Use Google's API to find pathway images

I divided this coding task into two parts:

1. Write a function get_images() that (i) takes in a user query, (ii) constructs a Google Custom Search (image search) URL for the query, (iii) sends a GET request to the URL, and (iv) returns the parsed JSON response as metadata for downstream processing. Additionally, this function attempts to (a) get the image URLs from the metadata, (b) send a GET request to each URL, and (c) download these images to a local directory.

2. Define a function score_image_quality() that (i) takes the metadata returned by the API for each image and (ii) returns a quality score based on certain criteria. In this case, I used the file size (less than 10MB) and image resolution (at least 1080p, i.e. 1920x1080) as preliminary criteria to rank images worth showing in the Target Detail page.

Finally, I save all the image URLs along with their corresponding quality scores in a dataframe and sort it based on the scores.

In [8]:
import os # To create directory containing downloaded images

# Web scraping tools
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote_plus

# Data handling libraries
import json
import pandas as pd

# Function to find and download pathway images for a query. Returns the parsed JSON response as metadata
def get_images(query,        # Search query
               num_images,   # Number of images to download
               cx,           # Custom Search Engine ID
               api_key,      # API key
              ):
    """Search for images matching *query* and download each hit locally.

    Sends one image-search request to the Google Custom Search JSON
    endpoint, then tries to download every returned image into the
    ``pathway_images`` directory as ``pathway<N>.png`` (N starts at 1 and
    follows the order of the search results).

    Args:
        query: Free-text search query (URL-encoded by ``requests``).
        num_images: Number of results to request from the API.
            NOTE(review): the Custom Search JSON API reportedly caps
            ``num`` at 10 per request -- confirm against the API reference.
        cx: Custom Search Engine ID.
        api_key: Google API key.

    Returns:
        The search response decoded from JSON (a dict). Image entries, if
        any, are under the ``'items'`` key.

    Raises:
        requests.HTTPError: If the search endpoint answers with an error
            status (e.g. invalid key or exhausted quota).
        requests.RequestException: If the search request itself fails
            (connection error, timeout, ...).

    Failures while downloading an individual image are reported on stdout
    and do not abort the remaining downloads.
    """
    # Create the directory for the downloaded images (no-op if it exists)
    os.makedirs('pathway_images', exist_ok=True)

    # Let requests build the query string so that every parameter -- not
    # only the query text -- is URL-encoded correctly.
    params = {
        'key': api_key,
        'cx': cx,
        'q': query,
        'searchType': 'image',
        'num': num_images,
    }

    # A timeout prevents the call from hanging forever on a stalled server;
    # raise_for_status() surfaces API errors instead of a later KeyError.
    response = requests.get('https://www.googleapis.com/customsearch/v1',
                            params=params, timeout=30)
    response.raise_for_status()
    metadata = response.json()

    # A search with no hits has no 'items' key; treat it as an empty list.
    for i, item in enumerate(metadata.get('items', [])):
        try:
            # Get the image URL
            img_url = item['link']

            # Download the image; reject error pages (404, 403, ...) so they
            # are not written to disk as if they were images.
            img_response = requests.get(img_url, timeout=30)
            img_response.raise_for_status()

            # Save the image to the pathway_images directory.
            # NOTE: the .png extension is used regardless of the real format.
            with open(f'pathway_images/pathway{i+1}.png', 'wb') as f:
                f.write(img_response.content)

        except (KeyError, OSError, requests.RequestException) as e:
            # Best effort: report the failure and continue with the next image
            print(f'Error downloading image: {str(e)}')
        else:
            print(f'Downloaded pathway{i+1}.png')

    return metadata

# Function to get a quality score for each image
def score_image_quality(metadata_item,                     # Metadata for each image entry
                        file_size_param=10 * 1024 * 1024,  # File size threshold (Default: 10MB)
                        resolution_param=(1920,1080),      # Resolution threshold (Default: 1080p)
                       ):
    """Return a 0-2 quality score for one image search result.

    One point is awarded when the image's ``byteSize`` is strictly below
    ``file_size_param``; another when both ``width`` and ``height`` are at
    least the corresponding entries of ``resolution_param`` (width, height).

    Args:
        metadata_item: Mapping with an ``'image'`` entry that provides
            ``'byteSize'``, ``'width'`` and ``'height'``.
        file_size_param: Exclusive upper bound on the file size, in bytes.
        resolution_param: Minimum ``(width, height)`` to count as high
            resolution.

    Returns:
        The number of criteria met: 0, 1 or 2.
    """
    # Pull the per-image details out of the search result entry
    details = metadata_item['image']
    n_bytes = details['byteSize']
    width, height = details['width'], details['height']

    # Criterion 1: the file is small enough
    size_point = 1 if n_bytes < file_size_param else 0

    # Criterion 2: both dimensions reach the resolution threshold
    resolution_point = 1 if (width >= resolution_param[0]
                             and height >= resolution_param[1]) else 0

    return size_point + resolution_point

In [5]:
import os # To read the API credentials from the environment

query = 'OPN mediated MMP9 activation' # My query to search for pathway images as shown in Fig 1
num_images = 10 # Number of images to download

# Credentials come from the environment so that the secret API key is never
# stored in the notebook. Any key that was previously committed here should
# be revoked and replaced.
cx = os.environ.get('GOOGLE_CSE_ID', '1096c60fac6a44671') # My Search Engine ID
api_key = os.environ.get('GOOGLE_API_KEY') # My API Key
if not api_key:
    raise RuntimeError('Set the GOOGLE_API_KEY environment variable before running this cell')

metadata = get_images(query, num_images, cx, api_key)

Downloaded pathway1.png
Downloaded pathway2.png
Downloaded pathway3.png
Downloaded pathway4.png
Downloaded pathway5.png
Downloaded pathway6.png
Downloaded pathway7.png
Downloaded pathway8.png
Downloaded pathway9.png
Downloaded pathway10.png


In [6]:
# Score every search hit and collect one {img_url, score} record per image.
# The records are gathered in a list and the dataframe is built once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call. A response without an 'items' key yields an empty dataframe.
records = [
    {'img_url': item['link'], 'score': score_image_quality(item)}
    for item in metadata.get('items', [])
]

# Dataframe of image URLs and their quality scores
image_scores = pd.DataFrame(records, columns=['img_url', 'score'])

# Current dataframe
image_scores

Unnamed: 0,img_url,score
0,https://www.researchgate.net/publication/34124...,1
1,https://www.mdpi.com/cells/cells-09-01151/arti...,2
2,https://www.researchgate.net/publication/36761...,1
3,https://d3i71xaburhd42.cloudfront.net/22557d38...,1
4,https://www.researchgate.net/publication/36761...,1
5,https://ars.els-cdn.com/content/image/1-s2.0-S...,1
6,https://www.researchgate.net/publication/84625...,1
7,https://pub.mdpi-res.com/ijms/ijms-25-01704/ar...,2
8,https://www.researchgate.net/publication/34124...,1
9,https://pub.mdpi-res.com/biomedicines/biomedic...,2


In [7]:
# Ranked by score (highest first). A stable sort keeps images with equal
# scores in their original search-result order instead of leaving the
# tie order up to the sorting algorithm.
image_scores.sort_values(by='score', ascending=False, kind='mergesort')

Unnamed: 0,img_url,score
1,https://www.mdpi.com/cells/cells-09-01151/arti...,2
7,https://pub.mdpi-res.com/ijms/ijms-25-01704/ar...,2
9,https://pub.mdpi-res.com/biomedicines/biomedic...,2
0,https://www.researchgate.net/publication/34124...,1
2,https://www.researchgate.net/publication/36761...,1
3,https://d3i71xaburhd42.cloudfront.net/22557d38...,1
4,https://www.researchgate.net/publication/36761...,1
5,https://ars.els-cdn.com/content/image/1-s2.0-S...,1
6,https://www.researchgate.net/publication/84625...,1
8,https://www.researchgate.net/publication/34124...,1
