# Get image URLs from Google Sheets


In [126]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials


def connection():
    """Open the first worksheet of the "Parser_ImageSize" spreadsheet.

    Builds service-account credentials from the local ``keys.json`` file,
    authorizes a gspread client with them, and returns ``sheet1`` of the
    workbook named "Parser_ImageSize".
    """
    credentials = ServiceAccountCredentials.from_json_keyfile_name("keys.json")
    client = gspread.authorize(credentials)
    return client.open("Parser_ImageSize").sheet1


# Worksheet handle; it is reused later in the notebook to write the sizes back.
sheet = connection()

# Read the first column of the worksheet as the list of image URLs.
# [1:] skips the first cell (presumably a header row; confirm against the sheet).
image_urls = sheet.col_values(1)[1:]

Get image sizes from the image URLs

In [134]:
import asyncio
import aiohttp
from io import BytesIO
from PIL import Image, UnidentifiedImageError

# Request headers passed to every session.get() call in fetch().
# Sends a browser-like User-Agent (presumably because some image hosts
# reject the default client User-Agent; confirm).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}


async def fetch(session, url):
    """Download ``url`` and return the raw response body.

    Args:
        session: aiohttp client session used to issue the GET request.
        url: Address of the image to download.

    Returns:
        The response body as bytes, or ``b""`` when the request fails
        (client/connection error, HTTP error status, or timeout).
    """
    try:
        async with session.get(url, headers=headers) as response:
            # Convert HTTP error statuses into an exception handled below.
            response.raise_for_status()
            return await response.read()
    except (aiohttp.ClientError, asyncio.TimeoutError):
        # A request timeout is raised as asyncio.TimeoutError, which is not an
        # aiohttp.ClientError. Left uncaught, one slow host would propagate
        # through asyncio.gather() and abort the whole run.
        return b""
    
    
async def process_batch(session, image_urls):
    """Download one batch of URLs concurrently.

    Schedules a ``fetch`` call for every entry of ``image_urls`` on the given
    session and returns the list of their results, in the same order as the
    input URLs.
    """
    downloads = (fetch(session, link) for link in image_urls)
    return await asyncio.gather(*downloads)

async def get_sizes(urls=None, batch_size=4000):
    """Return a "WIDTHxHEIGHT" label for every image URL.

    Args:
        urls: Sequence of image URLs to measure. When omitted, the
            module-level ``image_urls`` list is used, so ``get_sizes()``
            keeps working as before.
        batch_size: Number of URLs handed to ``process_batch`` at a time.

    Returns:
        A list of strings in input order: ``"<width>x<height>"`` for every
        payload Pillow can identify, ``"Incorrect url"`` otherwise (this
        includes the empty payload ``fetch`` returns for failed requests).
    """
    if urls is None:
        urls = image_urls

    sizes = []
    async with aiohttp.ClientSession() as session:
        for start in range(0, len(urls), batch_size):
            payloads = await process_batch(session, urls[start:start + batch_size])

            for payload in payloads:
                try:
                    # Context manager releases the decoder as soon as the
                    # dimensions have been read.
                    with Image.open(BytesIO(payload)) as img:
                        width, height = img.size
                except UnidentifiedImageError:
                    sizes.append("Incorrect url")
                else:
                    sizes.append(f"{width}x{height}")
    return sizes

image_sizes = asyncio.run(get_sizes())
%time asyncio.run(get_sizes())

CPU times: total: 15.9 s
Wall time: 4min 23s


['1080x1614',
 '1080x1080',
 '1080x1614',
 '1080x1080',
 '1080x1614',
 '1080x1614',
 '1080x1614',
 '1080x1614',
 '1080x1613',
 '1080x3012',
 '1080x1614',
 '1080x1080',
 '1080x1614',
 'Incorrect url',
 '1080x1607',
 '1080x1720',
 '1080x1765',
 '1080x1765',
 '1080x1080',
 '1080x1720',
 '1080x1607',
 '1080x1621',
 '1080x1526',
 '1080x1620',
 '1080x1667',
 '1080x1579',
 '1080x1764',
 '1080x1080',
 '1080x1786',
 '1080x1613',
 '1080x1613',
 '1080x1613',
 '1080x1613',
 '1080x1613',
 '1080x1613',
 '1080x1613',
 '1080x1613',
 '1080x1080',
 '1080x1080',
 '1080x1614',
 '1080x1080',
 '1080x1614',
 '1080x1613',
 '1080x1613',
 '1080x1613',
 '1080x1613',
 '1080x1080',
 '1080x1080',
 '1080x1080',
 '1080x1080',
 '1080x1100',
 '1093x1080',
 '1093x1080',
 '1093x1080',
 '1093x1080',
 '1093x1080',
 '1093x1080',
 '1080x1080',
 '1080x1080',
 '1080x1080',
 '1080x1080',
 '1080x1080',
 '1080x1080',
 '1715x1080',
 '1093x1080',
 '1093x1080',
 '1093x1080',
 '1093x1080',
 '1093x1080',
 '1093x1080',
 '1093x1080',
 '

Write the sizes to the Google Sheet

In [138]:
# sheet.update() takes a list of rows, so wrap every size in its own
# one-element row to fill a single column.
update_values_sizes = [[image_size] for image_size in image_sizes]

# Sizes are written starting at row 2, so the last row is count + 1.
# NOTE(review): the saved output of this cell reports range C2:C..., while the
# code targets column B; the output appears to come from another version.
sizes_count = len(image_sizes)
sheet.update(range_name=f"B2:B{sizes_count + 1}", values=update_values_sizes)

{'spreadsheetId': '1GK9xSBOP1iWq0D8CeXV8WT4zJ-IeVP0IRQokLi0xbTE',
 'updatedRange': 'feed!C2:C46889',
 'updatedRows': 46888,
 'updatedColumns': 1,
 'updatedCells': 46888}