# Data Scraping from Ikea Website

In [None]:
#import necessary libraries
from bs4 import BeautifulSoup
import requests
import os
import urllib.request

Provide website link and name of the class for which images needs to be downloaded

In [9]:
website = 'https://www.ikea.com/in/en/cat/beds-bm003/'
data_dir = 'beds'
page = requests.get(website)
soup = BeautifulSoup(page.content, 'html.parser')
soup =  soup.find('div', attrs={'class': 'catalog-product-list'})

In [99]:
# find all links of the subcategories
all_items_outer_div = soup.find('div', attrs={'class' : 'plp-revamp-product-list__product'} )
inner_items_data = all_items_outer_div.find_all('a')

In [100]:
# save product name and it's link in a list 
product_data = []
for item in inner_items_data:
    product_name = item.text.strip()
    product_link = item.attrs['href']
    print(product_name,product_link)
    product_data.append([product_name, product_link])

Double beds https://www.ikea.com/in/en/cat/double-beds-16284/
Guest beds & day beds https://www.ikea.com/in/en/cat/daybeds-19046/
Single beds https://www.ikea.com/in/en/cat/single-beds-16285/
Loft beds & bunk beds loft-beds-bunk-beds
Sofa beds & chair beds https://www.ikea.com/in/en/cat/sofa-beds-10663/
Children's beds https://www.ikea.com/in/en/cat/children-s-beds-18723/
Children's beds 8-12 https://www.ikea.com/in/en/cat/children-s-beds-8-12-24708/


In [101]:
''' 
Download images and save them into their respective directory
    Input: List containing names of subcategory and their page link
    Output: List containing names and link of subcategory which failed to download
'''
def download_images(product_data):
    skipped_data = []
    for product_name, product_link in product_data:
        product_dir = os.path.join(data_dir,product_name)
        if not os.path.exists(product_dir):
            os.makedirs(product_dir)
        next_page_available = True
        counter = 0
        pages_to_scrap = 3
        while pages_to_scrap:
            try:
                product_page = requests.get(product_link)
                soup = BeautifulSoup(product_page.text, 'html.parser')
                next_page_link = soup.find('a', {'class' : 'pagination__right button button--primary' })
                soup =  soup.find("div", {"class": "range-product-list__products"})
                images_data = soup.findAll('img')
                for image_data in images_data:
                    img_link = image_data.attrs['src'][:image_data.attrs['src'].rfind('?')] + '?f=xxxs'
                    urllib.request.urlretrieve(img_link, product_dir + '/' + str(counter) + '.jpg')
                    counter += 1
                if next_page_link is None:
                    break
                product_link = next_page_link.attrs['href']
                pages_to_scrap -= 1
            except Exception as e:
                print(e)
                skipped_data.append( [product_name,product_link] )
                break
        print('Total {} {} images downloaded'.format(counter,product_name))
    return skipped_data

In [102]:
skipped_data = download_images(product_data)

Total 24 Double beds images downloaded
Total 15 Guest beds & day beds images downloaded
Total 26 Single beds images downloaded
Invalid URL 'loft-beds-bunk-beds': No schema supplied. Perhaps you meant http://loft-beds-bunk-beds?
Total 0 Loft beds & bunk beds images downloaded
Total 64 Sofa beds & chair beds images downloaded
Total 40 Children's beds images downloaded
'NoneType' object has no attribute 'findAll'
Total 0 Children's beds 8-12 images downloaded


In [103]:
'''
check for skipped data. Sometimes our selected category contains a category which has sub categories 
inside it. i.e. it has separate page for it.
'''
skipped = None
for product_name, product_link in skipped_data:
    if 'www.ikea.com' not in product_link:
        product_page = requests.get('https://www.ikea.in/' + product_link)
        soup = BeautifulSoup(product_page.text, 'html.parser')
        soup =  soup.find("div", {"class": "product_cat_gallery"})
        all_items_outer_div = soup.find("div", {"class" : "row justify-content-center"} )
        inner_items_data = all_items_outer_div.find_all('a')
        product_data = []
        for item in inner_items_data:
            name = item.text.strip()
            link = item.attrs['href']
            print(name,link)
            product_data.append([name, link])
        skipped = download_images(product_data)
print(skipped)

Bunkbeds https://www.ikea.com/in/en/cat/bunkbeds-19048/
Loft beds https://www.ikea.com/in/en/cat/loft-beds-19049/
Total 3 Bunkbeds images downloaded
Total 8 Loft beds images downloaded
[]
