## Importing Libraries

In [1]:
from urllib.parse import urljoin #for resolving scraped hrefs against the site root

from bs4 import BeautifulSoup    #for web scraping
import pandas as pd              #for dataframe creation
import requests                  #for downloading the webpage or making connection
from tqdm import tqdm            #to check progress for loop

## Obtain product links from Boxspringbed Homepage
- Total Base Boxspringbeds on the webpage : 17

In [2]:
def product_base_links():
    """Collect the product page URLs linked from the boxspringbed listing page.

    Downloads the listing page, takes the ``href`` of every
    ``<a class="image-link">`` element and keeps those containing ``'bett/'``.

    Returns
    -------
    list of str
        Unique absolute product URLs, in the order they first appear on the page.

    Raises
    ------
    requests.HTTPError
        If the listing page answers with an error status.
    requests.RequestException
        On connection problems or when the request times out.
    """
    bed_base_link = 'https://www.moebelfreude.de/'
    url = "https://www.moebelfreude.de/boxspringbetten"

    # A timeout prevents a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of silently returning an empty list from an error page.
    response.raise_for_status()
    doc = BeautifulSoup(response.text, 'html.parser')

    #scrape the base boxspringbed URL
    product_tags = doc.find_all('a', class_="image-link")

    #substring 'bett/' is searched to keep only the boxspringbed product links
    substr = 'bett/'
    product_links = []
    seen = set()
    for tag in product_tags:
        product_suffix = tag.get('href')
        # Anchors without an href, or pointing elsewhere, are not product links.
        if not product_suffix or substr not in product_suffix:
            continue
        # urljoin copes with relative, root-relative ('/bett/...') and absolute hrefs.
        link = urljoin(bed_base_link, product_suffix)
        # De-duplicate while keeping page order so reruns are reproducible.
        if link not in seen:
            seen.add(link)
            product_links.append(link)

    return product_links

## Extract the available sizes, colours and hardness values from each product page obtained from the homepage links

In [3]:
def get_link_blocks(doc):
    """Extract the URL building blocks (colour/hardness pairs and sizes) of a product page.

    Reads the text of every ``<span class="label label-variation">`` in ``doc``.
    Labels containing ``'cm'`` are treated as sizes. Every other label has ``/``
    replaced by ``-``, is lower-cased and is split into colour and hardness.

    Parameters
    ----------
    doc : parsed page exposing ``find_all(name, class_=...)``
        (in this notebook a ``BeautifulSoup`` document).

    Returns
    -------
    new_ch : list of [colour, hardness]
        One entry per non-size label that carries a known hardness
        (``h2``, ``h3`` or ``h2-h3``). Labels without a hardness are skipped.
    new_size : list of str
        One hyphen-delimited fragment per size label, e.g. ``'-90-x-200 -cm-'``
        for the label ``'90x200 cm'``.
    """
    labels = [tag.text for tag in doc.find_all('span', class_="label label-variation")]

    size_list = [label for label in labels if 'cm' in label]
    ch_list = [label.replace("/", "-").lower() for label in labels if 'cm' not in label]

    #separate color and hardness -->  obtained using replace, split and strip methods
    # Longest marker first: ' h2' is a substring of ' h2-h3', so testing it first
    # (and not stopping) would emit the same colour/hardness pair twice.
    hardness_markers = (' h2-h3', ' h2', ' h3')
    new_ch = []
    for label in ch_list:
        for marker in hardness_markers:
            if marker in label:
                new_ch.append(label.replace(marker, '/' + marker.strip()).strip().split('/'))
                break

    #new size list with '-' and proper format is obtained
    new_size = [
        ('-' + size.replace('x', '-x-').replace('cm', '-cm-')).strip()
        for size in size_list
    ]

    return new_ch, new_size

## Preparing links by combinations of colours and sizes for each bed

In [4]:
def get_item_urls(base_url,new_ch,new_size):
    """Build one candidate URL for every colour/hardness and size combination.

    Parameters
    ----------
    base_url : str
        URL of the base product page.
    new_ch : list of [colour, hardness]
    new_size : list of str
        Size fragments such as ``'-90-x-200-cm-'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``LINK``, ``SIZE``, ``HARDNESS`` and ``COLOR``, one row per
        combination (colour/hardness varies slowest, size fastest).
    """
    # Same order as a nested loop: colour/hardness outer, size inner.
    combos = [(pair[0], pair[1], size) for pair in new_ch for size in new_size]

    # https://www.moebelfreude.de/bett/boxspringbett-bea+ - +anthrazit + -90-x-200-cm- +-h2
    links = [
        str(base_url + '-' + colour + size + hardness).replace(' ', '-').replace('--', '-')
        for colour, hardness, size in combos
    ]

    table = {
        'LINK': links,
        'SIZE': [size for _, _, size in combos],
        'HARDNESS': [hardness for _, hardness, _ in combos],
        'COLOR': [colour for colour, _, _ in combos],
    }
    return pd.DataFrame(table)

## Functions to obtain the Title name and Price

In [5]:
#get title from the webpage
def get_Title(doc):
    """Return the product title of a parsed product page.

    The title is the text of the first ``<div class="h1">`` in ``doc``, with
    surrounding whitespace removed so it lands cleanly in the CSV (the price
    helper strips its value the same way).

    Raises
    ------
    IndexError
        If the page contains no ``<div class="h1">``.
    """
    name_class = "h1"
    name_tags = doc.find_all('div', class_=name_class)
    title = name_tags[0].text

    return str(title).strip()

In [6]:
#get price from the respective url of each bed with its parameters
def get_Price(doc):
    """Return the price shown on a parsed variant page.

    The price is the whitespace-stripped text of the first
    ``<span class="price">`` in ``doc``. An ``IndexError`` is raised when the
    page has no such element.
    """
    first_price_tag = doc.find_all('span', class_='price')[0]
    return str(first_price_tag.text.strip())

## Main function to regulate the flow of the code

- Step 1: Collect the base product links from the boxspringbed home webpage
- Step 2: For each base link, gather the title and the available colours, sizes and hardness values
- Step 3: Format these parameters to build the link of each individual bed variant (colour, size and hardness)
- Step 4: Fetch the price from each variant link
- Step 5: Determine the availability from the HTTP status of each variant link and create the resulting DataFrame
- Step 6: Export the results to CSV

In [8]:
def main():
    """Scrape every boxspring bed variant and export the results to CSV.

    For each base product link, the product page is downloaded to read the
    title and the available colour/hardness and size options. One URL is then
    built per combination and requested. A 2xx answer marks the variant as
    available ('Y') and its price is read from the page. Any other status
    marks it as unavailable ('N') with price 'NA'.

    The collected rows are de-duplicated by ``LINK`` and written to
    ``Python_web_scraping_results.csv`` in the working directory.
    """
    # Seconds to wait on each request; without it a stalled connection hangs the run.
    request_timeout = 30

    product_links = product_base_links()
    result_dictionary={'LINK':[],'TITLE':[],'SIZE':[],'HARDNESS':[],'COLOR':[],'PRICE':[],'AVAILABILITY':[]}

    for base_url in tqdm(product_links):
        response = requests.get(base_url, timeout=request_timeout)
        doc = BeautifulSoup(response.text, 'html.parser')

        title = get_Title(doc)
        ch_list, size_list = get_link_blocks(doc)

        final_urls = get_item_urls(base_url, ch_list, size_list)

        for variant in final_urls.itertuples(index=False):
            response = requests.get(variant.LINK, timeout=request_timeout)

            # Only a 2xx answer counts as an existing (available) variant page.
            is_available = 200 <= response.status_code < 300
            if is_available:
                price = get_Price(BeautifulSoup(response.text, 'html.parser'))
            else:
                # Nothing is read from an error page, so it is not parsed.
                price = 'NA'

            result_dictionary['LINK'].append(variant.LINK)
            result_dictionary['TITLE'].append(title)
            result_dictionary['SIZE'].append(str(variant.SIZE).replace('-',''))
            result_dictionary['HARDNESS'].append(str(variant.HARDNESS).upper())
            result_dictionary['COLOR'].append(variant.COLOR)
            result_dictionary['PRICE'].append(price)
            result_dictionary['AVAILABILITY'].append('Y' if is_available else 'N')

    df = pd.DataFrame.from_dict(result_dictionary)
    df = df.drop_duplicates(subset=['LINK'])
    df.to_csv('Python_web_scraping_results.csv',index=False)

    print('CSV File Generated Successfully!')

In [9]:
# Run the full scrape and write the CSV; this issues one HTTP request per bed variant.
main()

100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [14:44<00:00, 52.01s/it]

CSV File Generated Successfully!



