# Scraping project 
## Maryam Rahbaralam
In this project we want to:
1) Scrape and crawl an eyeglasses website, parsing the HTML with Beautiful Soup
2) Use multithreading to run the web scraper in parallel
This website has different eyeglasses with different specifications such as: Brand, Price, Color, Weight, Material.

In [None]:
#First of all we are going to import libraries.
import csv
import re
import time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
from multiprocessing import Pool
from queue import Queue
from threading import Thread
from time import perf_counter, sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [261]:
def get_soup(url, timeout=30):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        ``lxml`` parser.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx responses instead of parsing an error page and
    # failing later with a confusing "element not found" error.
    page.raise_for_status()
    return BeautifulSoup(page.content, "lxml")

In [262]:
def parse_eyeglasses(url):
    """Scrape the listing page at ``url`` and every product page it links to.

    The product pages are fetched one after another (sequentially) through
    ``get_info``.

    Args:
        url: Address of the listing page. Each product link's ``href`` is
            appended to this string to form the product-page address.

    Returns:
        A list with one ``get_info`` result per product link found, in the
        order the products appear on the listing page.
    """
    soup = get_soup(url)
    eyeglasses_list = []
    for item in soup.find_all("div", {'class': 'item-image'}):
        link = item.find('a', {'class': 'event-list-link'})
        href = link.get("href") if link is not None else None
        if not href:
            # A tile without a product link cannot be followed; skip it
            # rather than aborting the whole scrape with an AttributeError.
            continue
        eyeglasses_list.append(get_info(url + href))

    return eyeglasses_list

In [263]:
def get_info(url):
    """Scrape one product page and return the attributes of the glasses.

    Args:
        url: Address of the product page.

    Returns:
        A tuple ``(Brand, Price, Color, Weight, Material)`` of strings.

    Raises:
        AttributeError: If one of the expected elements is missing from the
            page (``soup.find`` returns ``None`` in that case).
        IndexError: If the weight text contains no digits.
    """
    soup = get_soup(url)

    # Brand: the node right after <h1 itemprop="name">, all whitespace removed.
    name_node = soup.find("h1", {"itemprop": "name"}).next
    brand = re.sub(r'\s+', '', name_node)

    # Price: the "content" attribute of <meta itemprop="price">.
    price = soup.find("meta", {"itemprop": "price"}).get("content")

    # Color: the node after the color icon, without its "Color: " label.
    color_node = soup.find("i", {"class": "im im-details im-color"}).next
    color = re.sub(r'Color: ', '', color_node)

    # Weight: the first run of digits in the node after the weight icon.
    weight_node = soup.find("i", {"class": "im im-details im-weight"}).next
    weight = re.findall(r'\d+', weight_node)[0]

    # Material: taken from the "Materials" link target, with the
    # "/eyeglasses/" part of the href removed.
    material_href = soup.find("a", {"data-event-label": "Materials"}).get("href")
    material = re.sub(r'/eyeglasses/', '', material_href)

    return brand, price, color, weight, material
   

In [264]:
# Scrape every pair of glasses sequentially and measure how long it takes.
url = 'https://www.eyebuydirect.com/eyeglasses'
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run.
toc = perf_counter()  # start of the measurement
glasses_info = parse_eyeglasses(url)
tic = perf_counter()  # end of the measurement
print(f"Scraping time: {round(tic - toc, 1)}s")

Scraping time: 46.6s


In [268]:
# Turn the list of scraped tuples into a DataFrame (df).
# The column names are passed to the constructor instead of being assigned to
# df.columns afterwards, so an empty scrape yields an empty table with the
# right columns rather than a length-mismatch error.
df = pd.DataFrame(
    glasses_info,
    columns=['Brand', 'Price', 'Color', 'Weight', 'Material'],
)
# Write the CSV file (the row index is written as the first, unnamed column).
df.to_csv('glasses_info.csv')
# Display every scraped row.
df.head(len(glasses_info))


Unnamed: 0,Brand,Price,Color,Weight,Material
0,StMichel,32,Golden,15,metal
1,Chilling,16,Black,11,plastic
2,Bristol,20,Matte Black,21,acetate
3,Dialogue,15,Aubergine,8,plastic
4,NottingHill,30,Blue Floral,18,acetate
5,Chillax,16,Tortoise,12,metal
6,Hepburn,40,Clear/White,18,acetate
7,Botanist,30,Gray Brown,15,acetate
8,Muse,16,Blue Floral,13,plastic
9,Mandi,33,Teal,19,acetate


# Parallelism using Python's threading module

The sequential scrape above took 46.6s. We can reduce this time substantially by fetching the product pages in parallel with Python's `threading` module.
To start, we need to collect the links to all the product pages listed on the main page.

In [267]:
def parse_eyeglasses_url_list(url):
    """Collect the product-page URLs linked from the listing page at ``url``.

    Only the listing page is downloaded; the product pages themselves are
    not fetched here.

    Args:
        url: Address of the listing page. Each product link's ``href`` is
            appended to this string to form the product-page address.

    Returns:
        A list of product-page URLs in the order they appear on the page.
    """
    soup = get_soup(url)
    eyeglasses_url_list = []
    for item in soup.find_all("div", {'class': 'item-image'}):
        link = item.find('a', {'class': 'event-list-link'})
        href = link.get("href") if link is not None else None
        if not href:
            # A tile without a product link has no page to scrape; skip it
            # rather than failing with an AttributeError/TypeError.
            continue
        eyeglasses_url_list.append(url + href)

    return eyeglasses_url_list

In [251]:
def get_info(url):
    """Scrape a single product page and return the attributes of the glasses.

    Args:
        url: Address of the product page.

    Returns:
        A tuple ``(Brand, Price, Color, Weight, Material)`` of strings.

    Raises:
        AttributeError: If one of the expected elements is missing from the
            page (``soup.find`` returns ``None`` in that case).
        IndexError: If the weight text contains no digits.
    """
    soup = get_soup(url)

    # Brand: the node right after <h1 itemprop="name">, all whitespace removed.
    name_node = soup.find("h1", {"itemprop": "name"}).next
    brand = re.sub(r'\s+', '', name_node)

    # Price: the "content" attribute of <meta itemprop="price">.
    price = soup.find("meta", {"itemprop": "price"}).get("content")

    # Color: the node after the color icon, without its "Color: " label.
    color_node = soup.find("i", {"class": "im im-details im-color"}).next
    color = re.sub(r'Color: ', '', color_node)

    # Weight: the first run of digits in the node after the weight icon.
    weight_node = soup.find("i", {"class": "im im-details im-weight"}).next
    weight = re.findall(r'\d+', weight_node)[0]

    # Material: taken from the "Materials" link target, with the
    # "/eyeglasses/" part of the href removed.
    material_href = soup.find("a", {"data-event-label": "Materials"}).get("href")
    material = re.sub(r'/eyeglasses/', '', material_href)

    return brand, price, color, weight, material

In [252]:
def parallel_scraper(urls, num_threads=20):
    """Scrape every URL in ``urls`` concurrently using worker threads.

    All URLs are placed on a shared queue; each worker thread runs
    ``scraper_thread``, which pulls URLs off the queue and appends the
    scraped result to a shared list.

    Args:
        urls: Iterable of product-page URLs to scrape.
        num_threads: Maximum number of worker threads to start. Defaults to
            20. No more threads than URLs are started.

    Returns:
        The list of results appended by the workers. The order follows
        completion, not the order of ``urls``.

    Raises:
        ValueError: If ``num_threads`` is smaller than 1 (with no workers
            the queue would never drain and ``join()`` would block forever).
    """
    if num_threads < 1:
        raise ValueError("num_threads must be at least 1")

    urls = list(urls)
    if not urls:
        # Nothing to scrape: avoid spinning up idle threads.
        return []

    eyeglasses_list = []
    queue_list = Queue()
    for url in urls:
        queue_list.put(url)

    for _ in range(min(num_threads, len(urls))):
        worker = Thread(target=scraper_thread, args=(queue_list, eyeglasses_list))
        # Daemon threads do not keep the interpreter alive if one gets stuck.
        worker.daemon = True
        worker.start()

    # Block until every queued URL has been marked done by a worker.
    queue_list.join()
    return eyeglasses_list

def scraper_thread(queue_list, eyeglasses_list):
    """Worker loop: scrape URLs from ``queue_list`` until it is empty.

    Args:
        queue_list: ``queue.Queue`` of product-page URLs shared by all
            workers.
        eyeglasses_list: Shared list that receives one ``get_info`` result
            per successfully scraped URL.

    A URL whose scrape raises is logged and skipped, so the returned data
    may contain fewer rows than there were URLs. Every URL taken from the
    queue is always marked done, so ``queue_list.join()`` in the caller
    cannot block forever because of a failed page.
    """
    import logging
    from queue import Empty

    while True:
        try:
            # get_nowait() instead of empty() followed by get(): another
            # worker may take the last URL between the two calls, which would
            # leave this thread blocked in get() forever.
            url = queue_list.get_nowait()
        except Empty:
            return
        try:
            eyeglasses_list.append(get_info(url))
        except Exception:
            # Report the failure and carry on with the remaining URLs; letting
            # the thread die here could leave queued URLs without any worker.
            logging.getLogger(__name__).exception("Failed to scrape %s", url)
        finally:
            queue_list.task_done()  # always release the slot for join()

In [253]:
# Collect the product links from the main page, scrape them with the
# threaded scraper, and measure how long the whole run takes.
url = 'https://www.eyebuydirect.com/eyeglasses'
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run.
toc = perf_counter()  # start of the measurement
eyeglasses_url_list = parse_eyeglasses_url_list(url)
glasses_info_par = parallel_scraper(eyeglasses_url_list)
tic = perf_counter()  # end of the measurement
print(f"Scraping time: {round(tic - toc, 1)}s")

Scraping time: 6.6s


# Conclusion
### We can see that the scraping time decreased from 46.6s to 6.6s, i.e. about 7 times faster :)

In [255]:
# Build the DataFrame for the threaded run. Passing the column names to the
# constructor keeps this working even when nothing was scraped.
df_par = pd.DataFrame(
    glasses_info_par,
    columns=['Brand', 'Price', 'Color', 'Weight', 'Material'],
)
# Display every scraped row. The row count comes from glasses_info_par;
# the previous `len(results)` referred to a name not defined at this scope.
df_par.head(len(glasses_info_par))

Unnamed: 0,Brand,Price,Color,Weight,Material
0,Chilling,16,Black,11,plastic
1,Chillax,16,Tortoise,12,metal
2,Hepburn,40,Clear/White,18,acetate
3,StMichel,32,Golden,15,metal
4,Dialogue,15,Aubergine,8,plastic
5,Muse,16,Blue Floral,13,plastic
6,PaloAlto,33,Silver,13,metal
7,Prism,65,Translucent,16,acetate
8,NottingHill,30,Blue Floral,18,acetate
9,Bristol,20,Matte Black,21,acetate
