# This notebook scrapes data from the Daniel Smith website and returns the information for insertion into the SQL database

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:

url = "http://danielsmith.com/category/wc5ml/page/4"

# Fetch the category listing page. The parser name belongs to BeautifulSoup,
# not to requests.get(): there the second positional argument is `params`,
# so passing "html.parser" would send it along as a query string.
webpage_response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
webpage_response.raise_for_status()
# The page is HTML, so parse it with the HTML parser rather than as XML.
soup = BeautifulSoup(webpage_response.content, "html.parser")

In [3]:
# The <h4> tags on the listing page hold, for each paint, an <a> element
# whose text is the product title and whose href is the paint's own page.
meta = soup.find_all('h4')

In [4]:
# Accumulates one [name, link] pair per paint found on the listing page.
colors = []

In [5]:
# Rebuild the list from scratch so that re-running this cell does not
# append the same paints a second time.
colors = []
for heading in meta:
    anchor = heading.find('a')
    # Skip headings without a link or without a plain-text title; calling
    # .split() on None would otherwise abort the whole loop.
    if anchor is None or anchor.string is None:
        continue
    href = anchor.get('href')
    if href is None:
        continue
    title = anchor.string
    # Keep only the text before the first "5" (presumably the "5ml ..."
    # suffix of the product title), i.e. the color name.
    title_strip = title.split("5")[0].strip()
    colors.append([title_strip, href])
    print(title_strip)
    print(href)

Perylene Green
http://danielsmith.com/perylene-green-5ml-tube-daniel-smith-extra-fine-watercolor/
Undersea Green
http://danielsmith.com/undersea-green-5ml-tube-daniel-smith-extra-fine-watercolor/
Green Gold
http://danielsmith.com/green-gold-5ml-tube-daniel-smith-extra-fine-watercolor/
Rich Green Gold
http://danielsmith.com/rich-green-gold-5ml-tube-daniel-smith-extra-fine-watercolor/
Nickel Azo Yellow
http://danielsmith.com/nickel-azo-yellow-5ml-tube-daniel-smith-extra-fine-watercolor/
Yellow Ochre
http://danielsmith.com/yellow-ochre-5ml-tube-daniel-smith-extra-fine-watercolor/
Raw Sienna
http://danielsmith.com/raw-sienna-5ml-tube-daniel-smith-extra-fine-watercolor/
Quinacridone Gold
http://danielsmith.com/quinacridone-gold-5ml-tube-daniel-smith-extra-fine-watercolor/
Quinacridone Deep Gold
http://danielsmith.com/quinacridone-deep-gold-5ml-tube-daniel-smith-extra-fine-watercolor/
Quinacridone Burnt Orange
http://danielsmith.com/quinacridone-burnt-orange-5ml-tube-daniel-smith-extra-fine-

In [6]:
# Collect the scraped [name, link] pairs into a two-column DataFrame.
colors_df = pd.DataFrame(
    data=colors,
    columns=["name", "link"],
)

In [7]:
# Remove the pearlescent and iridescent colors: keep only the rows whose
# link contains "extra".
is_extra_fine = colors_df["link"].str.contains("extra")
extra_fine = colors_df[is_extra_fine]

In [8]:
# Renumber the rows 0..n-1 after filtering; the old index labels are discarded.
extra_fine = extra_fine.reset_index(drop=True)

In [11]:
# Persist only the extra_fine subset as CSV.
extra_fine_csv = "/Users/macbook/Box/git_hub/Insight_Project_clean/data/extra_fine.csv"
extra_fine.to_csv(extra_fine_csv)

In [None]:
# Persist every scraped color (the unfiltered table) as CSV.
color_links_csv = "/Users/macbook/Box/git_hub/Insight_Project_clean/data/color_links.csv"
colors_df.to_csv(color_links_csv)

## Using the links above, scrape the image data and pigment information

In [15]:
import re

In [16]:
# Accumulates one row of scraped details per extra-fine paint.
pigments = []

In [17]:
# Rebuild the list from scratch so that re-running this cell does not
# duplicate rows.
pigments = []
for name, paint_url in zip(extra_fine["name"], extra_fine["link"]):
    # Fetch the paint's own page. requests.get() takes no parser argument
    # (its second positional parameter is `params`), so the parser is given
    # to BeautifulSoup instead. Separate variable names keep the
    # listing-page `url` / `soup` from the earlier cells intact.
    paint_response = requests.get(paint_url, timeout=30)
    # Stop on an HTTP error rather than scraping an error page.
    paint_response.raise_for_status()
    paint_soup = BeautifulSoup(paint_response.content, "html.parser")
    # get the image url from the element carrying property="og:image"
    meta_image = paint_soup.find(property="og:image")
    image_url = meta_image['content']
    # get the pigment information: each value is the first text node that
    # matches the label; for the pigment text only the part before the
    # first "|" is kept
    info = paint_soup.find(string=re.compile("Pigment")).strip().split("|")[0]
    light = paint_soup.find(string=re.compile("Lightfastness")).strip()
    trans = paint_soup.find(string=re.compile("Transparency")).strip()
    staining = paint_soup.find(string=re.compile("Staining")).strip()
    gran = paint_soup.find(string=re.compile("Granulating")).strip()
    # number of comma-separated entries in the pigment text
    pig_num = len(info.split(','))
    pigments.append([name, paint_url, image_url, info, pig_num, light, trans, staining, gran])
    
    

In [None]:
# Assemble the scraped rows into a DataFrame and write it out as CSV.
pigment_columns = [
    'name', 'url', 'image_url', 'pigments', 'pigment_num',
    'lightfastness', 'transparency', 'staining', 'granulation',
]
pigment_complete = pd.DataFrame(pigments, columns=pigment_columns)
pigment_complete_csv = "/Users/macbook/Box/git_hub/Insight_Project_clean/data/pigment_complete.csv"
pigment_complete.to_csv(pigment_complete_csv)

## Download the images and save to local drive


In [18]:
# Load the pigment table from its CSV file.
pigment_csv = "/Users/macbook/Box/git_hub/Insight_Project_clean/data/pigment_complete.csv"
pigment = pd.read_csv(pigment_csv)

In [19]:
# Accumulates one [color name, local image file path] pair per paint.
paths = []

In [None]:
# Start from an empty list so that re-running this cell does not append
# every path a second time.
paths = []
# save the files with lower case names and replace spaces with snake syntax:
# ' ' and '(' become '_', ')' is dropped. The table is the same for every
# row, so it is built once outside the loop.
name_to_file = str.maketrans({' ': '_', '(': '_', ')': ''})
for color_name in pigment["name"]:
    file_path = "/Users/macbook/Box/insight_project_data/swatches/ds/" + color_name.translate(name_to_file).lower() + ".jpg"
    paths.append([color_name, file_path])

In [None]:
# Turn the [name, file_path] pairs into a DataFrame and write it to CSV.
paths_df = pd.DataFrame(data=paths, columns=["name", "file_path"])
paths_csv = "/Users/macbook/Box/git_hub/Insight_Project_clean/data/paths.csv"
paths_df.to_csv(paths_csv)

In [None]:
#download each image using the modified pigment name
# (' ' and '(' become '_', ')' is dropped, then the name is lower-cased)
name_to_file = str.maketrans({' ': '_', '(': '_', ')': ''})
for paint_name, image_url in zip(pigment["name"], pigment["image_url"]):
    # Write into the swatch folder used for the recorded file paths, so the
    # downloaded files are where the cropping step looks for them rather
    # than in whatever the current working directory happens to be.
    filename = "/Users/macbook/Box/insight_project_data/swatches/ds/" + paint_name.translate(name_to_file).lower() + ".jpg"
    # The `with` block releases the streamed connection even if writing fails.
    with requests.get(image_url, stream=True, timeout=30) as response:
        # Check the status before creating the file, so a failed request
        # does not leave an error page saved under a .jpg name.
        if not response.ok:
            print(response)
            continue

        with open(filename, 'wb') as handle:
            for block in response.iter_content(1024):
                if not block:
                    break

                handle.write(block)

# Cropping images
Each image is 400 x 400 pixels.
Bounding box = 210, 42, 277, 202 (note: the crop cell below uses the box 215, 42, 277, 150).

In [None]:
# Accumulates the save path of each cropped swatch image.
crop_path = []

In [20]:
#crops each full image and saves a small cropped version
# NOTE(review): `Image` is not imported in any visible cell; this presumably
# needs `from PIL import Image` (Pillow) in the imports cell - confirm.
# Start from an empty list so that re-running this cell does not duplicate
# the cropped paths.
crop_path = []
# ' ' and '(' become '_', ')' is dropped; built once for all rows.
name_to_file = str.maketrans({' ': '_', '(': '_', ')': ''})
# Name and source path are read from the same paths_df row, so they cannot
# drift out of step with each other.
for swatch_name, im_path in zip(paths_df["name"], paths_df["file_path"]):
    save_path = "/Users/macbook/Box/insight_project_data/swatches/ds_cropped/" + swatch_name.translate(name_to_file).lower() + "_crop.jpg"
    # Close the source image file as soon as the crop has been saved.
    with Image.open(im_path) as full_image:
        im = full_image.crop((215,42,277,150))
        im.save(save_path)
    crop_path.append(save_path)

NameError: name 'paths_df' is not defined

In [None]:
# Attach the cropped-image paths as a new column, then write the table to CSV.
paths_df["crop_path"] = crop_path
paths_df_csv = "/Users/macbook/Box/git_hub/Insight_Project_clean/data/paths_df.csv"
paths_df.to_csv(paths_df_csv)

In [None]:
# For each color generate a numeric label (1, 2, 3, ...) to be used in
# clustering later on and to connect clustered colors to SQL.
paths_df['label'] = list(range(1, len(paths_df) + 1))

In [None]:
# Write the table, now including the label column, to paths_df.csv.
paths_df_csv = "/Users/macbook/Box/git_hub/Insight_Project_clean/data/paths_df.csv"
paths_df.to_csv(paths_df_csv)