In [1]:
# Dependencies
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from sqlalchemy import create_engine
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
engine = create_engine('sqlite://', echo=False)

In [3]:
# Returns list of all breeds from 'Dogtime' 
def get_breeds():
    temp = []
    url = 'https://dogtime.com/dog-breeds/profiles'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    results = soup.find_all('div', class_="list-item")
    for result in results:
        try:
            breed = result.find('a', class_="list-item-title").text
            if (breed):
                temp.append(breed)
        except AttributeError as e:
            print(e)
    return temp

In [4]:
# Returns attributes of individual breed
def get_attributes(breed):
    try:
        url = f"https://www.akc.org/dog-breeds/{breed}/"
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        retreiver = soup.find('div', class_='panel-flex__aside')

        attributes = []
        for list_ in retreiver.ul:
            for attr in list_:
                try:
                    attributes.append(attr.text.strip())
                except:
                    attributes.append(attr.strip())

        attributes  = [a for a in attributes if a is not '']

        keys = [a for a in attributes if attributes.index(a) % 2 == 0]
        keys = [x[:-1] for x in keys]
        values = [a for a in attributes if attributes.index(a) % 2 != 0]
        dictionary = dict(zip(keys, values))

        # Add grooming frequency
        groom = soup.find('div', class_="bar-graph__text").text
        groomkey = {'Grooming frequency': groom}
        dictionary.update(groomkey)

        # Add shedding measure
        shed = soup.find_all('div', class_="bar-graph__text")
        shed = shed[(len(shed)-4)]
        shed = str(shed)
        shed = shed[29:-6]
        shedkey = {'Shedding': shed}
        dictionary.update(shedkey)

        # Add trainability measure
        train = soup.find_all('div', class_="bar-graph__text")
        train = train[(len(train)-2)]
        train = str(train)
        train = train[29:-6]
        trainkey = {'Trainability': train}
        dictionary.update(trainkey)

        # Add activity measure
        activity = soup.find_all('div', class_="bar-graph__text")
        activity = activity[(len(activity)-3)]
        activity = str(activity)
        activity = activity[29:-6]
        activitykey = {'Energy level': activity}
        dictionary.update(activitykey)

        # Clean up & add breed name
        breedvalue = str(breed)
        breedvalue = breedvalue.replace("-", " ")
        breedkey = {'Breed' : breedvalue.title()}
        dictionary.update(breedkey)

        # Clean up & add url for image of breed
        img = soup.find('div', class_="basic-slider__inner").find("img")
        pupurl = (str(img).split("src="))
        link = [x[1:-3] for x in pupurl]
        image = link[1]
        linkkey = {'Image URL' : image}
        dictionary.update(linkkey)

        output = [dictionary]
        return output
    
    except AttributeError as e:
        print(e)
        print(f"Could not find page for: {breed.title()}")

In [5]:
# On the backburner for now, will code later
# def get_funfact(breed):
#     url = f"https://www.akc.org/dog-breeds/{breed}/"
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, "html.parser")
#     retreiver = soup.find('div', class_='fact-slider__slide-content')
    
#     return funfact

In [6]:
# Filling dataframe using functions get_breeds and get_attributes
df = pd.DataFrame()
breeds = get_breeds()
ready_breeds = [e.replace(" ", "-") for e in breeds]
ready_breeds = [e.lower() for e in ready_breeds]
for pup in ready_breeds:
    df = df.append(get_attributes(pup))

'NoneType' object has no attribute 'ul'
Could not find page for: Alaskan-Klee-Kai
'NoneType' object has no attribute 'ul'
Could not find page for: American-Bulldog
'NoneType' object has no attribute 'ul'
Could not find page for: American-Pit-Bull-Terrier


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


'NoneType' object has no attribute 'ul'
Could not find page for: Basset-Hound
'NoneType' object has no attribute 'ul'
Could not find page for: Bernedoodle
'NoneType' object has no attribute 'ul'
Could not find page for: Black-Mouth-Cur
'NoneType' object has no attribute 'ul'
Could not find page for: Blue-Lacy
'NoneType' object has no attribute 'ul'
Could not find page for: Cockapoo
'NoneType' object has no attribute 'ul'
Could not find page for: Fox-Terrier
'NoneType' object has no attribute 'ul'
Could not find page for: Goldador
'NoneType' object has no attribute 'ul'
Could not find page for: Goldendoodle
'NoneType' object has no attribute 'ul'
Could not find page for: Jack-Russell-Terrier
'NoneType' object has no attribute 'ul'
Could not find page for: Japanese-Spitz
'NoneType' object has no attribute 'ul'
Could not find page for: Korean-Jindo-Dog
'NoneType' object has no attribute 'ul'
Could not find page for: Labradoodle
'NoneType' object has no attribute 'text'
Could not find page

In [7]:
df = df.set_index("Breed")
df.head()
df.to_csv("..\Resources\dirtydatascraped.csv")