In [1]:
import re
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [8]:
#pull in website source code
prefix = 'https://content.codecademy.com/courses/beautifulsoup/'
url = 'https://content.codecademy.com/courses/beautifulsoup/shellter.html'

page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

In [56]:
#extract each turtle's link and each link into a list
turtle_links = soup.find_all('a')
links = []

#go through each link from turtle_links and add 'href' to create each turtle's page's link
for turtle_link in turtle_links:
    links.append(prefix + turtle_link['href'])
    
#create turtle_data dictionary with turtle_name as the key and info as the values
turtle_data = {}

for link in links:
    webpage = requests.get(link)
    turtle = BeautifulSoup(webpage.text, 'html.parser')
    ##after looking at the source code of individual turtle's page, we find that turtle names are linked with tag 'p'
    ## and attribute class=name
    turtle_name = turtle.find('p', attrs = {'class': 'name'}).get_text()
    
    ## same concept for each turtle's information
    turtle_data_values = []
    infos = turtle.find('ul')
    for info in infos.find_all('li'):
        turtle_data_values.append(info.get_text())
        
    turtle_data[turtle_name] = turtle_data_values
    
print(turtle_data)

{'Aesop': ['AGE: 7 Years Old', 'WEIGHT: 6 lbs', 'SEX: Female', 'BREED: African Aquatic Sideneck Turtle', 'SOURCE: found in Lake Erie'], 'Caesar': ['AGE: 2 Years Old', 'WEIGHT: 4 lbs', 'SEX: Male', 'BREED: Greek Tortoise', 'SOURCE: hatched in house'], 'Sulla': ['AGE: 1 Year Old', 'WEIGHT: 1 lb', 'SEX: Male', 'BREED: African Aquatic Sideneck Turtle', 'SOURCE: found in Lake Erie'], 'Spyro': ['AGE: 6 Years Old', 'WEIGHT: 3 lbs', 'SEX: Female', 'BREED: Greek Tortoise', 'SOURCE: hatched in house'], 'Zelda': ['AGE: 3 Years Old', 'WEIGHT: 2 lbs', 'SEX: Female', 'BREED: Eastern Box Turtle', 'SOURCE: surrendered by owner'], 'Bandicoot': ['AGE: 2 Years Old', 'WEIGHT: 2 lbs', 'SEX: Male', 'BREED: African Aquatic Sideneck Turtle', 'SOURCE: hatched in house'], 'Hal': ['AGE: 1 Year Old', 'WEIGHT: 1.5 lbs', 'SEX: Female', 'BREED: Eastern Box Turtle', 'SOURCE: surrendered by owner'], 'Mock': ['AGE: 10 Years Old', 'WEIGHT: 10 lbs', 'SEX: Male', 'BREED: Greek Tortoise', 'SOURCE: surrendered by owner'], '

In [29]:
# construct a DataFrame with the turtle_data dictionary
turtle_df = pd.DataFrame.from_dict(turtle_data)
print(turtle_df)

                                    Aesop                    Caesar  \
0                        AGE: 7 Years Old          AGE: 2 Years Old   
1                           WEIGHT: 6 lbs             WEIGHT: 4 lbs   
2                             SEX: Female                 SEX: Male   
3  BREED: African Aquatic Sideneck Turtle     BREED: Greek Tortoise   
4              SOURCE: found in Lake Erie  SOURCE: hatched in house   

                                    Sulla                     Spyro  \
0                         AGE: 1 Year Old          AGE: 6 Years Old   
1                            WEIGHT: 1 lb             WEIGHT: 3 lbs   
2                               SEX: Male               SEX: Female   
3  BREED: African Aquatic Sideneck Turtle     BREED: Greek Tortoise   
4              SOURCE: found in Lake Erie  SOURCE: hatched in house   

                          Zelda                               Bandicoot  \
0              AGE: 3 Years Old                        AGE: 2 Years Old

In [57]:
#switch the row and column names with .transpose()
final_df = turtle_df.transpose()
print(final_df)

                            0                1            2  \
Aesop        AGE: 7 Years Old    WEIGHT: 6 lbs  SEX: Female   
Caesar       AGE: 2 Years Old    WEIGHT: 4 lbs    SEX: Male   
Sulla         AGE: 1 Year Old     WEIGHT: 1 lb    SEX: Male   
Spyro        AGE: 6 Years Old    WEIGHT: 3 lbs  SEX: Female   
Zelda        AGE: 3 Years Old    WEIGHT: 2 lbs  SEX: Female   
Bandicoot    AGE: 2 Years Old    WEIGHT: 2 lbs    SEX: Male   
Hal           AGE: 1 Year Old  WEIGHT: 1.5 lbs  SEX: Female   
Mock        AGE: 10 Years Old   WEIGHT: 10 lbs    SEX: Male   
Sparrow    AGE: 1.5 Years Old  WEIGHT: 4.5 lbs  SEX: Female   

                                                3  \
Aesop      BREED: African Aquatic Sideneck Turtle   
Caesar                      BREED: Greek Tortoise   
Sulla      BREED: African Aquatic Sideneck Turtle   
Spyro                       BREED: Greek Tortoise   
Zelda                   BREED: Eastern Box Turtle   
Bandicoot  BREED: African Aquatic Sideneck Turtle  

In [58]:
#change column names into info categories
final_df.columns = ['age', 'weight', 'sex', 'breed', 'source']
print(final_df)

                          age           weight          sex  \
Aesop        AGE: 7 Years Old    WEIGHT: 6 lbs  SEX: Female   
Caesar       AGE: 2 Years Old    WEIGHT: 4 lbs    SEX: Male   
Sulla         AGE: 1 Year Old     WEIGHT: 1 lb    SEX: Male   
Spyro        AGE: 6 Years Old    WEIGHT: 3 lbs  SEX: Female   
Zelda        AGE: 3 Years Old    WEIGHT: 2 lbs  SEX: Female   
Bandicoot    AGE: 2 Years Old    WEIGHT: 2 lbs    SEX: Male   
Hal           AGE: 1 Year Old  WEIGHT: 1.5 lbs  SEX: Female   
Mock        AGE: 10 Years Old   WEIGHT: 10 lbs    SEX: Male   
Sparrow    AGE: 1.5 Years Old  WEIGHT: 4.5 lbs  SEX: Female   

                                            breed  \
Aesop      BREED: African Aquatic Sideneck Turtle   
Caesar                      BREED: Greek Tortoise   
Sulla      BREED: African Aquatic Sideneck Turtle   
Spyro                       BREED: Greek Tortoise   
Zelda                   BREED: Eastern Box Turtle   
Bandicoot  BREED: African Aquatic Sideneck Turtle  

In [60]:
#change values in age column from string to integer
#\d+ means to extract 1 or more digits
final_df['age'] = final_df.age.str.extract('(\d+)').apply(pd.to_numeric)
print(final_df)

           age           weight          sex  \
Aesop        7    WEIGHT: 6 lbs  SEX: Female   
Caesar       2    WEIGHT: 4 lbs    SEX: Male   
Sulla        1     WEIGHT: 1 lb    SEX: Male   
Spyro        6    WEIGHT: 3 lbs  SEX: Female   
Zelda        3    WEIGHT: 2 lbs  SEX: Female   
Bandicoot    2    WEIGHT: 2 lbs    SEX: Male   
Hal          1  WEIGHT: 1.5 lbs  SEX: Female   
Mock        10   WEIGHT: 10 lbs    SEX: Male   
Sparrow      1  WEIGHT: 4.5 lbs  SEX: Female   

                                            breed  \
Aesop      BREED: African Aquatic Sideneck Turtle   
Caesar                      BREED: Greek Tortoise   
Sulla      BREED: African Aquatic Sideneck Turtle   
Spyro                       BREED: Greek Tortoise   
Zelda                   BREED: Eastern Box Turtle   
Bandicoot  BREED: African Aquatic Sideneck Turtle   
Hal                     BREED: Eastern Box Turtle   
Mock                        BREED: Greek Tortoise   
Sparrow    BREED: African Aquatic Sideneck

In [61]:
#change values in weight column from string to integer
final_df['weight'] = final_df.weight.str.extract('(\d+)').apply(pd.to_numeric)
print(final_df)

           age  weight          sex                                   breed  \
Aesop        7       6  SEX: Female  BREED: African Aquatic Sideneck Turtle   
Caesar       2       4    SEX: Male                   BREED: Greek Tortoise   
Sulla        1       1    SEX: Male  BREED: African Aquatic Sideneck Turtle   
Spyro        6       3  SEX: Female                   BREED: Greek Tortoise   
Zelda        3       2  SEX: Female               BREED: Eastern Box Turtle   
Bandicoot    2       2    SEX: Male  BREED: African Aquatic Sideneck Turtle   
Hal          1       1  SEX: Female               BREED: Eastern Box Turtle   
Mock        10      10    SEX: Male                   BREED: Greek Tortoise   
Sparrow      1       4  SEX: Female  BREED: African Aquatic Sideneck Turtle   

                                 source  
Aesop        SOURCE: found in Lake Erie  
Caesar         SOURCE: hatched in house  
Sulla        SOURCE: found in Lake Erie  
Spyro          SOURCE: hatched in house  

In [62]:
#by looking at the data type of sex, breed, source columns, we find that they are series instead of strings
#use .astype('string') to change their type to strings
final_df['sex'] = final_df['sex'].astype("string")
final_df['breed'] = final_df['breed'].astype("string")
final_df['source'] = final_df['source'].astype('string')
print(final_df)

           age  weight          sex                                   breed  \
Aesop        7       6  SEX: Female  BREED: African Aquatic Sideneck Turtle   
Caesar       2       4    SEX: Male                   BREED: Greek Tortoise   
Sulla        1       1    SEX: Male  BREED: African Aquatic Sideneck Turtle   
Spyro        6       3  SEX: Female                   BREED: Greek Tortoise   
Zelda        3       2  SEX: Female               BREED: Eastern Box Turtle   
Bandicoot    2       2    SEX: Male  BREED: African Aquatic Sideneck Turtle   
Hal          1       1  SEX: Female               BREED: Eastern Box Turtle   
Mock        10      10    SEX: Male                   BREED: Greek Tortoise   
Sparrow      1       4  SEX: Female  BREED: African Aquatic Sideneck Turtle   

                                 source  
Aesop        SOURCE: found in Lake Erie  
Caesar         SOURCE: hatched in house  
Sulla        SOURCE: found in Lake Erie  
Spyro          SOURCE: hatched in house  

In [65]:
#clean the values of sex, breed, and source columns
final_df['sex'] = final_df['sex'].str.split().str[-1]
final_df['breed'] = final_df['breed'].str.split(':').str[-1]
final_df['source'] = final_df['source'].str.split(':').str[-1]
print(final_df)

           age  weight     sex                             breed  \
Aesop        7       6  Female   African Aquatic Sideneck Turtle   
Caesar       2       4    Male                    Greek Tortoise   
Sulla        1       1    Male   African Aquatic Sideneck Turtle   
Spyro        6       3  Female                    Greek Tortoise   
Zelda        3       2  Female                Eastern Box Turtle   
Bandicoot    2       2    Male   African Aquatic Sideneck Turtle   
Hal          1       1  Female                Eastern Box Turtle   
Mock        10      10    Male                    Greek Tortoise   
Sparrow      1       4  Female   African Aquatic Sideneck Turtle   

                          source  
Aesop         found in Lake Erie  
Caesar          hatched in house  
Sulla         found in Lake Erie  
Spyro           hatched in house  
Zelda       surrendered by owner  
Bandicoot       hatched in house  
Hal         surrendered by owner  
Mock        surrendered by owner  
Spar