# Tutorial from: Web Scraping in Python with Beautiful Soup and Requests
# by Jan Kirenz
# https://www.kirenz.com/post/2022-05-02-web-scraping-in-python-with-beautiful-soup-requests-and-pandas/

## Import our packages (pandas, requests, and BeautifulSoup), then load our test URL and fetch the page.

In [None]:
import pandas as pd

import requests
from bs4 import BeautifulSoup

# Landing page of the demo site used as the scraping target.
url = 'http://quotes.toscrape.com/'

# Issue an HTTP GET for the page; the response is kept in `html` and its
# `.text` is parsed in a later cell.
html = requests.get(url)

# Bare expression: as the last line of a notebook cell this displays the
# response object.
html

## How do we read the information we've scraped?

In [None]:
# Parse the downloaded markup into a navigable tree, explicitly selecting the
# 'html.parser' backend.
soup = BeautifulSoup(html.text, 'html.parser')
# Print the parsed document re-indented so its structure is easier to read.
print(soup.prettify())

### The title of the page may have data that we want to store.  How do we get this? 

In [None]:
# The page's <title> element, returned as a tag object.
soup.title

In [None]:
# The name of the <title> tag itself (not its contents).
soup.title.name

In [None]:
# The text contained inside the <title> tag.
soup.title.string

### We can also get the first link on the page by using the .a attribute

In [None]:
# Attribute access by tag name returns the first matching tag: here, the
# first <a> (link) element in the document.
soup.a

### Now get the text of the first `<span>` element on the page

In [None]:
# Text content of the first <span> element in the document.
soup.span.text

### Each quote appears to be wrapped in a `div` tag with the class `quote`

In [None]:
# Collect every <div class="quote"> element on the page.
quotes = soup.find_all('div', class_='quote')

# Display the raw matches as the cell output.
quotes

### Let's make the retrieval more legible 

In [None]:
# Collect every <div class="quote"> element on the page.
quotes = soup.find_all('div', class_='quote')

# Print only the quote text, which lives in the <span class="text"> child of
# each quote block.
for quote_div in quotes:
    text_span = quote_div.find('span', class_='text')
    print(text_span.text)

### Let's look for the authors

In [None]:
# Print the author of every quote on the page. The author name sits in the
# <small class="author"> element inside each <div class="quote">.
# `find_all` is used (rather than the deprecated `findAll` alias) to match the
# other cells in this notebook.
for quote_div in soup.find_all("div", {"class": "quote"}):
    print(quote_div.find("small", {"class": "author"}).text)

### Now the additional metadata on the page

In [None]:
# Print the keyword metadata of every quote on the page. Each
# <div class="tags"> holds a <meta> element whose `content` attribute carries
# the tags as a single string.
# `find_all` is used (rather than the deprecated `findAll` alias) to match the
# other cells in this notebook.
for tags_div in soup.find_all("div", {"class": "tags"}):
    print(tags_div.find("meta")["content"])

### All of this is great but how do we put this in a table?

In [None]:
# Root URL of the paginated listing; the page number is appended per request.
root = 'http://quotes.toscrape.com/page/'

# Parallel lists: entry n of each list describes the same quote, so they can
# be used directly as the columns of a DataFrame.
quotes = []
authors = []
tags = []

# Loop over pages 1 to 10 inclusive. range() excludes its stop value, so the
# stop must be 11 for page 10 to be fetched.
for page in range(1, 11):

    html = requests.get(root + str(page), timeout=30)
    # Fail loudly on an HTTP error instead of silently parsing an error page
    # and ending up with missing rows.
    html.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # parsers happen to be installed (and to match the earlier parsing cell).
    soup = BeautifulSoup(html.text, 'html.parser')

    # Walk each quote block once and pull all three fields from it, so the
    # three lists always stay the same length and row-aligned.
    for quote in soup.find_all('div', {'class': 'quote'}):
        quotes.append(quote.find('span', {'class': 'text'}).text)
        authors.append(quote.find('small', {'class': 'author'}).text)
        # The tags are stored as one string in the `content` attribute of the
        # <meta> element inside the quote's <div class="tags">.
        tags.append(quote.find('div', {'class': 'tags'}).find('meta')['content'])

# One row per quote, one column per scraped field.
df = pd.DataFrame(
    {'Quotes': quotes,
     'Authors': authors,
     'Tags': tags,
    })

### Print our Dataframe

In [None]:
# Display the assembled DataFrame as the cell output.
df

In [None]:
# Convert the pandas DataFrame into a Spark DataFrame.
# NOTE(review): `spark` is not defined anywhere in this file; presumably it is
# a SparkSession injected by the notebook runtime -- confirm before running
# this outside that environment.
AuthorQuotes = spark.createDataFrame(df)

In [None]:
# Persist the Spark DataFrame in Delta format at the relative path
# "Tables/AuthorQuotes".
# NOTE(review): no save mode is set, so re-running this cell presumably fails
# once the table already exists -- confirm, and consider an explicit
# .mode("overwrite") or .mode("append") if re-runs are expected.
AuthorQuotes.write.format("Delta").save("Tables/AuthorQuotes")