# Extract Transform Load Project
### Team Biscotti
---

In [None]:
# import dependencies
from bs4 import BeautifulSoup
import pandas as pd
import pymongo
import requests
from splinter import Browser
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy import func
from sqlalchemy import inspect
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from webdriver_manager.chrome import ChromeDriverManager

# postgres password, read from the local config module
from config import postgres_password as password

---
# Extract
---

### Web Scraping 
* Splinter
* Beautiful Soup
* Requests
* webdriver_manager

In [None]:
# Open a visible Chrome window, using the driver path that
# ChromeDriverManager().install() returns
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

# Landing page of the site to scrape
url = 'https://quotes.toscrape.com/'
browser.visit(url)

# Parse the HTML of the page the browser is currently showing
html = browser.html
soup = BeautifulSoup(html, features='html.parser')

* Loop through pages

In [None]:
# Scrape every quote on every page into a list of flat dictionaries.
# Each dictionary holds the author details (taken from the author's "about"
# page) followed by the quote text and the list of tags.
quotes_list = []

# Author details keyed by the "about" link, so each author page is opened
# once instead of once per quote.
author_cache = {}

try:
    while True:
        # Re-read the page on every pass: the browser has moved on since the
        # previous iteration.
        page_soup = BeautifulSoup(browser.html, 'html.parser')

        # Every quote on the page lives in its own <div class="quote">
        for box in page_soup.find_all('div', class_='quote'):
            # The first link inside a quote box is the "about author" link
            target = box.a['href']

            if target not in author_cache:
                # Open the author page, collect the details, then return to
                # the quote listing.
                browser.links.find_by_href(target).click()
                author_soup = BeautifulSoup(browser.html, 'html.parser')
                author_cache[target] = {
                    'author_name': author_soup.find('h3', class_='author-title').text,
                    'birth_date': author_soup.find('span', class_='author-born-date').text,
                    'birth_place': author_soup.find('span', class_='author-born-location').text,
                    'description': author_soup.find('div', class_='author-description').text.replace('\n', ''),
                }
                browser.back()

            # Copy the cached details so every quote owns an independent dict
            quote_mini = dict(author_cache[target])

            # The first <span> in the box holds the quote text
            quote_mini['quote_text'] = box.span.text

            # Select the tag links directly rather than splitting the block's
            # text and filtering out whitespace-only fragments.
            quote_mini['quote_tags'] = [tag.text for tag in box.find_all('a', class_='tag')]

            quotes_list.append(quote_mini)

        # Stop once the page no longer offers a "Next" link
        next_link = browser.links.find_by_text('Next ')
        if not next_link:
            break
        next_link.click()
finally:
    # Always close the browser session and driver, even if scraping failed
    browser.quit()

* Check results of web scraping

In [None]:
# Spot-check the scrape: display the last 11 entries of quotes_list
quotes_list[-11:]

In [None]:
# check author info web scraping results
for i in quotes_list:
    print(i['author_name'])

---
# Load
---

### Non-relational Database
* MongoDB
* PyMongo

In [None]:
# Connect to the MongoDB server on localhost
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Select the database that holds the scraped quotes
db = client['quotes_db']

# Drop any existing quotes collection so a re-run does not duplicate documents
db['quotes'].drop()

# Insert the scraped quote dictionaries in one bulk call
db['quotes'].insert_many(quotes_list)

* Check results of Load

In [None]:
# Read every document back out of the quotes collection and print them
list_of_quotes = list(db['quotes'].find({}))
print(list_of_quotes)

---
# Transform
---

### Clean Data
* Pandas

In [None]:
# Build the dataframe from the Mongo documents, leaving out Mongo's "_id" field
main_quote_df = pd.DataFrame(list_of_quotes).drop(columns=['_id'])

# Copy the row index into an explicit quote_id column
main_quote_df = main_quote_df.assign(quote_id=main_quote_df.index)

# check results
main_quote_df.head()

* Create Tags Table

In [2]:
# Build the tags table: one row per (quote_id, tag) pair, indexed by quote_id.
# explode() gives each element of a quote's tag list its own row. A quote with
# an empty tag list explodes to a single NaN row, which is dropped so the
# table only holds real tags.
tags = (
    main_quote_df[['quote_tags', 'quote_id']]
    .explode('quote_tags')
    .dropna(subset=['quote_tags'])
    .set_index('quote_id')
    .rename(columns={'quote_tags': 'tags'})
)

# check results
tags

NameError: name 'main_quote_df' is not defined

* Create Quotes Table

In [None]:
# Quotes table: author and quote text, indexed by quote_id
quote_columns = ['author_name', 'quote_text', 'quote_id']
quotes_df = main_quote_df.loc[:, quote_columns].set_index('quote_id')

# check results
quotes_df.head()

* Create Author Table

In [None]:
# Author table: keep the author columns and collapse repeated authors to the
# first occurrence of each identical row
author_columns = ['author_name', 'birth_date', 'birth_place', 'description']
author_df = main_quote_df.loc[:, author_columns].drop_duplicates()

# check results
author_df.head()

---
# Load
---

### Relational database
* Pandas
* SQLAlchemy

In [None]:
# Connect to the existing Postgres database and reflect its tables so they can
# be queried through SQLAlchemy.

# create engine to postgres
engine = create_engine(f'postgresql://postgres:{password}@localhost:5432/sql-challenge')

# use engine to connect to existing tables/db
Database = automap_base()
Database.prepare(engine, reflect=True)

# View all of the classes/tables that automap found. Printed explicitly,
# because a bare expression in the middle of a cell displays nothing.
print(Database.classes.keys())

# Save references to each table (capital because they are considered classes)
# TODO: `table_name` is a placeholder; replace each one with the matching
# table name printed above. As written, all three refer to the same attribute.
Tags = Database.classes.table_name
Quots = Database.classes.table_name
Author = Database.classes.table_name

# Create our session (link) from Python to the DB
session = Session(bind=engine)

# `inspect` comes from `from sqlalchemy import inspect` in the import cell
inspector = inspect(engine)

# Use get_columns in order to write queries later
# TODO: replace the 'table_name' placeholder with a real table name
inspector.get_columns('table_name')