# ETF Weight Scraping
This is to retrieve and compile portfolio weights from ETFs where this information is not released as a .csv or .xls file. Useful to verify the total asset allocation across the holistic portfolio.

## Documentation
https://docs.python.org/3/library/http.cookiejar.html <br>
https://docs.python.org/3/library/urllib.request.html#module-urllib.request <br>
https://stackabuse.com/guide-to-parsing-html-with-beautifulsoup-in-python/ <br>
https://docs.python-requests.org/en/latest/ <br> <br>
#### Selenium
https://selenium-python.readthedocs.io/installation.html <br>
https://www.selenium.dev/documentation/webdriver/elements/finders/ <br>
https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.support.expected_conditions


## Import 

In [143]:
import pandas as pd
import numpy as np
import os
import requests
import json
from bs4 import BeautifulSoup
from http.cookiejar import CookieJar
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

## Function to extract the source code from a given page
1. Supply the url of the table location and test the connection, go past the disclaimer pages
2. Make sure the entire content of the page is displayed; check the JavaScript on the page for this
3. Save the code as a flat file or html file or whatever works

In [9]:
def url_scrap(url, path):
    """
    Saves the full content of a web page on the drive.

    Args:
        url : url of the page to download
        path : where to save the file on the drive
    """
    # Getting the source code, it will be held in r.text
    r = requests.get(url)

    # Saving the raw markup. json.dump would wrap the text in quotes and
    # escape every newline and quote, so the saved file would not be HTML.
    # The context manager closes (and flushes) the file even if write fails,
    # and the explicit encoding avoids depending on the platform default.
    with open(path, "w", encoding="utf-8") as page_save:
        page_save.write(r.text)

    print("File saved at " + path)


In [10]:
# Target page and destination file (in the current working directory)
url = "https://fundcentres.lgim.com/uk/en/fund-centre/PMC/World-Equity-Index-Fund"
path = f"{os.getcwd()}/World-Equity-Index-Fund.html"
url_scrap(url, path)

## Finding a way to bypass the disclaimer pages

In [29]:
# Send the request with a freshly created cookie jar attached
cj = CookieJar()
r = requests.get(url, cookies = cj)

In [31]:
# Display the cookies attached to the response
r.cookies

<RequestsCookieJar[Cookie(version=0, name='csrftoken', value='kAXcLwzrA907fFXW8U42lKrPrVbXD814EHKCZzC6HWQCULM9cyoEtV5hYVihNYO0', port=None, port_specified=False, domain='fundcentres.lgim.com', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=True, expires=1672231083, discard=False, comment=None, comment_url=None, rest={'HttpOnly': None, 'SameSite': 'Lax'}, rfc2109=False), Cookie(version=0, name='sessionid', value='8qqfxo5wouqwyhrafe1268kg2kjbbkhp', port=None, port_specified=False, domain='fundcentres.lgim.com', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=True, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None, 'SameSite': 'lax'}, rfc2109=False)]>

### Trying selenium

In [153]:
# Start a Chrome session driven by selenium and load the page
driver = webdriver.Chrome()
driver.get(url) # It works

In [154]:
# Locate the cookie accept button by its element id and click it
cookie_accept = driver.find_element(By.ID, "onetrust-accept-btn-handler")
cookie_accept.click() # It works

In [155]:
# Locate the pop-up checkbox by its element id and click it
# NOTE(review): the id looks like an unrendered template expression; it is matched verbatim
terms_tick = driver.find_element(By.ID, "popup-checkbox-$tools.math.add($velocityCount,-1)")
terms_tick.click() # It works

%%html
<img src="img/l&g-accept-button.png">

In [88]:
# Probe several candidate locators for the accept button; each entry of the
# result is the list of elements matched by the locator at the same position
search_button = [
    driver.find_elements(strategy, value)
    for strategy, value in (
        (By.CLASS_NAME, "btn btn-secondary btn-accept"),
        (By.CLASS_NAME, "btn-btn-secondarybtn-accept"),
        (By.CLASS_NAME, "btn.btn-secondary.btn-accept"),
        (By.NAME, "Accept"),
    )
]
search_button

[[],
 [],
 [<selenium.webdriver.remote.webelement.WebElement (session="2f86245a8495d1e05408a01a512ea580", element="2ab686ed-ce93-4781-bef5-262411e49975")>],
 []]

In [94]:
# Now there's an issue where the click doesn't work because we're now dealing with a list:
# check the type returned by find_elements (plural)
type(driver.find_elements(By.CLASS_NAME, "btn.btn-secondary.btn-accept"))

list

In [156]:
# Trying the find_element (singular) approach so that the result can be clicked directly
terms_accept = driver.find_element(By.CLASS_NAME, "btn.btn-secondary.btn-accept") # It works!
terms_accept.click() # Now working

In [129]:
# Trying with Action Chains: move to the accept button, then click it
webdriver.ActionChains(driver).move_to_element(terms_accept).click(terms_accept).perform()

In [147]:
# Testing expected conditions from webdriver: wait (10 second timeout) until
# the cookie accept button is present, then show the element that was returned
cookie_accept = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "onetrust-accept-btn-handler")))
print(cookie_accept)

<selenium.webdriver.remote.webelement.WebElement (session="40a71adf8f7bd8ac07c5881a281fbffd", element="84d9fcec-4a82-431a-bcb8-fdb6f1c13f03")>


The selenium steps work, but the requests.get call that downloads the page does not share the selenium browser session, so they bring no benefit until either 1) requests.get receives the cookies from selenium or 2) the page is downloaded directly from selenium.

In [160]:
# Trying option 2 straight away: take the page source from the selenium session
html = driver.page_source

In testing, the above works fine, but with the combined function we end up downloading the terms and conditions again, so we need another wait.

In [173]:
# Adding a wait until the disclaimer is gone: the accept button becomes stale
# once the pop-up is gone, and only then is the page source read
WebDriverWait(driver,10).until(EC.staleness_of(terms_accept))
html = driver.page_source

# Testing if the error comes from the way the file is saved.
# The context manager closes the file even if the write fails, and the
# explicit encoding avoids depending on the platform default.
path = os.getcwd() + "/World-Equity-Index-Fund.html"
with open(path, "w", encoding="utf-8") as page_save:
    page_save.write(html)

### The combined function at the end

In [174]:
def _accept_lgim_disclaimers(driver, timeout=10):
    """
    Clicks through the LGIM cookie banner and terms pop-up, then waits until
    the pop-up's accept button is stale so the page source is the real content.
    """
    # Cookies banner at the top of LGIM
    cookie_accept = WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.ID, "onetrust-accept-btn-handler")))
    cookie_accept.click()

    # Terms and conditions and confirmation we are professional investors
    terms_tick = driver.find_element(By.ID, "popup-checkbox-$tools.math.add($velocityCount,-1)")
    terms_tick.click()
    terms_accept = driver.find_element(By.CLASS_NAME, "btn.btn-secondary.btn-accept")
    terms_accept.click()

    # Reading the page source before the accept button is stale would
    # download the terms and conditions again instead of the fund page
    WebDriverWait(driver, timeout).until(EC.staleness_of(terms_accept))
    print("wait complete") # DEBUG


def url_scrap(url, path, timeout=10):
    """
    Saves the full content of a web page on the drive, bypassing disclaimer pages.
    Works only with Chrome. Disclaimers are bypassed for the following websites:
        - lgim.com
    Pages from any other website are saved as loaded, without disclaimer handling.
    Args:
        url : url to be scraped
        path : where to save the file on the drive
        timeout : maximum number of seconds to wait for each disclaimer step
    """
    driver = webdriver.Chrome() # TO DO - ADD A SELECTOR TO RECOGNISE THE BROWSER IN USE
    try:
        driver.get(url)

        # ADD ASSERTIONS FOR WHEN MORE FLEXIBLE

        # Bypassing disclaimers on the page to get to the real content
        if "lgim.com" in url:
            _accept_lgim_disclaimers(driver, timeout)

        # Getting the source code
        html = driver.page_source
    finally:
        # quit() ends the whole browser session even when a step above raised;
        # close() only closed the window and was skipped on any error
        driver.quit()

    # Saving; the context manager closes the file even if the write fails and
    # the explicit encoding avoids depending on the platform default
    with open(path, "w", encoding="utf-8") as page_save:
        page_save.write(html)

    print("File saved at " + path)

In [176]:
# Target page and destination file (in the current working directory)
url = "https://fundcentres.lgim.com/uk/en/fund-centre/PMC/World-Equity-Index-Fund"
path = f"{os.getcwd()}/World-Equity-Index-Fund.html"
url_scrap(url, path)

wait complete
File saved at /Users/mariagevrey/Dropbox/Romaria/Studies/Python Training Investment Management/World-Equity-Index-Fund.html


## Function to list all of the HTML and CSS tags in the page and create a dataframe with the content

soup 

## Function to translate any HTML table into a dataframe

In [32]:
# NOTE(review): os.path is a module, not a function, so calling it raises TypeError;
# the intended call is unclear — confirm what was meant here
os.path()

TypeError: 'module' object is not callable