In [1]:
import requests
import bs4
import pandas as pd

In [2]:
def login_to_page(url, username, password):
    """Log in to an OrangeHRM website while properly handling the CSRF token.

    The login page is fetched first so that the CSRF token embedded in its
    hidden ``csrf_token`` input can be read; the token is then posted together
    with the supplied credentials to the credential-validation endpoint.

    Args:
      - url: Base url of the target website (the OrangeHRM login page).
        A trailing slash is tolerated.
      - username: username as string
      - password: password as string

    Returns:
      A requests.Session object on which the login request has been submitted

    Raises:
      requests.HTTPError: if fetching the login page or posting the
        credentials returns an HTTP error status.
      ValueError: if the login page does not contain the CSRF token field.
    """
    # Normalise so that joining with the endpoint path never yields "//".
    base_url = url.rstrip("/")

    session = requests.Session()
    try:
        # Get CSRF token from the login page of the *requested* site
        login_page = session.get(base_url)
        login_page.raise_for_status()
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        login_soup = bs4.BeautifulSoup(login_page.content, "html.parser")
        token_input = login_soup.find("input", type="hidden", id="csrf_token")
        if token_input is None:
            raise ValueError(
                f"No CSRF token field found on login page {base_url!r}"
            )
        csrf_token = token_input["value"]

        # Log in using supplied credentials
        login_data = {
            "txtUsername": username,
            "txtPassword": password,
            "_csrf_token": csrf_token,
        }
        login_response = session.post(
            base_url + "/index.php/auth/validateCredentials",
            data=login_data,
        )
        login_response.raise_for_status()
        # NOTE(review): a non-error HTTP status does not by itself prove the
        # credentials were accepted -- TODO confirm how the site signals a
        # rejected login and check for it here.
    except Exception:
        # Do not leak the connection pool when the login cannot be completed.
        session.close()
        raise

    return session

In [3]:
def parse_page(session, url):
    """Parse a given page on an OrangeHRM website containing a table.

    The page is expected to contain a ``<table id="resultTable">`` with a
    ``<thead>`` (column headers in ``<th>`` cells) and a ``<tbody>`` (one
    ``<tr>`` per record, values in ``<td>`` cells).

    Args:
      - session: a requests.Session object with the user already signed in
      - url: the url of the page to scrape

    Returns: a pandas.DataFrame containing the relevant data, one row per
      table row, with the header cell texts as column names

    Raises:
      requests.HTTPError: if the page request returns an HTTP error status.
      ValueError: if the page has no ``resultTable`` table, or the table
        lacks a ``<thead>``/``<tbody>`` section.
    """

    # We don't have to deal with the login in this function (separation of concerns)
    response = session.get(url)
    response.raise_for_status()

    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    page_soup = bs4.BeautifulSoup(response.content, "html.parser")

    # Look the table up once and fail with a clear message when it is absent.
    result_table = page_soup.find("table", id="resultTable")
    if result_table is None:
        raise ValueError(
            f"No table with id 'resultTable' found at {url!r}"
        )

    table_head = result_table.find("thead")
    table_body = result_table.find("tbody")
    if table_head is None or table_body is None:
        raise ValueError(
            f"Table 'resultTable' at {url!r} is missing its thead or tbody"
        )

    colnames = [col.text for col in table_head.find_all("th")]
    data = [
        [cell.text for cell in row.find_all("td")]
        for row in table_body.find_all("tr")
    ]

    return pd.DataFrame(data, columns=colnames)

In [4]:
# Authenticate once against the OrangeHRM demo instance; the resulting
# session is reused by the scraping cell that follows.
session = login_to_page(
    "https://opensource-demo.orangehrmlive.com",
    username="Admin",
    password="admin123",
)

In [5]:
# Scrape the recruitment candidates listing with the signed-in session.
candidates_df = parse_page(
    session,
    "https://opensource-demo.orangehrmlive.com/index.php/recruitment/viewCandidates",
)

In [6]:
# Preview the first five scraped rows.
candidates_df.head(n=5)

Unnamed: 0,Unnamed: 1,Vacancy,Candidate,Hiring Manager,Date of Application,Status,Resume
0,,Associate IT Manager,Banda Pavithra R,Odis Adalwin,2021-09-14,Application Initiated,Download
1,,Associate IT Manager,Banda Pavithra R,Odis Adalwin,2021-09-14,Application Initiated,Download
2,,Associate IT Manager,Banda Pavithra R,Odis Adalwin,2021-09-14,Application Initiated,Download
3,,Associate IT Manager,pavitra B R,Odis Adalwin,2021-09-14,Application Initiated,Download
4,,Junior Account Assistant,maren ibis salamo,Kevin Mathews,2021-09-14,Application Initiated,Download
