# In [None]:
import configparser
import os
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Read the download directory from a config file
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means 'config.ini' was not loaded.
# Fail here with a clear message instead of a bare KeyError on the next lookup.
if not config.read('config.ini'):
    raise FileNotFoundError("Could not read 'config.ini' from the current working directory.")

# Directory where CSV files will be downloaded
csv_directory = config['files']['enhanced_loan_level_dir']

# Selenium setup for web scraping and file download
chrome_options = webdriver.ChromeOptions()
prefs = {"download.default_directory": csv_directory}
chrome_options.add_experimental_option("prefs", prefs)
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)


def _wait_or_abort(condition, failure_message, timeout=60):
    """Wait for *condition* on the module-level ``driver`` or stop the script.

    Args:
        condition: Callable accepted by ``WebDriverWait.until`` (an expected
            condition or a ``lambda driver: ...``).
        failure_message: Text printed if the wait times out.
        timeout: Maximum number of seconds to wait.

    Returns:
        Whatever truthy value *condition* produced (e.g. the located element).

    Raises:
        SystemExit: With status 1 after closing the browser, if the wait
            times out.
    """
    try:
        return WebDriverWait(driver, timeout).until(condition)
    except TimeoutException:
        print(failure_message)
        # Close the browser before stopping so no Chrome process is left behind.
        driver.quit()
        # `exit()` is an interactive-shell helper injected by `site`; raising
        # SystemExit works everywhere and reports a non-zero status on failure.
        raise SystemExit(1)


# Define the URL of the website and navigate to it
url = 'https://sf.citidirect.com/stfin/index.html'
driver.get(url)

# Wait until the ready state is complete
_wait_or_abort(
    lambda d: d.execute_script('return document.readyState') == 'complete',
    "Timed out waiting for the page to finish loading.",
)

# Switch to the frame that contains the 'MBS' link. Passing the bare string keeps
# the id-or-name lookup of `switch_to.frame("left")`, but retries until the frame
# exists instead of failing immediately.
_wait_or_abort(
    EC.frame_to_be_available_and_switch_to_it("left"),
    "Timed out waiting for the 'left' frame to be available.",
)

# Wait for the 'MBS' link to be clickable and click it
mbs_link = _wait_or_abort(
    EC.element_to_be_clickable((By.ID, 'MBS')),
    "Timed out waiting for the 'MBS' link to be clickable.",
)
mbs_link.click()

# Switch back to the main content and then to the frame that contains the '2006-AMC1' link
driver.switch_to.default_content()
_wait_or_abort(
    EC.frame_to_be_available_and_switch_to_it((By.NAME, "main")),
    "Timed out waiting for the main frame to be available.",
)
print("Switched to main frame.")

# Click the '2006-AMC1' link
link_2006_AMC1 = _wait_or_abort(
    EC.element_to_be_clickable((By.XPATH, "//a[normalize-space(.)='2006-AMC1']")),
    "Timed out waiting for the '2006-AMC1' link to be clickable.",
)
link_2006_AMC1.click()

def get_latest_downloaded_file(download_dir):
    """Return the path of the most recently modified file in *download_dir*.

    Only regular files directly inside the directory are considered;
    sub-directories are ignored so that a folder can never be reported as the
    latest download.

    Args:
        download_dir: Path of the directory to inspect.

    Returns:
        The path of the newest file, built as ``download_dir`` joined with the
        file name, or ``None`` when the directory contains no files.

    Raises:
        OSError: If *download_dir* itself cannot be listed.
    """
    latest_path = None
    latest_mtime = None
    with os.scandir(download_dir) as entries:
        for entry in entries:
            try:
                if not entry.is_file():
                    continue
                mtime = os.path.getmtime(entry.path)
            except OSError:
                # The entry vanished between listing and stat (e.g. the browser
                # renamed a partial download); skip it rather than crash.
                continue
            # Strict '>' keeps the first entry seen when timestamps tie.
            if latest_mtime is None or mtime > latest_mtime:
                latest_path, latest_mtime = entry.path, mtime
    return latest_path

# Month abbreviations matched against the link text
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Years (2007 through 2023) matched against the text of a preceding sibling cell
years = [str(year) for year in range(2007, 2024)]

# Arguments for XPath translate(), used below to lower-case attribute values so the
# class/href comparisons are case-insensitive.
_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
_LOWER = 'abcdefghijklmnopqrstuvwxyz'

try:
    for year in years:
        for month in months:
            # Match an <a> inside a <td> that follows a <td> containing the year,
            # whose class is 'nodec1bold', whose href contains 'loandetailcml',
            # and whose text contains the month abbreviation.
            xpath = (
                f"//td[preceding-sibling::td[contains(., '{year}')]]"
                f"/a[translate(@class, '{_UPPER}', '{_LOWER}')='nodec1bold' "
                f"and contains(translate(@href, '{_UPPER}', '{_LOWER}'), 'loandetailcml') "
                f"and contains(., '{month}')]"
            )
            try:
                # Wait for the link to be clickable and click it
                month_link = WebDriverWait(driver, 60).until(
                    EC.element_to_be_clickable((By.XPATH, xpath))
                )
                month_link.click()
            except TimeoutException:
                print(f"Could not find the clickable link for {month} {year}")
            except NoSuchElementException:
                print(f"Could not find the link for {month} {year}")
            except Exception as e:
                print(f"An unexpected error occurred: {e}")
                # The enclosing `finally` closes the browser. SystemExit replaces the
                # interactive-only `exit()` and reports a non-zero status.
                raise SystemExit(1) from e
            else:
                print(f"Clicked on the link for {month} {year}")
                # Only pause after a successful click; nothing is downloading
                # when the link was not found.
                time.sleep(10)  # Wait for download to complete
finally:
    # Always close the browser, including on Ctrl-C or an unexpected error.
    driver.quit()

print("Extraction complete.")
