In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import os
import pandas as pd
import configparser
from sqlalchemy import create_engine

# Load config.ini (database credentials and directories live there)
config = configparser.ConfigParser()
config.read('config.ini')

# Folder Chrome should save downloads into; raw string keeps the backslashes literal
download_dir = r"C:\Users\Rex Fuentes\Documents\citigroup_case_study\enhanced_lld2"

# Address of the site to open once the browser is running
url = 'https://sf.citidirect.com/stfin/index.html'

# Create the Chrome options object, then attach the download-directory preference
chrome_options = webdriver.ChromeOptions()
prefs = {"download.default_directory": download_dir}
chrome_options.add_experimental_option("prefs", prefs)

# Resolve a chromedriver binary and start the browser with the options above
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(options=chrome_options, service=service)

# Navigate to the site
driver.get(url)

# Wait until the top-level document reports that it has finished loading
WebDriverWait(driver, 60).until(lambda d: d.execute_script('return document.readyState') == 'complete')

# Switch to the frame that contains the 'MBS' link
driver.switch_to.frame("left")

# Each step below waits up to 60 seconds. On a timeout the browser is closed
# and the run is aborted with `raise SystemExit(1)`. The `exit()` helper is
# injected by the `site` module for interactive use and is not a reliable way
# to stop a program (or a notebook cell); a non-zero status also marks the run
# as failed.

# Wait for the 'MBS' link to be clickable and click it
try:
    mbs_link = WebDriverWait(driver, 60).until(EC.element_to_be_clickable((By.ID, 'MBS')))
    mbs_link.click()
except TimeoutException:
    print("Timed out waiting for the 'MBS' link to be clickable.")
    driver.quit()
    raise SystemExit(1)

# Switch back to the main content and then to the frame that contains the '2006-AMC1' link
driver.switch_to.default_content()
try:
    WebDriverWait(driver, 60).until(EC.frame_to_be_available_and_switch_to_it((By.NAME, "main")))
    print("Switched to main frame.")
except TimeoutException:
    print("Timed out waiting for the main frame to be available.")
    driver.quit()
    raise SystemExit(1)

# Click the '2006-AMC1' link
try:
    link_2006_AMC1 = WebDriverWait(driver, 60).until(EC.element_to_be_clickable((By.XPATH, "//a[normalize-space(.)='2006-AMC1']")))
    link_2006_AMC1.click()
except TimeoutException:
    print("Timed out waiting for the '2006-AMC1' link to be clickable.")
    driver.quit()
    raise SystemExit(1)

# Click the loan-detail link for every month of every year in range.
import time  # local import: this cell's import block does not bring `time` into scope

# Month labels as they are matched against the link text
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Years to scan; each is matched against the text of a preceding sibling <td>
years = [str(year) for year in range(2006, 2024)]

# try/finally guarantees the browser is closed however the loop ends
# (normal completion, SystemExit below, KeyboardInterrupt, ...).
try:
    for year in years:
        for month in months:
            # Link inside a <td> that follows a <td> containing the year, whose
            # class is 'nodec1bold' and whose href contains 'loandetailcml'
            # (both compared case-insensitively) and whose text contains the month.
            xpath = (
                f"//td[preceding-sibling::td[contains(., '{year}')]]"
                f"/a[translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')='nodec1bold' "
                f"and contains(translate(@href, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'loandetailcml') "
                f"and contains(., '{month}')]"
            )
            try:
                # Wait for the link to be clickable and click it
                month_link = WebDriverWait(driver, 60).until(EC.element_to_be_clickable((By.XPATH, xpath)))
                month_link.click()
                print(f"Clicked on the PDF link for {month} {year}")
            except TimeoutException:
                print(f"Could not find the clickable PDF link for {month} {year}")
                continue
            except NoSuchElementException:
                print(f"Could not find the link for {month} {year}")
                continue
            except Exception as e:
                print(f"An unexpected error occurred: {e}")
                # Abort the run; the `finally` clause closes the browser.
                raise SystemExit(1) from e

            # Give the download time to finish before the next click (and before
            # the browser is closed after the last one); same pause as the
            # sibling download cell uses.
            time.sleep(10)
finally:
    # Close the browser after the operations are complete (or aborted)
    driver.quit()

# ELLD

In [1]:
import configparser
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
import time

# Load config.ini (database credentials and directories live there)
config = configparser.ConfigParser()
config.read('config.ini')

# Download target for the CSV files, taken from the [files] section
csv_directory = config['files']['enhanced_loan_level_dir2']

# Address of the site to open once the browser is running
url = 'https://sf.citidirect.com/stfin/index.html'

# Selenium setup: point Chrome's default download directory at csv_directory
prefs = {"download.default_directory": csv_directory}
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("prefs", prefs)

# Resolve a chromedriver binary and start the browser with the options above
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(options=chrome_options, service=service)

# Navigate to the site
driver.get(url)

# Web scraping code to download CSV files...
# Wait until the top-level document reports that it has finished loading
WebDriverWait(driver, 60).until(lambda d: d.execute_script('return document.readyState') == 'complete')

# Switch to the frame that contains the 'MBS' link
driver.switch_to.frame("left")

# Each step below waits up to 60 seconds. On a timeout the browser is closed
# and the run is aborted with `raise SystemExit(1)`. The `exit()` helper is
# injected by the `site` module for interactive use and is not a reliable way
# to stop a program (or a notebook cell); a non-zero status also marks the run
# as failed.

# Wait for the 'MBS' link to be clickable and click it
try:
    mbs_link = WebDriverWait(driver, 60).until(EC.element_to_be_clickable((By.ID, 'MBS')))
    mbs_link.click()
except TimeoutException:
    print("Timed out waiting for the 'MBS' link to be clickable.")
    driver.quit()
    raise SystemExit(1)

# Switch back to the main content and then to the frame that contains the '2006-AMC1' link
driver.switch_to.default_content()
try:
    WebDriverWait(driver, 60).until(EC.frame_to_be_available_and_switch_to_it((By.NAME, "main")))
    print("Switched to main frame.")
except TimeoutException:
    print("Timed out waiting for the main frame to be available.")
    driver.quit()
    raise SystemExit(1)

# Click the '2006-AMC1' link
try:
    link_2006_AMC1 = WebDriverWait(driver, 60).until(EC.element_to_be_clickable((By.XPATH, "//a[normalize-space(.)='2006-AMC1']")))
    link_2006_AMC1.click()
except TimeoutException:
    print("Timed out waiting for the '2006-AMC1' link to be clickable.")
    driver.quit()
    raise SystemExit(1)

# Function to get the most recently downloaded file
def get_latest_downloaded_file(download_dir):
    """Return the path of the most recently modified finished file in *download_dir*.

    Only regular files are considered: sub-directories are skipped, and so are
    files whose names end in a typical in-progress download suffix
    (``.crdownload``, ``.tmp``, ``.part``), so a download that is still being
    written is never reported as the latest file.

    Args:
        download_dir: Directory to inspect.

    Returns:
        The path (``download_dir`` joined with the file name) of the newest
        matching file by modification time, or ``None`` if there is none.

    Raises:
        OSError: If *download_dir* cannot be listed.
    """
    # Local import: the cell defining this function does not import `os` itself.
    import os

    in_progress_suffixes = ('.crdownload', '.tmp', '.part')
    candidates = [
        entry.path
        for entry in os.scandir(download_dir)
        if entry.is_file() and not entry.name.lower().endswith(in_progress_suffixes)
    ]
    if not candidates:
        return None
    return max(candidates, key=os.path.getmtime)

# Click the loan-detail link for every month of every year in range.
# `NoSuchElementException` is referenced in a handler below but is not among
# this cell's imports, so bring it into scope here.
from selenium.common.exceptions import NoSuchElementException

# Month labels as they are matched against the link text
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Years to scan; each is matched against the text of a preceding sibling <td>
years = [str(year) for year in range(2007, 2023)]

# try/finally guarantees the browser is closed however the loop ends
# (normal completion, SystemExit below, KeyboardInterrupt, ...).
try:
    for year in years:
        for month in months:
            # Link inside a <td> that follows a <td> containing the year, whose
            # class is 'nodec1bold' and whose href contains 'loandetailcml'
            # (both compared case-insensitively) and whose text contains the month.
            xpath = (
                f"//td[preceding-sibling::td[contains(., '{year}')]]"
                f"/a[translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')='nodec1bold' "
                f"and contains(translate(@href, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'loandetailcml') "
                f"and contains(., '{month}')]"
            )
            try:
                # Wait for the link to be clickable and click it
                month_link = WebDriverWait(driver, 60).until(EC.element_to_be_clickable((By.XPATH, xpath)))
                month_link.click()
                print(f"Clicked on the link for {month} {year}")
            except TimeoutException:
                print(f"Could not find the clickable link for {month} {year}")
                continue  # nothing was clicked, so there is no download to wait for
            except NoSuchElementException:
                print(f"Could not find the link for {month} {year}")
                continue
            except Exception as e:
                print(f"An unexpected error occurred: {e}")
                # Abort the run; the `finally` clause closes the browser.
                raise SystemExit(1) from e

            time.sleep(10)  # Wait for download to complete
finally:
    driver.quit()

print("Extraction complete.")


Switched to main frame.
Could not find the clickable link for Jan 2007
Could not find the clickable link for Feb 2007
Could not find the clickable link for Mar 2007
Could not find the clickable link for Apr 2007
Could not find the clickable link for May 2007
Could not find the clickable link for Jun 2007
Clicked on the link for Jul 2007
Clicked on the link for Aug 2007
Clicked on the link for Sep 2007
Clicked on the link for Oct 2007


KeyboardInterrupt: 

In [None]:
 # NOTE(review): this cell is a pasted fragment, not runnable on its own. The
 # first statement is indented by one space and the rest by eight, and the code
 # reads `month` and `year`, so it presumably belongs inside the month loop of
 # the download cell, right after a link has been clicked. TODO confirm placement.
 time.sleep(20)  # Wait for download to complete

        # Identify the most recently downloaded file
        # (whatever entry get_latest_downloaded_file reports as newest in csv_directory)
        downloaded_file = get_latest_downloaded_file(csv_directory)
        if downloaded_file:
            # Construct new filename with original name, month, and year:
            # "<stem>_<month>_<year><extension>"
            file_base_name = os.path.basename(downloaded_file)
            new_filename = f"{os.path.splitext(file_base_name)[0]}_{month}_{year}{os.path.splitext(file_base_name)[1]}"
            new_file_path = os.path.join(csv_directory, new_filename)

            # Rename the file
            # NOTE(review): if the click produced no new file, the newest entry is
            # the previous download, which would then be renamed a second time; verify.
            os.rename(downloaded_file, new_file_path)
        else:
            print(f"No file was downloaded for {month} {year}")
        


# SQLAlchemy connection URL
# The user name and password are percent-encoded before being placed in the
# URL: characters such as '@', ':' or '/' in a credential would otherwise be
# parsed as URL delimiters and produce a wrong host/user/password split.
from urllib.parse import quote

_db = config['database']
connection_url = (
    f"postgresql://{quote(_db['user'], safe='')}:{quote(_db['password'], safe='')}"
    f"@{_db['host']}:{_db['port']}/{_db['database']}"
)

# Create SQLAlchemy engine
engine = create_engine(connection_url)

# Function to extract month and year from filename
def extract_month_year(filename):
    """Return the ``(month, year)`` tokens encoded at the end of *filename*.

    File names are expected to end in ``_<month>_<year><ext>``, for example
    ``data_January_2022.csv``. The last two underscore-separated tokens are
    used, so base names that themselves contain underscores (for example
    ``loan_detail_Jan_2007.csv``) are parsed correctly as well.

    Args:
        filename: File name (not a full path) to parse.

    Returns:
        A ``(month, year)`` tuple of strings, with everything from the first
        ``.`` onward stripped from the year token, or ``(None, None)`` when the
        name has fewer than three underscore-separated parts.
    """
    parts = filename.split('_')
    if len(parts) < 3:
        return None, None
    month = parts[-2]
    year = parts[-1].split('.')[0]  # drop the extension, e.g. '.csv'
    return month, year

# Load every CSV found in csv_directory into the 'loan_level_data' table
for filename in os.listdir(csv_directory):
    # Anything that is not a .csv file is ignored
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(csv_directory, filename)

    # Parse the file into a DataFrame
    df = pd.read_csv(file_path)

    # Tag the rows with the month/year taken from the file name, when both were found
    month, year = extract_month_year(filename)
    if month and year:
        df = df.assign(month_listed=month, year_listed=year)

    # Append to PostgreSQL; a failure is reported and the loop moves on to the next file
    try:
        df.to_sql('loan_level_data', engine, if_exists='append', index=False)
        print(f"Loaded {filename} into the database.")
    except Exception as e:
        print(f"Error loading {filename}: {e}")



In [None]:
import pandas as pd
from sqlalchemy import create_engine
import os
import configparser

# Load every CSV in the configured folder into the 'loan_level_data' table.
from urllib.parse import quote

# Read database credentials and file directory from a config file
config = configparser.ConfigParser()
config.read('config.ini')

# Database connection URL.
# The user name and password are percent-encoded: characters such as '@', ':'
# or '/' in a credential would otherwise be parsed as URL delimiters.
_db = config['database']
db_connection_url = (
    f"postgresql+psycopg2://{quote(_db['user'], safe='')}:{quote(_db['password'], safe='')}"
    f"@{_db['host']}:{_db['port']}/{_db['database']}"
)

# Create a SQLAlchemy engine
engine = create_engine(db_connection_url)

# Directory where your CSV files are stored
csv_folder_path = config['files']['loan_level_data_download_dir']

# Iterate over each file in the folder and insert data into the database.
# try/finally releases the engine's pooled connections even if a file fails.
try:
    for csv_file in os.listdir(csv_folder_path):
        if not csv_file.endswith('.csv'):
            continue
        file_path = os.path.join(csv_folder_path, csv_file)

        # Use Pandas to load the CSV file
        df = pd.read_csv(file_path)

        # Add a new column with the filename
        df['filename'] = csv_file

        # Use 'to_sql' to insert the data into the database, it creates a table if it does not exist
        df.to_sql('loan_level_data', engine, if_exists='append', index=False)
finally:
    engine.dispose()


# Using the function convert_column_types

In [None]:
import pandas as pd
from sqlalchemy import create_engine
import os
import configparser
import numpy as np

def convert_column_types(df):
    """Convert wholly numeric columns of *df* to a numeric dtype, in place.

    A column is converted only when every non-null value in it parses as a
    number. Columns that contain any non-numeric value are left exactly as
    they are, so text columns never end up as a mix of floats and strings
    (which would, for instance, turn an identifier like ``'007'`` into ``7.0``).

    Args:
        df: DataFrame to convert. It is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    for col in df.columns:
        numeric = pd.to_numeric(df[col], errors='coerce')
        # Coercion turns unparseable values into NaN; a NaN that was not
        # already missing in the source means the column is not purely numeric.
        introduced_nan = numeric.isna() & df[col].notna()
        if not introduced_nan.any():
            df[col] = numeric
    return df

# Load every CSV in the configured folder into 'loan_level_data' after
# numeric type conversion.
from urllib.parse import quote

# Read database credentials and file directory from a config file
config = configparser.ConfigParser()
config.read('config.ini')

# Database connection URL.
# The user name and password are percent-encoded: characters such as '@', ':'
# or '/' in a credential would otherwise be parsed as URL delimiters.
_db = config['database']
db_connection_url = (
    f"postgresql+psycopg2://{quote(_db['user'], safe='')}:{quote(_db['password'], safe='')}"
    f"@{_db['host']}:{_db['port']}/{_db['database']}"
)

# Create a SQLAlchemy engine
engine = create_engine(db_connection_url)

# Directory where your CSV files are stored
csv_folder_path = config['files']['loan_level_data_download_dir']

# Iterate over each file in the folder and insert data into the database.
# try/finally makes sure the engine is disposed even when reading or
# converting a file raises (insert errors are caught per file below).
try:
    for csv_file in os.listdir(csv_folder_path):
        if not csv_file.endswith('.csv'):
            continue
        file_path = os.path.join(csv_folder_path, csv_file)
        df = pd.read_csv(file_path)

        # Convert data types and handle NaN values
        df = convert_column_types(df)
        df = df.where(pd.notnull(df), None)

        # Add a new column with the filename
        df['source_filename'] = csv_file

        # Insert data into the database; report a failure and continue with the next file
        try:
            df.to_sql('loan_level_data', engine, if_exists='append', index=False, method='multi')
        except Exception as e:
            print(f"Error inserting file {csv_file}: {e}")
finally:
    # Close the engine
    engine.dispose()


In [None]:
import pandas as pd
from sqlalchemy import create_engine
import os
import configparser

# Load every CSV in the configured folder into the 'enhanced_loan_level_dir' table.
from urllib.parse import quote

# Read database credentials and file directory from a config file
config = configparser.ConfigParser()
config.read('config.ini')

# Database connection URL.
# The user name and password are percent-encoded: characters such as '@', ':'
# or '/' in a credential would otherwise be parsed as URL delimiters.
_db = config['database']
db_connection_url = (
    f"postgresql+psycopg2://{quote(_db['user'], safe='')}:{quote(_db['password'], safe='')}"
    f"@{_db['host']}:{_db['port']}/{_db['database']}"
)

# Create a SQLAlchemy engine
engine = create_engine(db_connection_url)

# Directory where your CSV files are stored
csv_folder_path = config['files']['enhanced_loan_level_dir']

# Iterate over each file in the folder and insert data into the database.
# try/finally releases the engine's pooled connections even if a file fails.
try:
    for csv_file in os.listdir(csv_folder_path):
        if not csv_file.endswith('.csv'):
            continue
        file_path = os.path.join(csv_folder_path, csv_file)

        # Use Pandas to load the CSV file. Column dtypes are inferred by
        # pandas here; no `dtype` is passed, so values are NOT forced to text.
        df = pd.read_csv(file_path)

        # Use 'to_sql' to insert the data into the database, it creates a table if it does not exist
        df.to_sql('enhanced_loan_level_dir', engine, if_exists='append', index=False)
finally:
    engine.dispose()


In [None]:
import pandas as pd
import os

# Folder holding the CSV files to inspect
csv_folder_path = r"C:\Users\Rex Fuentes\Documents\citigroup_case_study\enhanced_loan_level_data"

# Header to look for
column_name = "Beginning Deferred Balance"

# Names of the files whose header includes column_name
files_with_column = []

# Scan the folder, skipping anything that is not a .csv file
for csv_file in os.listdir(csv_folder_path):
    if not csv_file.endswith('.csv'):
        continue

    file_path = os.path.join(csv_folder_path, csv_file)

    # One data row is enough: only the parsed column names are needed
    df = pd.read_csv(file_path, nrows=1)

    has_column = column_name in df.columns
    if has_column:
        files_with_column.append(csv_file)

# Report the matching files, one per line
print("Files containing the column '{}':".format(column_name))
for file in files_with_column:
    print(file)


In [None]:
import pandas as pd
import os

# Folder holding the CSV files to inspect
csv_folder_path = r"C:\Users\Rex Fuentes\Documents\citigroup_case_study\enhanced_loan_level_data"

# Header to look for
column_name = "Beginning Deferred Balance"

# files_with_column: files whose header includes column_name
# files_with_empty_column: the subset where that column holds no non-null value
files_with_column = []
files_with_empty_column = []

# Scan the folder, skipping anything that is not a .csv file
for csv_file in os.listdir(csv_folder_path):
    if not csv_file.endswith('.csv'):
        continue

    file_path = os.path.join(csv_folder_path, csv_file)

    # The whole file is read because every value of the column must be checked
    df = pd.read_csv(file_path)

    if column_name not in df.columns:
        continue
    files_with_column.append(csv_file)

    # "Entirely empty" means no row carries a non-null value
    if not df[column_name].notnull().any():
        files_with_empty_column.append(csv_file)

# Report the files that contain the column
print("Files containing the column '{}':".format(column_name))
for file in files_with_column:
    print(file)

# Report the files where the column is entirely empty
print("\nFiles with an entirely empty column '{}':".format(column_name))
for file in files_with_empty_column:
    print(file)


In [None]:
import pandas as pd
import os

# Folder holding the CSV files to inspect
csv_folder_path = r"C:\Users\Rex Fuentes\Documents\citigroup_case_study\enhanced_loan_level_data"

# Header to look for
column_name = "Ending Deferred Balance"

# Stays True until a file is found in which the column has a non-null value
is_column_empty_in_all_files = True

# Scan the folder, skipping anything that is not a .csv file
for csv_file in os.listdir(csv_folder_path):
    if not csv_file.endswith('.csv'):
        continue

    file_path = os.path.join(csv_folder_path, csv_file)

    # The whole file is read because every value of the column must be checked
    df = pd.read_csv(file_path)

    # Files without the column do not affect the result
    if column_name not in df.columns:
        continue

    # One non-null value anywhere settles the question; stop scanning
    if df[column_name].notnull().any():
        is_column_empty_in_all_files = False
        break

# Report the outcome
if not is_column_empty_in_all_files:
    print(f"There are files with non-empty values in the '{column_name}' column.")
else:
    print(f"All files with the column '{column_name}' have it entirely empty.")


In [None]:
import pandas as pd
import os
from collections import defaultdict

# Folder holding the CSV files to inspect
csv_folder_path = r"C:\Users\Rex Fuentes\Documents\citigroup_case_study\enhanced_loan_level_data"

# Maps column name -> "still empty in every file seen so far".
# A column is registered as True the first time it is looked up.
empty_columns = defaultdict(lambda: True)

# Scan the folder, skipping anything that is not a .csv file
for csv_file in os.listdir(csv_folder_path):
    if not csv_file.endswith('.csv'):
        continue

    file_path = os.path.join(csv_folder_path, csv_file)

    # The whole file is read because every value of each column must be checked
    df = pd.read_csv(file_path)

    for column in df.columns:
        # Already known to hold data somewhere: nothing more to learn.
        # (The lookup itself registers a previously unseen column as empty.)
        if not empty_columns[column]:
            continue
        # A single non-null value in this file marks the column as non-empty
        if df[column].notnull().any():
            empty_columns[column] = False

# Keep only the columns that never held a value in any file
entirely_empty_columns = [col for col, is_empty in empty_columns.items() if is_empty]

# Report them, one per line
print("Entirely empty columns across all files:")
for col in entirely_empty_columns:
    print(col)


In [None]:
import pandas as pd
import os
from collections import defaultdict

# Folder holding the CSV files to inspect
csv_folder_path = r"C:\Users\Rex Fuentes\Documents\citigroup_case_study\loan_level_data"

# Maps column name -> "still empty in every file seen so far".
# A column is registered as True the first time it is looked up.
empty_columns = defaultdict(lambda: True)

# Scan the folder, skipping anything that is not a .csv file
for csv_file in os.listdir(csv_folder_path):
    if not csv_file.endswith('.csv'):
        continue

    file_path = os.path.join(csv_folder_path, csv_file)

    # The whole file is read because every value of each column must be checked
    df = pd.read_csv(file_path)

    for column in df.columns:
        # Already known to hold data somewhere: nothing more to learn.
        # (The lookup itself registers a previously unseen column as empty.)
        if not empty_columns[column]:
            continue
        # A single non-null value in this file marks the column as non-empty
        if df[column].notnull().any():
            empty_columns[column] = False

# Keep only the columns that never held a value in any file
entirely_empty_columns = [col for col, is_empty in empty_columns.items() if is_empty]

# Report them, one per line
print("Entirely empty columns across all files:")
for col in entirely_empty_columns:
    print(col)


In [None]:
import pandas as pd
from sqlalchemy import create_engine
import os
import configparser

# Helper that removes columns containing no data at all
def drop_empty_columns(df):
    """Return a copy of *df* without the columns whose values are all missing.

    The input DataFrame is not modified.
    """
    without_empty = df.dropna(axis="columns", how="all")
    return without_empty

# Load every CSV in the configured folder into 'enhanced_loan_level_dir',
# dropping columns that are entirely empty in each file.
from urllib.parse import quote

# Read database credentials and file directory from a config file
config = configparser.ConfigParser()
config.read('config.ini')

# Database connection URL.
# The user name and password are percent-encoded: characters such as '@', ':'
# or '/' in a credential would otherwise be parsed as URL delimiters.
_db = config['database']
db_connection_url = (
    f"postgresql+psycopg2://{quote(_db['user'], safe='')}:{quote(_db['password'], safe='')}"
    f"@{_db['host']}:{_db['port']}/{_db['database']}"
)

# Create a SQLAlchemy engine
engine = create_engine(db_connection_url)

# Directory where your CSV files are stored
csv_folder_path = config['files']['enhanced_loan_level_dir']

# Iterate over each file in the folder and insert data into the database.
# try/finally releases the engine's pooled connections even if a file fails.
try:
    for csv_file in os.listdir(csv_folder_path):
        if not csv_file.endswith('.csv'):
            continue
        file_path = os.path.join(csv_folder_path, csv_file)

        # Use Pandas to load the CSV file
        df = pd.read_csv(file_path)

        # Drop columns that are entirely empty (decided per file)
        df = drop_empty_columns(df)

        # Use 'to_sql' to insert the data into the database, it creates a table if it does not exist
        df.to_sql('enhanced_loan_level_dir', engine, if_exists='append', index=False)
finally:
    engine.dispose()

print("Loading data completed. All empty columns were dropped.")


In [None]:
import pandas as pd
from sqlalchemy import create_engine
import os
import configparser

# Helper that removes columns containing no data at all
def drop_empty_columns(df):
    """Return a copy of *df* without the columns whose values are all missing.

    The input DataFrame is not modified.
    """
    without_empty = df.dropna(axis="columns", how="all")
    return without_empty

# Load every CSV in the configured folder into 'loan_level_data',
# dropping columns that are entirely empty in each file.
from urllib.parse import quote

# Read database credentials and file directory from a config file
config = configparser.ConfigParser()
config.read('config.ini')

# Database connection URL.
# The user name and password are percent-encoded: characters such as '@', ':'
# or '/' in a credential would otherwise be parsed as URL delimiters.
_db = config['database']
db_connection_url = (
    f"postgresql+psycopg2://{quote(_db['user'], safe='')}:{quote(_db['password'], safe='')}"
    f"@{_db['host']}:{_db['port']}/{_db['database']}"
)

# Create a SQLAlchemy engine
engine = create_engine(db_connection_url)

# Directory where your CSV files are stored
csv_folder_path = config['files']['loan_level_data_download_dir']

# Iterate over each file in the folder and insert data into the database.
# try/finally releases the engine's pooled connections even if a file fails.
try:
    for csv_file in os.listdir(csv_folder_path):
        if not csv_file.endswith('.csv'):
            continue
        file_path = os.path.join(csv_folder_path, csv_file)

        # Use Pandas to load the CSV file
        df = pd.read_csv(file_path)

        # Drop columns that are entirely empty (decided per file)
        df = drop_empty_columns(df)

        # Use 'to_sql' to insert the data into the database, it creates a table if it does not exist
        df.to_sql('loan_level_data', engine, if_exists='append', index=False)
finally:
    engine.dispose()

print("Loading data completed. All empty columns were dropped.")
