### I used this script to find the CSV files that contain the column "Beginning Deferred Balance", which apparently is not present in all of the Enhanced Loan-Level Data CSV files.

In [None]:
import pandas as pd
import os

# Folder holding the Enhanced Loan-Level Data CSV files
csv_folder_path = r"C:\Users\Rex Fuentes\Documents\Data Engineering Case Study\citigroup_case_study\enhanced_loan_level_data"  

# Header we are looking for
column_name = "Beginning Deferred Balance"

# Only entries whose name ends in '.csv' are inspected (listing order is kept)
csv_names = [name for name in os.listdir(csv_folder_path) if name.endswith('.csv')]

# A single data row is enough: we only need the parsed header of each file
files_with_column = [
    name
    for name in csv_names
    if column_name in pd.read_csv(os.path.join(csv_folder_path, name), nrows=1).columns
]

# Report every file whose header includes the column
print("Files containing the column '{}':".format(column_name))
for name in files_with_column:
    print(name)


### This script checks whether the column "Beginning Deferred Balance" is empty in every CSV file in which it is present.

In [None]:
import pandas as pd
import os

# Folder holding the Enhanced Loan-Level Data CSV files
csv_folder_path = r"C:\Users\Rex Fuentes\Documents\Data Engineering Case Study\citigroup_case_study\enhanced_loan_level_data"  

# Header we are looking for
column_name = "Beginning Deferred Balance"

# Result lists: files that have the column, and the subset where it holds no values
files_with_column = []
files_with_empty_column = []

for entry in os.listdir(csv_folder_path):
    # Skip anything that is not a CSV file
    if not entry.endswith('.csv'):
        continue

    # Load the complete file so every row of the column can be inspected
    frame = pd.read_csv(os.path.join(csv_folder_path, entry))

    # Files without the column are of no interest here
    if column_name not in frame.columns:
        continue
    files_with_column.append(entry)

    # The column counts as empty when every one of its values is null
    if frame[column_name].isna().all():
        files_with_empty_column.append(entry)

# Report every file whose header includes the column
print("Files containing the column '{}':".format(column_name))
for name in files_with_column:
    print(name)

# Report the files in which that column has no values at all
print("\nFiles with an entirely empty column '{}':".format(column_name))
for name in files_with_empty_column:
    print(name)


### I realized that there may be columns that are present but empty across all files. This is the script that I used to detect those columns.

In [None]:
import pandas as pd
import os
from collections import defaultdict

# Folder holding the Enhanced Loan-Level Data CSV files
csv_folder_path = r"C:\Users\Rex Fuentes\Documents\Data Engineering Case Study\citigroup_case_study\enhanced_loan_level_data"  


# Maps column name -> "still empty in every file seen so far".
# Unknown columns default to True, i.e. they are assumed empty until proven otherwise.
empty_columns = defaultdict(lambda: True)

for entry in os.listdir(csv_folder_path):
    # Skip anything that is not a CSV file
    if not entry.endswith('.csv'):
        continue

    frame = pd.read_csv(os.path.join(csv_folder_path, entry))

    for column in frame.columns:
        # Looking the column up registers it (as empty) the first time it is seen;
        # once a column is known to hold data there is nothing left to check.
        if not empty_columns[column]:
            continue
        # A single non-null value in any file is enough to clear the flag
        if frame[column].notna().any():
            empty_columns[column] = False

# Keep only the columns whose flag was never cleared
entirely_empty_columns = []
for col, is_empty in empty_columns.items():
    if is_empty:
        entirely_empty_columns.append(col)

# Report the columns that are empty in every file
print("Entirely empty columns across all files:")
for col in entirely_empty_columns:
    print(col)


### This is why I decided to drop the columns that are empty across all files before loading these CSV files into the database.

In [None]:
import pandas as pd
from sqlalchemy import create_engine
import os
import configparser

def drop_empty_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* without the columns whose values are all missing.

    The input frame is left untouched; a column survives as soon as it
    holds at least one non-null value.
    """
    return df.dropna(axis="columns", how="all")

# Read database credentials and file directory from a config file.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it parsed, so fail early with a clear message instead of hitting an
# opaque KeyError on the first section lookup below.
if not config.read('config.ini'):
    raise FileNotFoundError("Could not read 'config.ini' from the current working directory.")

# Database connection URL
db_config = config['database']
db_connection_url = f"postgresql+psycopg2://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database']}"

# Create a SQLAlchemy engine
engine = create_engine(db_connection_url)

# Directory where your CSV files are stored
csv_folder_path = config['files']['enhanced_loan_level_dir']

# Collect the CSV files once; sorting makes the load order reproducible
# (os.listdir returns entries in arbitrary order).
csv_paths = sorted(
    os.path.join(csv_folder_path, csv_file)
    for csv_file in os.listdir(csv_folder_path)
    if csv_file.endswith('.csv')
)

# Pass 1: find every column that holds at least one value in at least one file.
# Dropping empty columns file by file would give the files different column
# sets: a column that is empty in the first file but populated in a later one
# would be missing from the table created by the first 'to_sql' call, and the
# later append would fail.
non_empty_columns = set()
for file_path in csv_paths:
    non_empty_columns.update(drop_empty_columns(pd.read_csv(file_path)).columns)

# Pass 2: load each file, dropping only the columns that are empty in ALL files.
# Each file is read twice so that only one file is held in memory at a time.
# NOTE(review): files must otherwise share the same columns; 'to_sql' creates
# the table from the first file loaded.
try:
    for file_path in csv_paths:
        # Use Pandas to load the CSV file
        df = pd.read_csv(file_path)

        # Drop columns that are entirely empty across every file
        df = df.drop(columns=[col for col in df.columns if col not in non_empty_columns])

        # Use 'to_sql' to insert the data into the database, it creates a table if it does not exist
        df.to_sql('enhanced_loan_level_dir', engine, if_exists='append', index=False)
finally:
    # Release the pooled database connections even when a load fails
    engine.dispose()

print("Loading data completed. All empty columns were dropped.")
