In [1]:
import pandas as pd
import numpy as np
import os
import sqlite3
from datetime import datetime


In [2]:
# Root of the project's data directory. It defaults to the original author's
# local folder, but can be redirected without editing the notebook by setting
# the CREDITPATH_DATA_DIR environment variable before starting the kernel.
_DEFAULT_DATA_ROOT = "C://Users//Rachit//OneDrive//Documents//CreditPathAI//data//"
DATA_ROOT = os.environ.get("CREDITPATH_DATA_DIR") or _DEFAULT_DATA_ROOT

# Later cells build file paths with plain string concatenation (RAW_PATH + name),
# so every directory constant must end with a path separator.
if not DATA_ROOT.endswith(("/", "\\")):
    DATA_ROOT += "/"

RAW_PATH = DATA_ROOT + "raw//"
PROCESSED_PATH = DATA_ROOT + "processed//"
DB_PATH = PROCESSED_PATH + "creditpath.db"

# Create the output folder if missing (the SQLite file lives inside it as well).
os.makedirs(PROCESSED_PATH, exist_ok=True)


In [3]:
# Names of the raw CSV files to ingest. os.listdir returns entries in arbitrary
# order, so sort case-insensitively to make every downstream loop (and the
# displayed listing) deterministic across machines and re-runs.
files = sorted(
    (f for f in os.listdir(RAW_PATH) if f.endswith('.csv')),
    key=str.lower,
)
files


['HC_application_train.csv',
 'HC_bureau.csv',
 'HC_bureau_balance.csv',
 'HC_credit_card_balance.csv',
 'HC_installments_payments.csv',
 'HC_POS_CASH_balance.csv',
 'HC_previous_application.csv',
 'HC_sample_submission.csv',
 'HomeCredit_columns_description.csv']

In [4]:
def safe_read_csv(path, nrows=None, encodings=("utf-8", "cp1252", "latin1")):
    """Read a CSV file, trying each candidate encoding until one decodes cleanly.

    The candidates are ordered from strictest to most permissive. ``latin1``
    (of which ``ISO-8859-1`` is an alias) maps every possible byte to a
    character and therefore never raises a decode error, so it must come last:
    placing it earlier would make every encoding after it unreachable.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.
    nrows : int, optional
        Maximum number of data rows to read; ``None`` reads the whole file.
    encodings : iterable of str, optional
        Encodings to attempt, in order.

    Returns
    -------
    pandas.DataFrame
        The frame parsed with the first encoding that succeeds.

    Raises
    ------
    ValueError
        If every candidate encoding fails with ``UnicodeDecodeError``.
    """
    last_error = None

    for enc in encodings:
        try:
            return pd.read_csv(path, nrows=nrows, encoding=enc)
        except UnicodeDecodeError as exc:
            # Remember the failure so the final error can show the root cause.
            last_error = exc

    raise ValueError(f"Unable to read {path} with known encodings") from last_error


In [5]:
# Preview pass: read only the first five rows of each raw file, keyed by file name.
datasets = {
    csv_name: safe_read_csv(RAW_PATH + csv_name, nrows=5)
    for csv_name in files
}

datasets.keys()


dict_keys(['HC_application_train.csv', 'HC_bureau.csv', 'HC_bureau_balance.csv', 'HC_credit_card_balance.csv', 'HC_installments_payments.csv', 'HC_POS_CASH_balance.csv', 'HC_previous_application.csv', 'HC_sample_submission.csv', 'HomeCredit_columns_description.csv'])

In [6]:
# Full pass: load every raw file completely, keyed by file name, then record
# each frame's (rows, columns) shape for a quick size overview.
full_data = {csv_name: safe_read_csv(RAW_PATH + csv_name) for csv_name in files}
shapes = {csv_name: frame.shape for csv_name, frame in full_data.items()}

shapes


{'HC_application_train.csv': (307511, 122),
 'HC_bureau.csv': (1716428, 17),
 'HC_bureau_balance.csv': (27299925, 3),
 'HC_credit_card_balance.csv': (3840312, 23),
 'HC_installments_payments.csv': (13605401, 8),
 'HC_POS_CASH_balance.csv': (10001358, 8),
 'HC_previous_application.csv': (1670214, 37),
 'HC_sample_submission.csv': (48744, 2),
 'HomeCredit_columns_description.csv': (219, 5)}

In [7]:
# Per-table quality summary: row/column counts, total number of missing cells
# across all columns, and the number of fully duplicated rows.
validation_report = {
    table: {
        "rows": frame.shape[0],
        "cols": frame.shape[1],
        "missing_values": frame.isna().sum().sum(),
        "duplicate_rows": frame.duplicated().sum(),
    }
    for table, frame in full_data.items()
}

validation_report


{'HC_application_train.csv': {'rows': 307511,
  'cols': 122,
  'missing_values': 9152465,
  'duplicate_rows': 0},
 'HC_bureau.csv': {'rows': 1716428,
  'cols': 17,
  'missing_values': 3939947,
  'duplicate_rows': 0},
 'HC_bureau_balance.csv': {'rows': 27299925,
  'cols': 3,
  'missing_values': 0,
  'duplicate_rows': 0},
 'HC_credit_card_balance.csv': {'rows': 3840312,
  'cols': 23,
  'missing_values': 5877356,
  'duplicate_rows': 0},
 'HC_installments_payments.csv': {'rows': 13605401,
  'cols': 8,
  'missing_values': 5810,
  'duplicate_rows': 0},
 'HC_POS_CASH_balance.csv': {'rows': 10001358,
  'cols': 8,
  'missing_values': 52158,
  'duplicate_rows': 0},
 'HC_previous_application.csv': {'rows': 1670214,
  'cols': 37,
  'missing_values': 11109336,
  'duplicate_rows': 0},
 'HC_sample_submission.csv': {'rows': 48744,
  'cols': 2,
  'missing_values': 0,
  'duplicate_rows': 0},
 'HomeCredit_columns_description.csv': {'rows': 219,
  'cols': 5,
  'missing_values': 133,
  'duplicate_rows': 0}}

In [8]:
def clean_columns(df):
    """Return ``df`` with normalized column names, leaving the input untouched.

    Each column label is stripped of surrounding whitespace, has spaces and
    hyphens replaced by underscores, and is lower-cased.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        A shallow copy of ``df`` carrying the normalized labels. Only the
        column labels differ from the input.
    """
    normalized = (
        df.columns
        .str.strip()
        # regex=False: treat the patterns as literal text on every pandas version.
        .str.replace(" ", "_", regex=False)
        .str.replace("-", "_", regex=False)
        .str.lower()
    )
    # Relabel a shallow copy instead of assigning to df.columns, so a frame the
    # caller still holds (or displayed earlier) does not change underneath them.
    cleaned = df.copy(deep=False)
    cleaned.columns = normalized
    return cleaned

# Normalize the column names of every loaded table, keeping the file-name keys.
full_data = {table: clean_columns(frame) for table, frame in full_data.items()}


In [10]:
# Write each table to PROCESSED_PATH as "<original stem>_clean.csv".
for name, df in full_data.items():
    # Split off only the trailing extension; str.replace(".csv", ...) would also
    # rewrite any ".csv" that appears earlier in the file name.
    stem, ext = os.path.splitext(name)
    out_name = f"{stem}_clean{ext}"
    df.to_csv(PROCESSED_PATH + out_name, index=False)

"Processed CSVs saved successfully."


'Processed CSVs saved successfully.'

In [11]:
# Number of rows handed to SQLite per batch; keeps pandas from materializing a
# whole multi-million-row table as one insert payload.
SQL_CHUNK_ROWS = 100_000

# Store each table in the SQLite database under its lower-cased file stem,
# replacing any table of the same name left by a previous run.
conn = sqlite3.connect(DB_PATH)
try:
    for name, df in full_data.items():
        # Drop only the trailing extension to form the table name.
        table_name = os.path.splitext(name)[0].lower()
        df.to_sql(
            table_name,
            conn,
            if_exists='replace',
            index=False,
            chunksize=SQL_CHUNK_ROWS,
        )
finally:
    # Always release the database handle, even if one of the writes fails.
    conn.close()

"All tables saved into SQLite DB."


'All tables saved into SQLite DB.'