In [7]:
import json
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.data_preprocessing import LoanDataPreprocessor
from src.data_validator import DataValidator
from src.config import *


In [8]:
# Load the loan dataset from the project's processed-data folder.
# The path is built relative to the notebook's working directory (the same
# '..' project-root convention used for sys.path and by the save cell) instead
# of a machine-specific absolute path, so the notebook runs on any checkout.
# NOTE(review): this appears to be the same file the save cell later overwrites
# ('../data/processed/loan_data_clean.csv'), so every run re-processes its own
# previous output -- confirm whether a raw source file should be read here.
loan_csv_path = os.path.join('..', 'data', 'processed', 'loan_data_clean.csv')
df = pd.read_csv(loan_csv_path)
print(f"Columns: {list(df.columns)}")
df.head()

Columns: ['id', 'address_state', 'application_type', 'emp_length', 'emp_title', 'grade', 'home_ownership', 'issue_date', 'last_credit_pull_date', 'last_payment_date', 'loan_status', 'next_payment_date', 'member_id', 'purpose', 'sub_grade', 'term', 'verification_status', 'annual_income', 'dti', 'installment', 'int_rate', 'loan_amount', 'total_acc', 'total_payment', 'loan_category', 'issue_year', 'issue_month', 'issue_month_name', 'income_bracket', 'dti_category']


Unnamed: 0,id,address_state,application_type,emp_length,emp_title,grade,home_ownership,issue_date,last_credit_pull_date,last_payment_date,...,int_rate,loan_amount,total_acc,total_payment,loan_category,issue_year,issue_month,issue_month_name,income_bracket,dti_category
0,1077430,GA,INDIVIDUAL,< 1 year,Ryder,C,RENT,2021-05-07,2021-04-12,2021-07-13,...,0.1527,2500.0,4,1009.0,Bad Loan,2021,5,May,<30K,Low Risk
1,1072053,CA,INDIVIDUAL,9 years,MKC Accounting,E,RENT,2021-05-24,2021-09-19,2021-08-28,...,0.1864,3000.0,4,3939.0,Good Loan,2021,5,May,30-50K,Low Risk
2,1069243,CA,INDIVIDUAL,4 years,Chemat Technology Inc,C,RENT,2021-03-18,2021-06-07,2021-01-30,...,0.1596,12000.0,11,3522.0,Bad Loan,2021,3,March,30-50K,High Risk
3,1041756,TX,INDIVIDUAL,< 1 year,barnes distribution,B,MORTGAGE,2021-04-22,2021-03-24,2021-01-08,...,0.1065,4500.0,9,4911.0,Good Loan,2021,4,April,30-50K,Low Risk
4,1068350,IL,INDIVIDUAL,10+ years,J&J Steel Inc,A,MORTGAGE,2021-01-25,2021-06-26,2021-08-12,...,0.0603,3500.0,28,3835.0,Good Loan,2021,1,January,75-100K,Low Risk


In [9]:
# Wrap the loaded frame in a validator and collect its report
validator = DataValidator(df)
validation_report = validator.generate_report()

print("=" * 40)
for key, value in validation_report.items():
    # Build one summary line per report section, then emit it
    if key == 'missing_values':
        # Only strictly positive per-column counts contribute to the total
        missing_count = sum(v for v in value.values() if v > 0)
        report_line = f"Missing Values: {missing_count} total missing entries"
    elif key == 'outliers':
        report_line = f"Outliers detected in {len(value)} columns"
    elif isinstance(value, (list, dict)):
        # Containers are summarised by their size
        report_line = f"{key}: {len(value)}"
    else:
        # Anything else is shown verbatim
        report_line = f"{key}: {value}"
    print(report_line)

missing_columns: 0
Missing Values: 0 total missing entries
data_types: 30
Outliers detected in 4 columns


In [10]:
# Build the preprocessor and drive every pipeline stage in order
print("\nüîß Starting data preprocessing pipeline...")

# The preprocessor is constructed around the frame loaded earlier
preprocessor = LoanDataPreprocessor(df)

# (announcement, preprocessor method name) pairs, in execution order
pipeline_stages = (
    ("Step 1: Cleaning column names...", "clean_column_names"),
    ("Step 2: Handling missing values...", "handle_missing_values"),
    ("Step 3: Converting data types...", "convert_data_types"),
    ("Step 4: Creating derived features...", "create_derived_features"),
    ("Step 5: Removing outliers...", "remove_outliers"),
)

for announcement, method_name in pipeline_stages:
    print(announcement)
    # Look the method up at call time and discard its return value
    getattr(preprocessor, method_name)()

print(" Preprocessing pipeline completed!")



üîß Starting data preprocessing pipeline...
üèóÔ∏è Initialized preprocessor with 35831 records
Step 1: Cleaning column names...
üßπ Cleaning column names...
Step 2: Handling missing values...
üîß Handling missing values...
   üîç Checking issue_date - sample values: ['2021-05-07', '2021-05-24', '2021-03-18', '2021-04-22', '2021-01-25']
   ‚úÖ issue_date: No missing values found
   üîç Checking last_credit_pull_date - sample values: ['2021-04-12', '2021-09-19', '2021-06-07', '2021-03-24', '2021-06-26']
   ‚úÖ last_credit_pull_date: No missing values found
   üîç Checking last_payment_date - sample values: ['2021-07-13', '2021-08-28', '2021-01-30', '2021-01-08', '2021-08-12']
   ‚úÖ last_payment_date: No missing values found
   üîç Checking next_payment_date - sample values: ['2021-12-22', '2021-05-12', '2021-11-28', '2021-07-22', '2021-12-23']
   ‚úÖ next_payment_date: No missing values found
Step 3: Converting data types...
üîÑ Converting data types...
   üìÖ Converting issu

In [11]:
# Cell 5: fetch the processed frame plus the preprocessor's own run summary
print("\nüìã Getting preprocessing summary...")

# Processed dataframe as exposed by the preprocessor
clean_df = preprocessor.get_clean_data()

# Dictionary describing what the pipeline did
summary = preprocessor.get_preprocessing_summary()

print("\nüìä PREPROCESSING SUMMARY:")
print("=" * 40)
print(f"Total Records: {summary['total_records']}")
print(f"Total Columns: {summary['total_columns']}")

# The recorded steps are counted first, then listed with 1-based numbering
recorded_steps = summary['processing_steps']
print(f"Processing Steps Completed: {len(recorded_steps)}")

print("\nüìù Processing Steps:")
for step_number, step_text in enumerate(summary['processing_steps'], start=1):
    print(f"   {step_number}. {step_text}")

print(f"\nüìà Final Data Shape: {clean_df.shape}")


üìã Getting preprocessing summary...

üìä PREPROCESSING SUMMARY:
Total Records: 35274
Total Columns: 30
Processing Steps Completed: 15

üìù Processing Steps:
   1. Column names standardized
   2. Converted issue_date: 35831/35831 successful
   3. Converted last_credit_pull_date: 35831/35831 successful
   4. Converted last_payment_date: 35831/35831 successful
   5. Converted next_payment_date: 35831/35831 successful
   6. Converted annual_income to float64
   7. Converted dti to float64
   8. Converted installment to float64
   9. Converted int_rate to float64
   10. Converted loan_amount to float64
   11. Converted total_payment to float64
   12. Converted total_acc to int64
   13. Extracted date features from issue_date
   14. Created all derived features
   15. Removed 557 outlier records

üìà Final Data Shape: (35274, 30)


In [12]:
# Cell 6: print value distributions for the columns derived by the preprocessor
print("\nüéØ Verifying derived features created by preprocessor...")

# loan_category is reported unconditionally (no column-existence guard)
print("Loan Category Distribution:")
print(clean_df['loan_category'].value_counts())

# Date-derived columns: year span first, then the per-month counts
if 'issue_year' in clean_df.columns:
    issue_years = clean_df['issue_year']
    print(f"\nIssue Year Range: {issue_years.min()} to {issue_years.max()}")
    print("Issue Month Distribution:")
    print(clean_df['issue_month_name'].value_counts())

# Remaining bucketed columns share the same guarded "heading + counts" report
guarded_reports = (
    ('income_bracket', "\nIncome Bracket Distribution:"),
    ('dti_category', "\nDTI Risk Category Distribution:"),
)
for column_name, heading in guarded_reports:
    if column_name in clean_df.columns:
        print(heading)
        print(clean_df[column_name].value_counts())



üéØ Verifying derived features created by preprocessor...
Loan Category Distribution:
loan_category
Good Loan    30400
Bad Loan      4874
Name: count, dtype: int64

Issue Year Range: 2021 to 2021
Issue Month Distribution:
issue_month_name
March        3137
October      3089
August       3029
May          3000
December     2990
July         2983
January      2958
November     2900
June         2871
September    2844
April        2821
February     2652
Name: count, dtype: int64

Income Bracket Distribution:
income_bracket
50-75K     11530
30-50K     10732
75-100K     5836
<30K        4225
>100K       2951
Name: count, dtype: int64

DTI Risk Category Distribution:
dti_category
Medium Risk       17066
Low Risk          11324
High Risk          6884
Very High Risk        0
Name: count, dtype: int64


In [13]:
# Cell 7: Data Quality Checks
# Reports any columns that still contain nulls, summarises how many columns
# use each dtype, and ends by displaying the first rows of the clean frame.
print("\nüîç Post-processing data quality checks...")

# Per-column null counts, narrowed to the columns that still have any
missing_after = clean_df.isnull().sum()
missing_cols = missing_after[missing_after > 0]

if not missing_cols.empty:
    print("‚ö†Ô∏è  Remaining missing values:")
    for col, count in missing_cols.items():
        print(f"   {col}: {count} missing values")
else:
    print("‚úÖ No missing values remaining")

# Count columns per dtype *name*. Counting the dtype objects themselves lists
# every categorical column on its own "category" row, because categorical
# dtypes with different categories do not compare equal; grouping on the
# string name merges them into a single row.
print("\nüìä Data types summary:")
dtype_summary = clean_df.dtypes.astype(str).value_counts()
for dtype, count in dtype_summary.items():
    print(f"   {dtype}: {count} columns")

# Display final data sample (last expression -> rich notebook display)
print("\nüìã Sample of clean data:")
clean_df.head()



üîç Post-processing data quality checks...
‚úÖ No missing values remaining

üìä Data types summary:
   object: 13 columns
   float64: 6 columns
   int64: 5 columns
   datetime64[ns]: 4 columns
   category: 1 columns
   category: 1 columns

üìã Sample of clean data:


Unnamed: 0,id,address_state,application_type,emp_length,emp_title,grade,home_ownership,issue_date,last_credit_pull_date,last_payment_date,...,int_rate,loan_amount,total_acc,total_payment,loan_category,issue_year,issue_month,issue_month_name,income_bracket,dti_category
0,1077430,GA,INDIVIDUAL,< 1 year,Ryder,C,RENT,2021-10-06,2021-10-04,2021-04-03,...,0.1527,2500.0,4,1009.0,Bad Loan,2021,10,October,<30K,Low Risk
1,1072053,CA,INDIVIDUAL,9 years,MKC Accounting,E,RENT,2021-04-12,2021-05-20,2021-11-27,...,0.1864,3000.0,4,3939.0,Good Loan,2021,4,April,30-50K,Low Risk
2,1069243,CA,INDIVIDUAL,4 years,Chemat Technology Inc,C,RENT,2021-08-16,2021-02-10,2021-12-10,...,0.1596,12000.0,11,3522.0,Bad Loan,2021,8,August,30-50K,High Risk
3,1041756,TX,INDIVIDUAL,< 1 year,barnes distribution,B,MORTGAGE,2021-02-15,2021-06-03,2021-03-12,...,0.1065,4500.0,9,4911.0,Good Loan,2021,2,February,30-50K,Low Risk
4,1068350,IL,INDIVIDUAL,10+ years,J&J Steel Inc,A,MORTGAGE,2021-08-09,2021-03-31,2021-03-24,...,0.0603,3500.0,28,3835.0,Good Loan,2021,8,August,75-100K,Low Risk


In [14]:
# Save the clean data: once as the processed dataset and once as the Power BI export.
# NOTE(review): the first target appears to be the same file the load cell reads,
# so saving overwrites this notebook's own input -- confirm that is intended.
for export_path in (
    '../data/processed/loan_data_clean.csv',
    '../data/exports/powerbi_data.csv',
):
    # to_csv does not create missing parent folders, so make sure the target
    # directory exists first (a no-op when it is already there)
    os.makedirs(os.path.dirname(export_path), exist_ok=True)
    clean_df.to_csv(export_path, index=False)

In [15]:
# Save preprocessing summary as JSON for reference.
# `json` is imported in the notebook's top import cell together with the
# other dependencies rather than in the middle of the notebook.
summary_for_json = {
    'total_records': summary['total_records'],
    'total_columns': summary['total_columns'],
    'processing_steps': summary['processing_steps'],
    # dtype objects are not JSON-serialisable, so store their string names
    'data_types': {k: str(v) for k, v in summary['data_types'].items()},
    'missing_values': summary['missing_values']
}

# Explicit UTF-8 so the written file does not depend on the platform default encoding
with open('../data/processed/preprocessing_summary.json', 'w', encoding='utf-8') as f:
    json.dump(summary_for_json, f, indent=2)

# The first two messages refer to the CSV files written by the preceding save cell
print("‚úÖ Clean data saved to: ../data/processed/loan_data_clean.csv")
print("‚úÖ Power BI export saved to: ../data/exports/powerbi_data.csv")
print("‚úÖ Processing summary saved to: ../data/processed/preprocessing_summary.json")

‚úÖ Clean data saved to: ../data/processed/loan_data_clean.csv
‚úÖ Power BI export saved to: ../data/exports/powerbi_data.csv
‚úÖ Processing summary saved to: ../data/processed/preprocessing_summary.json
