In this notebook, we load the Lending Club loan data using a config file and perform initial inspection.

**Goals:**
- Verify dataset shape
- View initial rows
- Check for issues before cleaning

In [1]:
# Import all neccesary libraries
import pandas as pd
import yaml

# Load file path from confi.yaml
with open('../config/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Load the dataset
df = pd.read_csv(config['data_path'], compression='gzip')

# Basic Inspection of dataset
print("Dataset loaded successfully")
print("Shape of the dataset (rows, columns):", df.shape)

# Displaying first 5 rows
df.head()

# Displaying all column names
print("Column names:")
print(df.columns.tolist())

# Check column types and non-null counts
print("Data types and non-null counts:")
df.info()

# Show missing value summary
missing = df.isnull().sum()
missing_percent = (missing / len(df)) * 100
missing_df = pd.DataFrame({'Missing Values': missing, 'Missing %': missing_percent})
missing_df = missing_df[missing_df['Missing Values'] > 0]
missing_df.sort_values('Missing %', ascending=False).head(20)


  df = pd.read_csv(config['data_path'], compression='gzip')


Dataset loaded successfully
Shape of the dataset (rows, columns): (2260701, 151)
Column names:
['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line', 'fico_range_low', 'fico_range_high', 'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d', 'last_fico_range_high', 'last_fico_range_low', 'collections_12_mths_ex_med', 'mths_since_last_major_derog', 'policy_code', 'app

Unnamed: 0,Missing Values,Missing %
member_id,2260701,100.0
orig_projected_additional_accrued_interest,2252050,99.617331
hardship_payoff_balance_amount,2249784,99.517097
hardship_last_payment_amount,2249784,99.517097
payment_plan_start_date,2249784,99.517097
hardship_type,2249784,99.517097
hardship_status,2249784,99.517097
hardship_start_date,2249784,99.517097
deferral_term,2249784,99.517097
hardship_amount,2249784,99.517097
