In [2]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
app_train = pd.read_csv("C:/Users/SMRUTI DESHPANDE/house credit default/application_train.csv")
app_test = pd.read_csv("C:/Users/SMRUTI DESHPANDE/house credit default/application_test.csv")
credit_card_balance = pd.read_csv("C:/Users/SMRUTI DESHPANDE/house credit default/credit_card_balance.csv")
bureau = pd.read_csv("C:/Users/SMRUTI DESHPANDE/house credit default/bureau.csv")
previous = pd.read_csv("C:/Users/SMRUTI DESHPANDE/house credit default/previous_application.csv")
installments = pd.read_csv("C:/Users/SMRUTI DESHPANDE/house credit default/installments_payments.csv")

In [3]:
pip install dask[dataframe]

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [4]:
import dask.dataframe as dd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib
import os

# Function to reduce memory usage
def reduce_memory_usage(df):
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:  # Exclude string columns
            if pd.api.types.is_integer_dtype(col_type):
                df[col] = pd.to_numeric(df[col], downcast='integer')
            elif pd.api.types.is_float_dtype(col_type):
                df[col] = pd.to_numeric(df[col], downcast='float')
    return df

app_train = reduce_memory_usage(app_train)
app_test = reduce_memory_usage(app_test)
credit_card_balance = reduce_memory_usage(credit_card_balance)

#columns from credit_card_balance for merging
columns_to_merge = ['SK_ID_CURR', 'AMT_BALANCE', 'SK_DPD']
credit_card_balance_selected = credit_card_balance[columns_to_merge]


app_train_dd = dd.from_pandas(app_train, npartitions=10)
credit_card_balance_dd = dd.from_pandas(credit_card_balance_selected, npartitions=10)
app_test_dd = dd.from_pandas(app_test[['SK_ID_CURR']], npartitions=10)

#Merging DataFrames on SK_ID_CURR using Dask
merged_data_dd = dd.merge(app_train_dd, credit_card_balance_dd, on='SK_ID_CURR', how='left')
merged_data_dd = dd.merge(merged_data_dd, app_test_dd, on='SK_ID_CURR', how='left')

merged_data = merged_data_dd.compute()

#Handle missing values
imputer = SimpleImputer(strategy='mean')
merged_data['AMT_BALANCE'] = imputer.fit_transform(merged_data[['AMT_BALANCE']])
merged_data['SK_DPD'] = SimpleImputer(strategy='median').fit_transform(merged_data[['SK_DPD']])

#binary categorical features to numeric
binary_map = {'Y': 1, 'N': 0, 'M': 0, 'F': 1}
merged_data['FLAG_OWN_CAR'] = merged_data['FLAG_OWN_CAR'].map(binary_map)
merged_data['CODE_GENDER'] = merged_data['CODE_GENDER'].map(binary_map)

#Label encode multi-category columns
multi_category_columns = ['NAME_FAMILY_STATUS', 'NAME_INCOME_TYPE', 'NAME_HOUSING_TYPE', 'NAME_CONTRACT_TYPE']
label_encoders = {}
for col in multi_category_columns:
    le = LabelEncoder()
    merged_data[col] = le.fit_transform(merged_data[col])
    label_encoders[col] = le 

input_parameters = [
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_BALANCE', 'AMT_ANNUITY', 'SK_DPD', 'CNT_CHILDREN',
    'FLAG_OWN_CAR', 'CODE_GENDER', 'DAYS_CREDIT', 'DAYS_DECISION', 'AMT_PAYMENT', 
    'AMT_INSTALMENT', 'AMT_APPLICATION'] + multi_category_columns


available_columns = [col for col in input_parameters if col in merged_data.columns]


numerical_features = [col for col in available_columns if col in [
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_BALANCE', 'AMT_ANNUITY', 'SK_DPD', 
    'CNT_CHILDREN', 'DAYS_CREDIT', 'DAYS_DECISION', 'AMT_PAYMENT', 'AMT_INSTALMENT', 'AMT_APPLICATION']]

# Split data into features and target
training_data = merged_data[available_columns + ['TARGET']]
X = training_data[available_columns]
y = training_data['TARGET']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

save_dir = 'C:\\Users\\SMRUTI DESHPANDE\\house credit default\\'
os.makedirs(save_dir, exist_ok=True) 

binary_map = {'Y': 1, 'N': 0, 'M': 0, 'F': 1}

def calculate_credit_score(input_data):
    # Payment history score
    sk_dpd = input_data.get('SK_DPD', 0) 
    amt_payment = input_data.get('AMT_PAYMENT', 0)
    amt_installment = input_data.get('AMT_INSTALMENT', 0)
    
    # Calculation of payment history score based on SK_DPD, AMT_PAYMENT, and AMT_INSTALMENT
    if amt_installment > 0:
        payment_ratio = min(amt_payment / amt_installment, 1)
    else:
        payment_ratio = 1 
    payment_history_score = max(0, 1 - sk_dpd / 100) * 0.35
    
    # calculation of Credit utilization score based on AMT_BALANCE and AMT_CREDIT
    amt_balance = input_data.get('AMT_BALANCE', 0)
    amt_credit = input_data.get('AMT_CREDIT', 1) 
    credit_utilization_ratio = amt_balance / amt_credit
    credit_utilization_score = max(0, 1 - credit_utilization_ratio) * 0.3
    
    #calculation of Length of credit history score based on DAYS_CREDIT and DAYS_DECISION
    days_credit = input_data.get('DAYS_CREDIT')
    days_decision = input_data.get('DAYS_DECISION')
    if days_credit is not None and days_decision is not None:
        credit_history_length = abs(days_credit - days_decision)
        length_of_credit_history_score = min(1, credit_history_length / 3650) * 0.15  
    else:
        length_of_credit_history_score = 0  
    
    #calculation of Credit mix score based on contract type and credit type
    name_contract_type = input_data.get('NAME_CONTRACT_TYPE', 'Unknown')
    credit_type_score = 0.1 if name_contract_type in ['Cash loans', 'Revolving loans'] else 0
    credit_mix_score = credit_type_score * 0.1 
    
    #calculation of New credit score based on AMT_APPLICATION
    amt_application = input_data.get('AMT_APPLICATION', 0)
    new_credit_score = min(1, amt_application / 500000) * 0.1 
    
    #Calculation of total credit score
    total_credit_score = (payment_history_score + credit_utilization_score +
                          length_of_credit_history_score + credit_mix_score + new_credit_score) * 850 
    
    return int(total_credit_score)

def predict_default(input_data):
    input_data = {k: v for k, v in input_data.items() if k in available_columns}
    
    input_data['FLAG_OWN_CAR'] = binary_map.get(input_data.get('FLAG_OWN_CAR', 'N'), 0)
    input_data['CODE_GENDER'] = binary_map.get(input_data.get('CODE_GENDER', 'M'), 0)

    for col, le in label_encoders.items():
        if col in input_data:
            input_data[col] = le.transform([input_data[col]])[0]

    input_df = pd.DataFrame([input_data])
    input_df[numerical_features] = scaler.transform(input_df[numerical_features])

    is_defaulter = rf_model.predict(input_df)[0]

    credit_score = calculate_credit_score(input_data)
    fico_range = determine_fico_range(credit_score)

    if fico_range in ['Poor', 'Fair']:
        is_defaulter = 1 

    return ("Defaulter" if is_defaulter else "Non-Defaulter"), credit_score, fico_range

# FICO range 
def determine_fico_range(credit_score):
    if credit_score >= 800:
        return "Exceptional"
    elif credit_score >= 740:
        return "Very Good"
    elif credit_score >= 670:
        return "Good"
    elif credit_score >= 580:
        return "Fair"
    else:
        return "Poor"

# Example for prediction
input_example = {
    'AMT_INCOME_TOTAL': 50000,
    'AMT_CREDIT': 200000,
    'AMT_BALANCE': 150000,
    'AMT_ANNUITY': 15000,
    'SK_DPD': 5,
    'CNT_CHILDREN': 1,
    'FLAG_OWN_CAR': 'Y',
    'CODE_GENDER': 'M',
    'NAME_FAMILY_STATUS': 'Married',
    'NAME_INCOME_TYPE': 'Working',
    'NAME_HOUSING_TYPE': 'House / apartment',
    'NAME_CONTRACT_TYPE': 'Cash loans',
    'DAYS_CREDIT': -1000,
    'DAYS_DECISION': -500,
    'AMT_PAYMENT': 5000,
    'AMT_INSTALMENT': 4000,
    'AMT_APPLICATION': 250000
}

result = predict_default(input_example)
print(result)

('Defaulter', 346, 'Poor')


In [13]:
import joblib

# Save model and scaler
joblib.dump(rf_model, 'C:\\Users\\SMRUTI DESHPANDE\\house credit default\\random_forest_model_pipeline.pkl')
joblib.dump(scaler, 'C:\\Users\\SMRUTI DESHPANDE\\house credit default\\scaler.pkl')

label_encoder_NAME_FAMILY_STATUS = LabelEncoder()
label_encoder_NAME_FAMILY_STATUS.fit(merged_data['NAME_FAMILY_STATUS'])

label_encoder_NAME_INCOME_TYPE = LabelEncoder()
label_encoder_NAME_INCOME_TYPE.fit(merged_data['NAME_INCOME_TYPE'])

label_encoder_NAME_HOUSING_TYPE = LabelEncoder()
label_encoder_NAME_HOUSING_TYPE.fit(merged_data['NAME_HOUSING_TYPE'])

label_encoder_NAME_CONTRACT_TYPE = LabelEncoder()
label_encoder_NAME_CONTRACT_TYPE.fit(merged_data['NAME_CONTRACT_TYPE'])

# Save each encoder
joblib.dump(label_encoder_NAME_FAMILY_STATUS, 'C:\\Users\\SMRUTI DESHPANDE\\house credit default\\label_encoder_NAME_FAMILY_STATUS.pkl')
joblib.dump(label_encoder_NAME_INCOME_TYPE, 'C:\\Users\\SMRUTI DESHPANDE\\house credit default\\label_encoder_NAME_INCOME_TYPE.pkl')
joblib.dump(label_encoder_NAME_HOUSING_TYPE, 'C:\\Users\\SMRUTI DESHPANDE\\house credit default\\label_encoder_NAME_HOUSING_TYPE.pkl')
joblib.dump(label_encoder_NAME_CONTRACT_TYPE, 'C:\\Users\\SMRUTI DESHPANDE\\house credit default\\label_encoder_NAME_CONTRACT_TYPE.pkl')


['C:\\Users\\SMRUTI DESHPANDE\\house credit default\\label_encoder_NAME_CONTRACT_TYPE.pkl']

In [14]:
import sklearn
print(sklearn.__version__)

1.5.2
