In [1]:
pip install catboost dask[dataframe]


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import os
from dotenv import load_dotenv
import gdown
import requests
import dask.dataframe as dd


# Load environment variables from the .env file
load_dotenv()

# Function to construct Google Drive direct download link
def get_google_drive_url(file_id):
    return f"https://drive.google.com/uc?id={file_id}"

# Get file IDs from the .env file
file_ids = {
    "application_train": os.getenv("APPLICATION_TRAIN_ID"),
    "application_test": os.getenv("APPLICATION_TEST_ID"),
    "bureau": os.getenv("BUREAU_ID"),
    "bureau_balance": os.getenv("BUREAU_BALANCE_ID"),
    "credit_card_balance": os.getenv("CREDIT_CARD_BALANCE_ID"),
    "installments_payments": os.getenv("INSTALLMENTS_PAYMENTS_ID"),
    "previous_application": os.getenv("PREVIOUS_APPLICATION_ID"),
    "POS_CASH_balance": os.getenv("POS_CASH_BALANCE_ID"),
}

# Construct direct download links
google_drive_links = {key: get_google_drive_url(value) for key, value in file_ids.items()}

# Function to download a file using gdown, only if not already downloaded
def download_csv(file_url, output_path):
    if not os.path.exists(output_path):  # Check if file already exists
        print(f"Downloading {output_path}...")
        try:
            gdown.download(file_url, output_path, quiet=False)
            print(f"Downloaded {output_path}")
        except requests.exceptions.RequestException as e:
            print(f"Error downloading {file_url}: {e}")
    else:
        print(f"{output_path} already exists.")

# Define the output file paths
output_paths = {
    "application_train": "application_train.csv",
    "application_test": "application_test.csv",
    "bureau": "bureau.csv",
    "bureau_balance": "bureau_balance.csv",
    "credit_card_balance": "credit_card_balance.csv",
    "installments_payments": "installments_payments.csv",
    "previous_application": "previous_application.csv",
    "POS_CASH_balance": "POS_CASH_balance.csv"
}

# Download the datasets
for key, file_url in google_drive_links.items():
    download_csv(file_url, output_paths[key])

# Load datasets from local files
try:
    app_train = dd.read_csv(output_paths["application_train"], on_bad_lines='skip')
    app_test = dd.read_csv(output_paths["application_test"], on_bad_lines='skip')
    bureau = dd.read_csv(output_paths["bureau"], on_bad_lines='skip')
    bureau_balance = dd.read_csv(output_paths["bureau_balance"], on_bad_lines='skip')
    credit_card_balance = dd.read_csv(output_paths["credit_card_balance"], on_bad_lines='skip')
    installments_payments = dd.read_csv(output_paths["installments_payments"], on_bad_lines='skip')
    previous_application = dd.read_csv(output_paths["previous_application"], on_bad_lines='skip')
    POS_CASH_balance = dd.read_csv(output_paths["POS_CASH_balance"], on_bad_lines='skip')
except Exception as e:
    print(f"Error loading CSV files: {e}")

# Example: Print the first few rows of the application_train dataset
print(app_train.head())


application_train.csv already exists.
application_test.csv already exists.
bureau.csv already exists.
bureau_balance.csv already exists.
credit_card_balance.csv already exists.
installments_payments.csv already exists.
previous_application.csv already exists.
POS_CASH_balance.csv already exists.
   SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR  \
0      100002       1         Cash loans           M            N   
1      100003       0         Cash loans           F            N   
2      100004       0    Revolving loans           M            Y   
3      100006       0         Cash loans           F            N   
4      100007       0         Cash loans           M            N   

  FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
0               Y             0          202500.0    406597.5      24700.5   
1               N             0          270000.0   1293502.5      35698.5   
2               Y             0           67500.0    1

In [4]:
import dask.dataframe as dd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
import joblib
import os
import pandas as pd

# Function to reduce memory usage
def reduce_memory_usage(df):
    # Check if the input is a Dask DataFrame
    if isinstance(df, dd.DataFrame):
        # Compute the Dask DataFrame to bring it into memory as a pandas DataFrame
        df = df.compute()
    
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:  # Exclude string columns
            if pd.api.types.is_integer_dtype(col_type):
                df[col] = pd.to_numeric(df[col], downcast='integer', errors='ignore')
            elif pd.api.types.is_float_dtype(col_type):
                df[col] = pd.to_numeric(df[col], downcast='float', errors='ignore')
    return df

# Ensure app_train, app_test, and credit_card_balance are pandas DataFrames
app_train = app_train.persist() if isinstance(app_train, dd.DataFrame) else app_train
app_test = app_test.persist() if isinstance(app_test, dd.DataFrame) else app_test
credit_card_balance = credit_card_balance.persist() if isinstance(credit_card_balance, dd.DataFrame) else credit_card_balance

# Reduce memory usage for the datasets after ensuring they are pandas DataFrames
app_train = reduce_memory_usage(app_train)
app_test = reduce_memory_usage(app_test)
credit_card_balance = reduce_memory_usage(credit_card_balance)

# Columns from credit_card_balance for merging
columns_to_merge = ['SK_ID_CURR', 'AMT_BALANCE', 'SK_DPD']
credit_card_balance_selected = credit_card_balance[columns_to_merge]

# Convert to Dask DataFrame with multiple partitions
app_train_dd = dd.from_pandas(app_train, npartitions=10)
credit_card_balance_dd = dd.from_pandas(credit_card_balance_selected, npartitions=10)
app_test_dd = dd.from_pandas(app_test[['SK_ID_CURR']], npartitions=10)

# Merging DataFrames on SK_ID_CURR using Dask
merged_data_dd = dd.merge(app_train_dd, credit_card_balance_dd, on='SK_ID_CURR', how='left')
merged_data_dd = dd.merge(merged_data_dd, app_test_dd, on='SK_ID_CURR', how='left')

# Compute Dask DataFrame into a pandas DataFrame
merged_data = merged_data_dd.compute()

# Handle missing values
imputer = SimpleImputer(strategy='mean')
merged_data['AMT_BALANCE'] = imputer.fit_transform(merged_data[['AMT_BALANCE']])
merged_data['SK_DPD'] = SimpleImputer(strategy='median').fit_transform(merged_data[['SK_DPD']])

# Binary categorical features to numeric
binary_map = {'Y': 1, 'N': 0, 'M': 0, 'F': 1}
merged_data['FLAG_OWN_CAR'] = merged_data['FLAG_OWN_CAR'].map(binary_map)
merged_data['CODE_GENDER'] = merged_data['CODE_GENDER'].map(binary_map)

# Label encode multi-category columns
multi_category_columns = ['NAME_FAMILY_STATUS', 'NAME_INCOME_TYPE', 'NAME_HOUSING_TYPE', 'NAME_CONTRACT_TYPE']
label_encoders = {}
for col in multi_category_columns:
    le = LabelEncoder()
    merged_data[col] = le.fit_transform(merged_data[col])
    label_encoders[col] = le

# Define the features for model input
input_parameters = [
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_BALANCE', 'AMT_ANNUITY', 'SK_DPD', 'CNT_CHILDREN',
    'FLAG_OWN_CAR', 'CODE_GENDER', 'DAYS_CREDIT', 'DAYS_DECISION', 'AMT_PAYMENT', 
    'AMT_INSTALMENT', 'AMT_APPLICATION'] + multi_category_columns

# Available columns based on the data
available_columns = [col for col in input_parameters if col in merged_data.columns]

# List of numerical features for scaling
numerical_features = [col for col in available_columns if col in [
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_BALANCE', 'AMT_ANNUITY', 'SK_DPD', 
    'CNT_CHILDREN', 'DAYS_CREDIT', 'DAYS_DECISION', 'AMT_PAYMENT', 'AMT_INSTALMENT', 'AMT_APPLICATION']]

# Split data into features and target
training_data = merged_data[available_columns + ['TARGET']]
X = training_data[available_columns]
y = training_data['TARGET']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling numerical features
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# CatBoost Classifier model
catboost_model = CatBoostClassifier(
    iterations=1000, 
    depth=10, 
    learning_rate=0.05, 
    loss_function='Logloss', 
    cat_features=[i for i, col in enumerate(X_train.columns) if X_train[col].dtype == 'object'],
    verbose=100  # Adjust this as needed; set to 100 for less frequent logging
)
catboost_model.fit(X_train, y_train)

# Save directory setup
save_dir = 'C:\\Users\\SMRUTI DESHPANDE\\house credit default\\'
os.makedirs(save_dir, exist_ok=True)

# Function to calculate credit score (unchanged)
def calculate_credit_score(input_data):
    sk_dpd = input_data.get('SK_DPD', 0) 
    amt_payment = input_data.get('AMT_PAYMENT', 0)
    amt_installment = input_data.get('AMT_INSTALMENT', 0)
    
    if amt_installment > 0:
        payment_ratio = min(amt_payment / amt_installment, 1)
    else:
        payment_ratio = 1 
    payment_history_score = max(0, 1 - sk_dpd / 100) * 0.35
    
    amt_balance = input_data.get('AMT_BALANCE', 0)
    amt_credit = input_data.get('AMT_CREDIT', 1) 
    credit_utilization_ratio = amt_balance / amt_credit
    credit_utilization_score = max(0, 1 - credit_utilization_ratio) * 0.3
    
    days_credit = input_data.get('DAYS_CREDIT')
    days_decision = input_data.get('DAYS_DECISION')
    if days_credit is not None and days_decision is not None:
        credit_history_length = abs(days_credit - days_decision)
        length_of_credit_history_score = min(1, credit_history_length / 3650) * 0.15  
    else:
        length_of_credit_history_score = 0  
    
    name_contract_type = input_data.get('NAME_CONTRACT_TYPE', 'Unknown')
    credit_type_score = 0.1 if name_contract_type in ['Cash loans', 'Revolving loans'] else 0
    credit_mix_score = credit_type_score * 0.1 
    
    amt_application = input_data.get('AMT_APPLICATION', 0)
    new_credit_score = min(1, amt_application / 500000) * 0.1 
    
    total_credit_score = (payment_history_score + credit_utilization_score +
                          length_of_credit_history_score + credit_mix_score + new_credit_score) * 850 
    
    return int(total_credit_score)

# FICO range function (unchanged)
def determine_fico_range(credit_score):
    if credit_score >= 800:
        return "Exceptional"
    elif credit_score >= 740:
        return "Very Good"
    elif credit_score >= 670:
        return "Good"
    elif credit_score >= 580:
        return "Fair"
    else:
        return "Poor"
    
# Prediction function using CatBoost
def predict_default(input_data):
    input_data = {k: v for k, v in input_data.items() if k in available_columns}
    
    input_data['FLAG_OWN_CAR'] = binary_map.get(input_data.get('FLAG_OWN_CAR', 'N'), 0)
    input_data['CODE_GENDER'] = binary_map.get(input_data.get('CODE_GENDER', 'M'), 0)

    for col, le in label_encoders.items():
        if col in input_data:
            input_data[col] = le.transform([input_data[col]])[0]

    input_df = pd.DataFrame([input_data])
    input_df[numerical_features] = scaler.transform(input_df[numerical_features])

    is_defaulter = catboost_model.predict(input_df)[0]

    credit_score = calculate_credit_score(input_data)
    fico_range = determine_fico_range(credit_score)

    if fico_range in ['Poor', 'Fair']:
        is_defaulter = 1 

    return ("Defaulter" if is_defaulter else "Non-Defaulter"), credit_score, fico_range

# Example for prediction
input_example = {
    'AMT_INCOME_TOTAL': 50000,
    'AMT_CREDIT': 200000,
    'AMT_BALANCE': 150000,
    'AMT_ANNUITY': 15000,
    'SK_DPD': 5,
    'CNT_CHILDREN': 1,
    'FLAG_OWN_CAR': 'Y',
    'CODE_GENDER': 'M',
    'NAME_FAMILY_STATUS': 'Married',
    'NAME_INCOME_TYPE': 'Working',
    'NAME_HOUSING_TYPE': 'House / apartment',
    'NAME_CONTRACT_TYPE': 'Cash loans',
    'DAYS_CREDIT': -1000,
    'DAYS_DECISION': -500,
    'AMT_PAYMENT': 5000,
    'AMT_INSTALMENT': 4000,
    'AMT_APPLICATION': 250000
}

result = predict_default(input_example)
print(result)


0:	learn: 0.6391668	total: 864ms	remaining: 14m 23s
100:	learn: 0.2360166	total: 1m 10s	remaining: 10m 30s
200:	learn: 0.2214566	total: 2m 17s	remaining: 9m 4s
300:	learn: 0.2099055	total: 3m 26s	remaining: 8m
400:	learn: 0.1996075	total: 4m 43s	remaining: 7m 3s
500:	learn: 0.1913675	total: 5m 53s	remaining: 5m 51s
600:	learn: 0.1824031	total: 7m 5s	remaining: 4m 42s
700:	learn: 0.1753951	total: 8m 14s	remaining: 3m 30s
800:	learn: 0.1668866	total: 9m 23s	remaining: 2m 19s
900:	learn: 0.1607977	total: 11m 7s	remaining: 1m 13s
999:	learn: 0.1553940	total: 12m 16s	remaining: 0us
('Defaulter', 346, 'Poor')


In [5]:
import joblib

# Save model and scaler
joblib.dump(catboost_model, 'C:\\Users\\SMRUTI DESHPANDE\\house credit default\\catboost_model_pipeline.pkl')
joblib.dump(scaler, 'C:\\Users\\SMRUTI DESHPANDE\\house credit default\\scaler.pkl')

label_encoder_NAME_FAMILY_STATUS = LabelEncoder()
label_encoder_NAME_FAMILY_STATUS.fit(merged_data['NAME_FAMILY_STATUS'])

label_encoder_NAME_INCOME_TYPE = LabelEncoder()
label_encoder_NAME_INCOME_TYPE.fit(merged_data['NAME_INCOME_TYPE'])

label_encoder_NAME_HOUSING_TYPE = LabelEncoder()
label_encoder_NAME_HOUSING_TYPE.fit(merged_data['NAME_HOUSING_TYPE'])

label_encoder_NAME_CONTRACT_TYPE = LabelEncoder()
label_encoder_NAME_CONTRACT_TYPE.fit(merged_data['NAME_CONTRACT_TYPE'])

# Save each encoder
joblib.dump(label_encoder_NAME_FAMILY_STATUS, 'C:\\Users\\SMRUTI DESHPANDE\\house credit default\\label_encoder_NAME_FAMILY_STATUS.pkl')
joblib.dump(label_encoder_NAME_INCOME_TYPE, 'C:\\Users\\SMRUTI DESHPANDE\\house credit default\\label_encoder_NAME_INCOME_TYPE.pkl')
joblib.dump(label_encoder_NAME_HOUSING_TYPE, 'C:\\Users\\SMRUTI DESHPANDE\\house credit default\\label_encoder_NAME_HOUSING_TYPE.pkl')
joblib.dump(label_encoder_NAME_CONTRACT_TYPE, 'C:\\Users\\SMRUTI DESHPANDE\\house credit default\\label_encoder_NAME_CONTRACT_TYPE.pkl')


['C:\\Users\\SMRUTI DESHPANDE\\house credit default\\label_encoder_NAME_CONTRACT_TYPE.pkl']

In [6]:
import sklearn
print(sklearn.__version__)

1.5.2
