In [None]:
"""
DATA DICTIONARY / COLUMN MEANINGS
---------------------------------

APPLICATION_RECORD.CSV (Features)
- ID: Client number
- CODE_GENDER: Gender (M/F)
- FLAG_OWN_CAR: Owns a car (Y/N)
- FLAG_OWN_REALTY: Owns a house (Y/N)
- CNT_CHILDREN: Number of children
- AMT_INCOME_TOTAL: Annual income
- NAME_INCOME_TYPE: Income category (Working, Pensioner, etc.)
- NAME_EDUCATION_TYPE: Education level
- NAME_FAMILY_STATUS: Marital status
- NAME_HOUSING_TYPE: Living situation (Rented, With parents, etc.)
- DAYS_BIRTH: Age in days (Counted backwards, e.g., -10000)
- DAYS_EMPLOYED: Employment duration (Backwards). Positive 365243 means Unemployed.
- FLAG_MOBIL, FLAG_WORK_PHONE, FLAG_PHONE, FLAG_EMAIL: Ways to Contact (1=Yes, 0=No)
- OCCUPATION_TYPE: Job title
- CNT_FAM_MEMBERS: Family size

CREDIT_RECORD.CSV (Target / Label Construction)
- ID: Client number (Key to join)
- MONTHS_BALANCE: Month of history (0 = current, -1 = last month)
- STATUS: Payment status for that month
    - C: Paid off / Closed
    - X: No loan
    - 0: 1-29 days past due
    - 1: 30-59 days past due (Warning sign)
    - 2, 3, 4, 5: >60 days overdue (Classify these as "Bad/Rejected" users)
"""

In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [3]:
root = os.path.dirname(os.path.abspath(os.getcwd()))
data_dir = os.path.join(root, 'dataset')

In [4]:
app_rec = pd.read_csv(os.path.join(data_dir, 'application_record.csv'))
cred_rec = pd.read_csv(os.path.join(data_dir, 'credit_record.csv'))

In [5]:
app_rec.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [6]:
features = [
    'AGE',                  # High Importance
    'YEARS_EMPLOYED',       # High Importance
    'AMT_INCOME_TOTAL',     # Medium Importance (Context for debt)
    'FLAG_OWN_CAR',        # Medium Importance
    'FLAG_OWN_REALTY',     # Medium Importance
    'NAME_INCOME_TYPE',     # Low Importance
    'NAME_EDUCATION_TYPE',  # Low Importance
    'CNT_CHILDREN',         # Low Importance alone, but critical context for Income
    'CNT_FAM_MEMBERS',      # Similar to children
    'NAME_FAMILY_STATUS',   # Married people risk < Single people risk
    'NAME_HOUSING_TYPE',    # Home owners risk < Renters risk
    'OCCUPATION_TYPE'       # Job type matters
]

In [7]:
def classify_family(size):
    if size <= 1:
        return 'Singleton'
    elif size <= 4:
        return 'Small'
    else:
        return 'Large'


bins = [18, 30, 40, 60, 100]
labels = ['young_adults', 'adult', 'mid_age', 'senior']

app_rec['AGE'] = app_rec['DAYS_BIRTH'] / -365.25
app_rec['AGE'] = pd.cut(app_rec['AGE'], bins=bins, labels=labels)
app_rec['YEARS_EMPLOYED'] = app_rec['DAYS_EMPLOYED'] / -365.25
app_rec['FLAG_OWN_CAR'] = app_rec['FLAG_OWN_CAR'].map({'Y': 1, 'N': 0})
app_rec['TEMP'] = np.where(
    app_rec['CNT_CHILDREN'] > app_rec['CNT_FAM_MEMBERS'], # Condition
    app_rec['CNT_CHILDREN'] + 1,                          
    app_rec['CNT_FAM_MEMBERS']                            # Value if False (Keep valid data)
)
app_rec['FAMILY_MEM'] = app_rec['TEMP'].apply(classify_family)
app_rec['FLAG_OWN_REALTY'] = app_rec['FLAG_OWN_REALTY'].map({'Y': 1, 'N': 0})
app_rec['AMOT_INCOME_TOTAL'] = app_rec['AMT_INCOME_TOTAL'] / 10000 # Scale down for numerical stability

In [8]:
app_rec.drop(columns=['CODE_GENDER', 'CNT_CHILDREN', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'CNT_FAM_MEMBERS', 'TEMP'], inplace=True)

In [9]:
app_rec.head()

Unnamed: 0,ID,FLAG_OWN_CAR,FLAG_OWN_REALTY,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,AGE,YEARS_EMPLOYED,FAMILY_MEM,AMOT_INCOME_TOTAL
0,5008804,1,1,427500.0,Working,Higher education,Civil marriage,Rented apartment,1,1,0,0,,adult,12.435318,Small,42.75
1,5008805,1,1,427500.0,Working,Higher education,Civil marriage,Rented apartment,1,1,0,0,,adult,12.435318,Small,42.75
2,5008806,1,1,112500.0,Working,Secondary / secondary special,Married,House / apartment,1,0,0,0,Security staff,mid_age,3.104723,Small,11.25
3,5008808,0,1,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,mid_age,8.353183,Singleton,27.0
4,5008809,0,1,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,mid_age,8.353183,Singleton,27.0


In [10]:
merged_data = pd.merge(app_rec, cred_rec, on='ID', how='inner')

In [11]:
bad_status = ['2', '3', '4', '5']

target_df = merged_data.groupby('ID')['STATUS'].apply(lambda x: 1 if any(i in bad_status for i in x) else 0).reset_index()
target_df.rename(columns={'STATUS': 'is_bad'}, inplace=True)
feature_df = merged_data.drop(columns=['MONTHS_BALANCE', 'STATUS']).drop_duplicates(subset='ID')

In [12]:
final_data = pd.merge(feature_df, target_df, on='ID')
final_data.to_csv(os.path.join(data_dir, 'merged_data.csv'), index=False)

In [13]:
y = final_data['is_bad']
X = final_data.drop(columns=['is_bad', 'ID'])

# 2. Tell the computer which columns are Text (Categorical) and which are Numbers
# We select any column that is of type 'object' (text)
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = X.select_dtypes(include=['number']).columns.tolist()

print(f"Cat Columns: {cat_cols}")

preprocessor = ColumnTransformer(
    transformers = [
        ('num', SimpleImputer(strategy='median'), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ]
)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced'))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)

Cat Columns: ['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'AGE', 'FAMILY_MEM']


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [38]:
print("score: ", model.score(X_test, y_test))
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

score:  0.6659352715304443
              precision    recall  f1-score   support

           0       0.99      0.67      0.80      7175
           1       0.02      0.50      0.05       117

    accuracy                           0.67      7292
   macro avg       0.51      0.59      0.42      7292
weighted avg       0.97      0.67      0.79      7292





In [14]:
feature_names = model.named_steps['preprocessor'].get_feature_names_out()

coeff = model.named_steps['classifier'].coef_[0]
intercept = model.named_steps['classifier'].intercept_[0]

score_card = pd.DataFrame({
        'feature': feature_names,
        'points': coeff
    })

score_card.sort_values(by='points', ascending=False)
print(f"BASE SCORE (Intercept): {intercept:.4f}")
print(score_card)


BASE SCORE (Intercept): 0.3003
                                              feature        points
0                                   num__FLAG_OWN_CAR -9.463254e-02
1                                num__FLAG_OWN_REALTY -3.751007e-01
2                               num__AMT_INCOME_TOTAL -1.100993e-07
3                                     num__FLAG_MOBIL  2.989481e-01
4                                num__FLAG_WORK_PHONE  8.988501e-02
5                                     num__FLAG_PHONE  3.313105e-02
6                                     num__FLAG_EMAIL -1.786138e-01
7                                 num__YEARS_EMPLOYED  1.410804e-03
8                              num__AMOT_INCOME_TOTAL -1.001260e-11
9          cat__NAME_INCOME_TYPE_Commercial associate -1.844990e-01
10                    cat__NAME_INCOME_TYPE_Pensioner  1.418206e+00
11                cat__NAME_INCOME_TYPE_State servant -7.028464e-01
12                      cat__NAME_INCOME_TYPE_Student -1.716013e-02
13               

In [40]:
def calculate_score(user_data):
    
    log_odds = 0.2983

    log_odds += user_data.get('AMT_INCOME_TOTAL', 0) * -0.00000035
    log_odds += user_data.get('YEARS_EMPLOYED', 0) * 0.001810

    adults = 1 # The applicant
    if user_data.get('FAMILY_STATUS') in ['Married', 'Civil marriage']:
        adults += 1
        
    total_size = adults + user_data.get('CNT_CHILDREN', 0)
    
    # Determine the Bin
    if total_size <= 1:
        fam_bin = 'Singleton'
    elif total_size <= 4:
        fam_bin = 'Small'
    else:
        fam_bin = 'Large'
        
    # Apply Weight for the Bin
    family_size_weights = {
        'Large': 0.6830,      # High Risk
        'Singleton': -0.4090, # Low Risk (Safety factor)
        'Small': 0.0228       # Neutral
    }
    log_odds += family_size_weights.get(fam_bin, 0)

    income_weights = {
        'Pensioner': 1.685,          # Very High Risk
        'State servant': -0.721,     # Very Safe
        'Working': -0.342,
        'Commercial associate': -0.300,
        'Student': -0.023
    }
    log_odds += income_weights.get(user_data.get('INCOME_TYPE'), 0)
    
    # Education
    edu_weights = {
        'Lower secondary': 0.441,    # Risky
        'Incomplete higher': 0.142,
        'Secondary / secondary special': -0.134,
        'Higher education': -0.069,
        'Academic degree': -0.083
    }
    log_odds += edu_weights.get(user_data.get('EDUCATION_TYPE'), 0)
    
    # Family Status
    family_status_weights = {
        'Widow': 0.827,              # High Risk
        'Single': 0.252,
        'Separated': -0.208,
        'Married': -0.273,           # Safe
        'Civil marriage': -0.300
    }
    log_odds += family_status_weights.get(user_data.get('FAMILY_STATUS'), 0)
    
    # Housing
    housing_weights = {
        'Municipal apartment': 0.517, # Risky
        'Office apartment': 0.384,
        'House / apartment': -0.101,  # Safe
        'Co-op apartment': -0.075,
        'Rented apartment': -0.055,
        'With parents': -0.372        # Very Safe
    }
    log_odds += housing_weights.get(user_data.get('HOUSING_TYPE'), 0)
    
    # Occupation (Selected)
    job_weights = {
        'Low-skill Laborers': 0.606,  # High Risk
        'High skill tech staff': 0.554,
        'Drivers': 0.460,
        'IT staff': 0.462,            # Surprisingly Risky in this model
        'Security staff': 0.394,
        'Core staff': 0.240,
        'Laborers': 0.061,
        'Accountants': 0.056,
        'Managers': 0.051,
        'Sales staff': -0.131,
        'Realty agents': -0.189,
        'Cooking staff': -0.387,
        'Cleaning staff': -0.465,
        'Medicine staff': -0.548,
        'Private service staff': -0.663
    }
    log_odds += job_weights.get(user_data.get('OCCUPATION_TYPE'), 0)
    
    # Age Group
    age_weights = {
        'young_adults': 0.224, # Riskiest
        'adult': 0.051,
        'mid_age': 0.010,
        'senior': 0.010        # Neutral
    }
    log_odds += age_weights.get(user_data.get('AGE_GROUP'), 0)

    prob_bad = 1 / (1 + np.exp(-log_odds))

    prob_good = 1 - prob_bad

    credit_score = 300 + (prob_good*550)
    if credit_score > 850:
        credit_score = 850

    return int(credit_score), prob_good

new_user = {
    'CNT_CHILDREN': 1,
    'AMT_INCOME_TOTAL': 1000000,
    'INCOME_TYPE': 'State servant',
    'EDUCATION_TYPE': 'Secondary / secondary special',
    'FAMILY_STATUS': 'Married',
    'HOUSING_TYPE': 'With parents',
    'AGE_GROUP': 'senior',
    'OCCUPATION_TYPE': 'Private service staff'
}

score, prob = calculate_score(new_user)
print(f"Credit Score: {score}")
print(f"Approval Chance: {prob:.1%}")

Credit Score: 794
Approval Chance: 89.9%
