<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/others/devfolio_oracleofdelphi/notebooks/03_final_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
%%capture
!pip install --upgrade xgboost

In [2]:
import os
import gc
import time
import warnings
import subprocess
# Make sure automatic garbage collection is switched on for this session.
gc.enable()
# NOTE(review): this silences *every* warning category for the whole notebook,
# so deprecation notices raised by pandas / sklearn / xgboost will not be shown.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

# Single seed shared by NumPy's global RNG, the CV splitter and the model.
SEED = 23
np.random.seed(SEED)
# Exported as a string because environment values must be text.
# NOTE(review): the running kernel's own hash seed was fixed at interpreter start-up,
# so this setting presumably only matters for child processes -- confirm it is needed.
os.environ['PYTHONHASHSEED'] = f'{SEED}'

In [4]:
#Check GPU availability
# Best-effort probe: if `nvidia-smi` cannot be run (missing binary, non-zero exit
# status, ...) the notebook simply falls back to CPU settings.
try:
    subprocess.check_output('nvidia-smi')
except Exception:
    HAVE_GPU = False
else:
    HAVE_GPU = True

print(f'GPU available: {HAVE_GPU}')

GPU available: False


# Data preparation

In [5]:
#Data
# Both competition CSVs are read directly from the GitHub repository below.
DATA_URL = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/others/devfolio_oracleofdelphi/data'
train, test = (
    pd.read_csv(f'{DATA_URL}/{split}.csv') for split in ('train', 'test')
)

In [6]:
# Remove the two 'Unnamed' columns from both frames
# (presumably index columns written out by earlier CSV exports -- not used as features).
train, test = (
    frame.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
    for frame in (train, test)
)

In [7]:
TARGET = 'Credit Score'
label_mapping = {'Standard': 0, 'Good': 1, 'Best': 2}
# Encode the class names as integer codes.
# `.map` followed by an explicit `astype('int64')` is used instead of `.replace`:
# `replace` depended on pandas silently downcasting the resulting object column to
# integers, which newer pandas versions deprecate. `astype` also fails loudly if a
# label is missing from `label_mapping` (such rows come out of `.map` as NaN).
# The dtype guard keeps the cell safe to re-run once the column is already encoded.
if not pd.api.types.is_integer_dtype(train[TARGET]):
    train[TARGET] = train[TARGET].map(label_mapping).astype('int64')

**Outlier removal**

In [8]:
# A training row is kept only if it satisfies every range condition below;
# failing any single one drops the row. The index is renumbered afterwards.
within_limits = (
    (train['Accounts of user'] <= 10)
    & (train['Credit cards user have'] <= 10)
    & (train['Interest Rate'] <= 35)
    & (train['Num_of_Loan'] >= 0)
    & (train['Num_of_Loan'] < 10)
    & (train['Num_of_Delayed_Payment'] >= 0)
    & (train['Num_of_Delayed_Payment'] < 30)
    & (train['Changed_Credit_Limit'] >= 0)
    & (train['Total_EMI_per_month'] < 1000)
)
train = train[within_limits].reset_index(drop=True)

**Chosen feature set**

In [10]:
# Columns fed to the model (one per line for easier diffing).
anova_features = [
    'Utlization Ratio',
    'Credit Inquiries',
    'Accounts of user',
    'Credit cards user have',
    'Interest Rate',
    'Num_of_Loan',
    'Delay_from_due_date',
    'Num_of_Delayed_Payment',
    'Outstanding_Debt',
]

# Modeling

In [11]:
def custom_f1(ytrue, ypred):
    """Return the negated macro-averaged F1 score of ``ypred`` against ``ytrue``.

    The sign is flipped so that the score behaves like a loss
    (a better F1 gives a smaller value).
    """
    macro_f1 = f1_score(ytrue, ypred, average='macro')
    return -macro_f1

In [12]:
def cross_val_predict(data, model, n_splits=5):
    """Run stratified K-fold CV, display per-fold F1 scores and predict the test set.

    The same ``model`` object is re-fitted on each fold's training part, with that
    fold's validation part passed as ``eval_set``. After every fit the model
    predicts both the validation part (for scoring) and the full test set.

    Parameters
    ----------
    data : tuple
        ``(X, y, X_test)``: training features, training target and test features.
        ``X`` and ``y`` are sliced positionally, so they must be pandas objects
        of equal length.
    model : estimator
        Object exposing ``fit(X, y, eval_set=..., verbose=...)`` and ``predict(X)``.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    pandas.DataFrame
        One column per fold (``fold0`` ... ``fold{n_splits-1}``) holding that
        fold's test-set predictions, plus a ``mode`` column with the row-wise
        most frequent prediction cast to int.

    Side effects
    ------------
    Displays a table of micro / macro / weighted F1 per fold and their average.
    """
    # Column name in the score table -> `average` argument of f1_score.
    metric_averages = {
        'F1-micro': 'micro',
        'F1-macro': 'macro',
        'F1-weighted': 'weighted',
    }
    scores = {'Fold': [str(i) for i in range(n_splits)]}
    for metric in metric_averages:
        scores[metric] = []
    test_preds = {}

    X, y, X_test = data

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        # The splitter yields *positional* indices, so X and y are both sliced with
        # .iloc. Label-based .loc only matched by accident when X had a 0..n-1 index.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0
        )
        val_preds = model.predict(X_val)
        test_preds[f'fold{fold}'] = model.predict(X_test)

        for metric, average in metric_averages.items():
            scores[metric].append(f1_score(y_val, val_preds, average=average))
        _ = gc.collect()

    # Append an 'Avg.' row holding the mean of each metric over the folds.
    scores['Fold'].append('Avg.')
    for metric in metric_averages:
        scores[metric].append(np.mean(scores[metric]))
    score_df = pd.DataFrame.from_dict(scores).set_index('Fold')
    display(score_df)  # `display` is supplied by the IPython/Jupyter session

    test_preds = pd.DataFrame.from_dict(test_preds)
    # Majority vote across folds; [0] takes the first mode when several values tie.
    test_preds['mode'] = test_preds.mode(axis=1)[0].astype('int')
    return test_preds

In [14]:
#best params from notebook: 01-xgboost
# Fixed settings that were not part of the tuning search.
base_params = {
    'objective': 'multi:softmax',
    'n_estimators': 10000,
    'booster': 'gbtree',
    'eval_metric': custom_f1,
    'early_stopping_rounds': 100,
    # GPU-specific implementations are selected only when a GPU was detected.
    'tree_method': 'gpu_hist' if HAVE_GPU else 'hist',
    'predictor': 'gpu_predictor' if HAVE_GPU else 'cpu_predictor',
    'enable_categorical': HAVE_GPU,
    'verbosity': 0,
    'seed': SEED,
}
# Hyper-parameter values carried over from the tuning notebook.
tuned_params = {
    'learning_rate': 0.175,
    'max_depth': 10,
    'min_child_weight': 4,
    'gamma': 0.1,
    'alpha': 1.8,
    'lambda': 0.06553376734028496,
    'subsample': 0.95,
    'colsample_bytree': 0.65,
    'colsample_bylevel': 0.9,
    'colsample_bynode': 0.85,
}
# Merge order keeps the base settings first, followed by the tuned ones.
model_params = {**base_params, **tuned_params}

In [15]:
%%time
test_preds = cross_val_predict(
    data=(train[anova_features], train[TARGET], test[anova_features]),
    model=XGBClassifier(**model_params)
)

Unnamed: 0_level_0,F1-micro,F1-macro,F1-weighted
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.932171,0.904254,0.931448
1,0.921512,0.904187,0.921065
2,0.92345,0.906526,0.92244
3,0.908826,0.886765,0.908633
4,0.908826,0.8804,0.90774
Avg.,0.918957,0.896426,0.918265


CPU times: user 9.74 s, sys: 154 ms, total: 9.89 s
Wall time: 6.45 s


# Submission file

In [17]:
# Decode the integer predictions back to class names by inverting `label_mapping`,
# so the encoding and decoding tables are derived from a single definition.
code_to_label = {code: label for label, code in label_mapping.items()}
submission = pd.DataFrame({TARGET: test_preds['mode'].replace(code_to_label)})
# Single-column CSV; the row index is not written.
submission.to_csv('submission.csv', index=False)