<a href="https://www.kaggle.com/code/sid4ds/ps-s4e6-00-data-overview-catboost-baseline?scriptVersionId=181094377" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Setup

In [1]:
import gc
import warnings

# Garbage collection is switched on explicitly; gc.collect() is called after each CV fold
gc.enable()
# Silences every warning for the rest of the notebook (deprecation warnings included)
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

# Display options: 4 decimal places for floats, never truncate the column list
pd.set_option('display.precision', 4)
pd.set_option('display.max_columns', None)

import catboost as cb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# Seed shared by the CV splitter default and the CatBoost models in this notebook
SEED = 2024

In [2]:
# Probe for a GPU: a successful `nvidia-smi` call is treated as "GPU available"
import subprocess

DEVICE = 'GPU'
try:
    subprocess.check_output('nvidia-smi')
except Exception:
    # Any failure (missing binary, non-zero exit status, ...) falls back to CPU
    DEVICE = 'CPU'

print(f'Available device: {DEVICE}')

Available device: GPU


In [3]:
DATA_DIR = '/kaggle/input/playground-series-s4e6'

# Read the three competition CSVs: training set, test set and the submission template
train, test, sample_sub = (
    pd.read_csv(f'{DATA_DIR}/{stem}.csv')
    for stem in ('train', 'test', 'sample_submission')
)

**Original dataset**

In [4]:
%%capture
!pip install ucimlrepo

In [5]:
from ucimlrepo import fetch_ucirepo

# Download UCI repository dataset #697; the result object is kept because its
# `.variables` table is inspected further down
predict_students_dropout_and_academic_success = fetch_ucirepo(id=697)
uci_data = predict_students_dropout_and_academic_success.data
X, y = uci_data.features, uci_data.targets

# Features and target side by side, mirroring the layout of the competition train file
original = pd.concat([X, y], axis=1)

# Data overview

In [6]:
# (rows, columns) of the competition train/test files and of the original dataset
train.shape, test.shape, original.shape

((76518, 38), (51012, 37), (4424, 37))

In [7]:
# Peek at a few training rows; fixed random_state keeps the displayed rows reproducible
train.sample(3, random_state=SEED)

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
47523,47523,1,1,1,9085,1,1,112.0,1,1,1,4,7,107.4,0,0,0,0,1,0,20,0,0,6,7,0,0.0,0,0,6,6,0,0.0,0,12.4,0.5,1.79,Dropout
40685,40685,1,39,1,9991,0,1,130.0,1,19,37,9,9,134.1,0,0,1,0,0,0,38,0,0,5,11,1,13.0,0,0,5,10,0,0.0,0,8.9,1.4,3.51,Dropout
9146,9146,1,17,1,9130,1,1,142.0,1,37,38,9,8,137.2,1,0,0,1,0,0,18,0,0,6,8,6,15.0,0,0,6,7,6,14.7143,0,15.5,2.8,-4.06,Graduate


In [8]:
# Peek at one row of the original dataset; fixed random_state keeps it reproducible
original.sample(random_state=SEED)

Unnamed: 0,Marital Status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
4318,1,1,1,171,1,1,161.0,41,1,1,9,7,163.1,1,0,0,0,0,0,19,1,6,12,12,12,15.25,0,4,11,12,11,15.0,0,8.9,1.4,3.51,Graduate


In [9]:
# Move the `id` column out of both frames so only features (and the target) remain.
# pop() mutates the frame in place, so guard it: re-running this cell must not
# raise KeyError once the column is already gone.
if 'id' in train.columns:
    train_index = train.pop('id')
if 'id' in test.columns:
    test_index = test.pop('id')

In [10]:
# Position-wise comparison of the column labels of train and original
train.columns == original.columns

array([False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [11]:
# Align the one differently-cased column name with the competition data
original = original.rename(columns={'Marital Status': 'Marital status'})

**Missing values**

In [12]:
# Total number of missing cells in each of the three frames
train.isna().sum().sum(), test.isna().sum().sum(), original.isna().sum().sum()

(0, 0, 0)

**Duplicates**

In [13]:
# Number of fully duplicated rows inside each frame
train.duplicated().sum(), test.duplicated().sum(), original.duplicated().sum()

(0, 0, 0)

In [14]:
# Duplicated rows in train + original stacked together, comparing all features and the target
pd.concat([train, original], axis=0).duplicated().sum()

0

In [15]:
# Same check on the feature columns only (target ignored)
pd.concat([train, original], axis=0).duplicated(subset=test.columns).sum()

14

In [16]:
# Rows of train + test stacked together that repeat an earlier row on the feature columns
pd.concat([train, test], axis=0).duplicated(subset=test.columns).sum()

0

In [17]:
# Rows of original + test stacked together that repeat an earlier row on the feature columns
pd.concat([original, test], axis=0).duplicated(subset=test.columns).sum()

7

**Variable information**

In [18]:
# Per-variable metadata (name, role, type, description, ...) attached to the fetched dataset
predict_students_dropout_and_academic_success.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,Marital Status,Feature,Integer,Marital Status,1 – single 2 – married 3 – widower 4 – divorce...,,no
1,Application mode,Feature,Integer,,1 - 1st phase - general contingent 2 - Ordinan...,,no
2,Application order,Feature,Integer,,Application order (between 0 - first choice; a...,,no
3,Course,Feature,Integer,,33 - Biofuel Production Technologies 171 - Ani...,,no
4,Daytime/evening attendance,Feature,Integer,,1 – daytime 0 - evening,,no
5,Previous qualification,Feature,Integer,Education Level,1 - Secondary education 2 - Higher education -...,,no
6,Previous qualification (grade),Feature,Continuous,,Grade of previous qualification (between 0 and...,,no
7,Nacionality,Feature,Integer,Nationality,1 - Portuguese; 2 - German; 6 - Spanish; 11 - ...,,no
8,Mother's qualification,Feature,Integer,Education Level,1 - Secondary Education - 12th Year of Schooli...,,no
9,Father's qualification,Feature,Integer,Education Level,1 - Secondary Education - 12th Year of Schooli...,,no


In [19]:
# Every column of the test frame is used as a model input
FEATURES = list(test.columns)

# Subset of FEATURES handed to CatBoost as categorical columns
CAT_FEATURES = [
    "Marital status",
    "Application mode",
    "Course",
    "Daytime/evening attendance",
    "Previous qualification",
    "Nacionality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Displaced",
    "Educational special needs",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
    "International",
]

**Target distribution**

In [20]:
# Name of the label column, followed by the class shares in the training data
TARGET = 'Target'
train[TARGET].value_counts(normalize=True)

Target
Graduate    0.4742
Dropout     0.3306
Enrolled    0.1952
Name: proportion, dtype: float64

In [21]:
# Class shares in the original dataset, for comparison with the training data
original[TARGET].value_counts(normalize=True)

Target
Graduate    0.4993
Dropout     0.3212
Enrolled    0.1795
Name: proportion, dtype: float64

In [22]:
# Integer codes for the three target classes; the inverse map turns predicted
# codes back into class labels when writing submissions
target_mapping = {'Graduate': 0, 'Enrolled': 1, 'Dropout': 2}
target_inverse = {code: label for label, code in target_mapping.items()}

# replace() leaves already-encoded values untouched, so re-running the cell is harmless.
# The explicit cast pins an integer dtype instead of relying on replace() to downcast
# the object column implicitly, and it fails loudly if an unmapped label is left over.
train[TARGET] = train[TARGET].replace(target_mapping).astype('int64')
original[TARGET] = original[TARGET].replace(target_mapping).astype('int64')

# Modeling framework

In [23]:
def comp_metric(y_true, y_pred):
    """Score predictions with the metric used throughout this notebook (accuracy).

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, in the same order as `y_true`.

    Returns:
        The value returned by `accuracy_score(y_true, y_pred)`.
    """
    score = accuracy_score(y_true, y_pred)
    return score

In [24]:
def custom_cv(features, model, folds=10, extend=False, seed=SEED, verbose=True):
    """Cross-validate `model` on the notebook-level `train` frame and predict `test`.

    The model is re-fitted on every fold of a shuffled StratifiedKFold split, with
    the validation fold passed as `eval_set`.

    Args:
        features: column names used as model inputs; those also listed in
            CAT_FEATURES are passed to the model as categorical features.
        model: estimator exposing `fit(X, y, eval_set=..., cat_features=..., verbose=...)`,
            `predict` and `best_iteration_`.
        folds: number of CV splits.
        extend: when True, the `original` dataset is appended to the training part
            of every fold (never to the validation part).
        seed: random_state of the fold splitter.
        verbose: print the score and best iteration of each fold.

    Returns:
        (oof_preds, test_preds): a Series of out-of-fold predictions indexed by the
        row positions of `train`, and a DataFrame with one column of test
        predictions per fold plus a `mode` column holding their row-wise mode.
    """
    fold_test_preds = {}
    oof_by_row = {}

    X_full, y_full = train[features], train[TARGET]
    X_holdout = test[features]

    categorical = [col for col in features if col in CAT_FEATURES]

    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold_no, (fit_rows, val_rows) in enumerate(splitter.split(X_full, y_full)):
        X_fit, y_fit = X_full.iloc[fit_rows], y_full.iloc[fit_rows]
        X_val, y_val = X_full.iloc[val_rows], y_full.iloc[val_rows]

        if extend:
            # the original dataset only ever augments the training part of a fold
            X_fit = pd.concat([X_fit, original[features]], axis=0, ignore_index=True)
            y_fit = pd.concat([y_fit, original[TARGET]], axis=0, ignore_index=True)

        model.fit(
            X_fit,
            y_fit,
            eval_set=[(X_val, y_val)],
            cat_features=categorical,
            verbose=False,
        )

        val_labels = model.predict(X_val).flatten()
        # key each out-of-fold prediction by its row position in `train`
        for row, label in zip(val_rows, val_labels):
            oof_by_row[row] = label
        fold_test_preds[f'fold{fold_no}'] = model.predict(X_holdout).flatten()

        if verbose:
            fold_score = comp_metric(y_val, val_labels)
            print(f'Fold #{fold_no:>2}: {fold_score:.5f} ({model.best_iteration_:>4} rounds)')

        gc.collect()

    test_frame = pd.DataFrame.from_dict(fold_test_preds)
    # final test prediction = most frequent label across the fold models
    test_frame['mode'] = test_frame.mode(axis=1)[0]

    oof_series = pd.Series(oof_by_row).sort_index()
    print(f'OOF score: {comp_metric(y_full, oof_series):.5f}\n')

    return oof_series, test_frame

In [25]:
def create_submission_files(preds, config, notebook='00'):
    """Write a submission CSV named `nb{notebook}_{config}.csv`.

    Args:
        preds: frame with a `mode` column of encoded class predictions.
        config: experiment identifier embedded in the file name.
        notebook: notebook identifier embedded in the file name.
    """
    decoded = preds['mode'].replace(target_inverse)  # class codes -> class labels
    out_path = f'nb{notebook}_{config}.csv'

    submission = sample_sub.copy()
    submission[TARGET] = decoded
    submission.to_csv(out_path, index=False)

# Experiments

In [26]:
# CatBoost settings shared by every experiment below
BASE_PARAMS = dict(
    loss_function='MultiClass',
    eval_metric='Accuracy',
    metric_period=1,
    iterations=5000,
    learning_rate=0.03,
    early_stopping_rounds=100,
    use_best_model=True,
    task_type=DEVICE,
    thread_count=-1,
)

# GPU runs additionally pin the device ids
# NOTE(review): '0:1' looks like it requests GPU ids 0 and 1 — confirm on single-GPU machines
if DEVICE == 'GPU':
    BASE_PARAMS['devices'] = '0:1'

In [27]:
# Result stores keyed by experiment config: out-of-fold preds (op) and test preds (tp)
op, tp = {}, {}

In [28]:
%%time

# Experiment: competition training data only (TRN), 5-fold stratified CV
dataset = 'TRN'
folds = 5
seed = SEED

config = f'data{dataset}_folds{folds}_seed{seed}'

model = cb.CatBoostClassifier(**BASE_PARAMS, random_seed=seed)

# seed is forwarded explicitly so the fold split always matches the seed in `config`
op[config], tp[config] = custom_cv(
    features=FEATURES,
    model=model,
    folds=folds,
    seed=seed)

create_submission_files(tp[config], config)

Fold # 0: 0.83272 ( 602 rounds)
Fold # 1: 0.82423 ( 468 rounds)
Fold # 2: 0.83057 ( 664 rounds)
Fold # 3: 0.83291 (1286 rounds)
Fold # 4: 0.83304 (1037 rounds)
OOF score: 0.83069

CPU times: user 3min 16s, sys: 1min 2s, total: 4min 19s
Wall time: 1min 38s


In [29]:
%%time

# Experiment: training folds extended with the original dataset (EXT), 5-fold stratified CV
dataset = 'EXT'
folds = 5
seed = SEED

config = f'data{dataset}_folds{folds}_seed{seed}'

model = cb.CatBoostClassifier(**BASE_PARAMS, random_seed=seed)

# seed is forwarded explicitly so the fold split always matches the seed in `config`
op[config], tp[config] = custom_cv(
    features=FEATURES,
    model=model,
    folds=folds,
    extend=True,
    seed=seed)

create_submission_files(tp[config], config)

Fold # 0: 0.83312 ( 600 rounds)
Fold # 1: 0.82756 (1046 rounds)
Fold # 2: 0.83135 ( 776 rounds)
Fold # 3: 0.82886 ( 466 rounds)
Fold # 4: 0.83291 ( 816 rounds)
OOF score: 0.83076

CPU times: user 3min 7s, sys: 56.8 s, total: 4min 3s
Wall time: 1min 33s


In [30]:
%%time

# Experiment: competition training data only (TRN), 7-fold stratified CV
dataset = 'TRN'
folds = 7
seed = SEED

config = f'data{dataset}_folds{folds}_seed{seed}'

model = cb.CatBoostClassifier(**BASE_PARAMS, random_seed=seed)

# seed is forwarded explicitly so the fold split always matches the seed in `config`
op[config], tp[config] = custom_cv(
    features=FEATURES,
    model=model,
    folds=folds,
    seed=seed)

create_submission_files(tp[config], config)

Fold # 0: 0.82894 ( 679 rounds)
Fold # 1: 0.83103 ( 513 rounds)
Fold # 2: 0.82518 ( 727 rounds)
Fold # 3: 0.83039 ( 260 rounds)
Fold # 4: 0.82508 ( 274 rounds)
Fold # 5: 0.82975 ( 188 rounds)
Fold # 6: 0.82902 ( 297 rounds)
OOF score: 0.82848

CPU times: user 2min 47s, sys: 51.3 s, total: 3min 38s
Wall time: 1min 24s


In [31]:
%%time

# Experiment: training folds extended with the original dataset (EXT), 7-fold stratified CV
dataset = 'EXT'
folds = 7
seed = SEED

config = f'data{dataset}_folds{folds}_seed{seed}'

model = cb.CatBoostClassifier(**BASE_PARAMS, random_seed=seed)

# seed is forwarded explicitly so the fold split always matches the seed in `config`
op[config], tp[config] = custom_cv(
    features=FEATURES,
    model=model,
    folds=folds,
    extend=True,
    seed=seed)

create_submission_files(tp[config], config)

Fold # 0: 0.82702 ( 412 rounds)
Fold # 1: 0.83368 ( 817 rounds)
Fold # 2: 0.82298 ( 236 rounds)
Fold # 3: 0.83506 ( 815 rounds)
Fold # 4: 0.82710 ( 428 rounds)
Fold # 5: 0.83295 ( 445 rounds)
Fold # 6: 0.82948 ( 319 rounds)
OOF score: 0.82975

CPU times: user 3min 7s, sys: 57.9 s, total: 4min 5s
Wall time: 1min 33s


In [32]:
%%time

# Experiment: competition training data only (TRN), 15-fold stratified CV
dataset = 'TRN'
folds = 15
seed = SEED

config = f'data{dataset}_folds{folds}_seed{seed}'

model = cb.CatBoostClassifier(**BASE_PARAMS, random_seed=seed)

# seed is forwarded explicitly so the fold split always matches the seed in `config`
op[config], tp[config] = custom_cv(
    features=FEATURES,
    model=model,
    folds=folds,
    seed=seed)

create_submission_files(tp[config], config)

Fold # 0: 0.82497 ( 434 rounds)
Fold # 1: 0.83183 ( 500 rounds)
Fold # 2: 0.84026 ( 606 rounds)
Fold # 3: 0.82396 ( 575 rounds)
Fold # 4: 0.82886 ( 376 rounds)
Fold # 5: 0.82258 ( 341 rounds)
Fold # 6: 0.82592 ( 432 rounds)
Fold # 7: 0.83631 ( 340 rounds)
Fold # 8: 0.82768 ( 335 rounds)
Fold # 9: 0.82827 ( 243 rounds)
Fold #10: 0.82729 ( 352 rounds)
Fold #11: 0.82945 ( 518 rounds)
Fold #12: 0.84180 (1221 rounds)
Fold #13: 0.83395 ( 305 rounds)
Fold #14: 0.82258 ( 263 rounds)
OOF score: 0.82971

CPU times: user 5min 53s, sys: 1min 44s, total: 7min 37s
Wall time: 2min 44s


In [33]:
%%time

# Experiment: training folds extended with the original dataset (EXT), 15-fold stratified CV
dataset = 'EXT'
folds = 15
seed = SEED

config = f'data{dataset}_folds{folds}_seed{seed}'

model = cb.CatBoostClassifier(**BASE_PARAMS, random_seed=seed)

# seed is forwarded explicitly so the fold split always matches the seed in `config`
op[config], tp[config] = custom_cv(
    features=FEATURES,
    model=model,
    folds=folds,
    extend=True,
    seed=seed)

create_submission_files(tp[config], config)

Fold # 0: 0.82595 ( 537 rounds)
Fold # 1: 0.83261 ( 383 rounds)
Fold # 2: 0.84222 ( 619 rounds)
Fold # 3: 0.82807 (1318 rounds)
Fold # 4: 0.82788 ( 189 rounds)
Fold # 5: 0.82121 ( 232 rounds)
Fold # 6: 0.82729 ( 701 rounds)
Fold # 7: 0.83807 ( 565 rounds)
Fold # 8: 0.82690 ( 341 rounds)
Fold # 9: 0.83435 ( 646 rounds)
Fold #10: 0.82886 ( 513 rounds)
Fold #11: 0.83435 (1081 rounds)
Fold #12: 0.84023 ( 801 rounds)
Fold #13: 0.83474 ( 360 rounds)
Fold #14: 0.82396 ( 366 rounds)
OOF score: 0.83111

CPU times: user 7min 7s, sys: 2min 8s, total: 9min 15s
Wall time: 3min 17s


In [34]:
# Sanity check: show the first lines of the submission written by the last experiment
!head nb00_dataEXT_folds15_seed2024.csv

id,Target
76518,Dropout
76519,Graduate
76520,Graduate
76521,Enrolled
76522,Enrolled
76523,Graduate
76524,Graduate
76525,Graduate
76526,Dropout
