# Setup

In [1]:
# Standard-library helpers: garbage-collector control and warning filtering.
import gc
import warnings

gc.enable()
# Silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

# Show floats with 4 decimals and never truncate the column list of wide frames.
pd.set_option('display.precision', 4)
pd.set_option('display.max_columns', None)

import catboost as cb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# Default random seed; used as the default CV-split seed and as the model seed below.
SEED = 2024

In [2]:
# Pick the training device: 'GPU' when the `nvidia-smi` command runs
# successfully, 'CPU' when launching it fails for any reason.
import subprocess

try:
    subprocess.check_output('nvidia-smi')
except Exception:
    # Best-effort probe: a missing binary or a non-zero exit both mean "no usable GPU".
    DEVICE = 'CPU'
else:
    DEVICE = 'GPU'

print(f'Available device: {DEVICE}')

Available device: GPU


In [3]:
# Directory holding the competition CSVs (absolute path; presumably only present
# inside a Kaggle runtime — adjust when running elsewhere).
DATA_DIR = '/kaggle/input/playground-series-s4e6'

# Competition train/test tables plus the submission template.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

**Original dataset**

In [4]:
%%capture
!pip install ucimlrepo

In [5]:
from ucimlrepo import fetch_ucirepo

# Fetch UCI ML Repository dataset id 697; features and targets are exposed as
# separate attributes of the returned object.
predict_students_dropout_and_academic_success = fetch_ucirepo(id=697)
X = predict_students_dropout_and_academic_success.data.features
y = predict_students_dropout_and_academic_success.data.targets

# Put features and target side by side in a single frame.
original = pd.concat([X, y], axis=1)

# Data overview

In [6]:
# (rows, columns) of the competition train/test frames and the original UCI frame.
train.shape, test.shape, original.shape

((76518, 38), (51012, 37), (4424, 37))

In [7]:
# Seeded so the previewed rows are the same on every Restart & Run All.
train.sample(3, random_state=SEED)

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
20721,20721,1,39,1,9003,1,1,133.1,1,19,19,9,7,134.0,0,0,0,1,1,0,29,0,0,6,16,4,12.0,0,0,6,15,4,10.75,0,16.2,0.3,-0.92,Enrolled
51396,51396,1,17,1,9147,1,1,142.0,1,1,1,4,5,127.0,1,0,0,0,1,0,18,0,0,5,10,3,12.4,0,0,5,7,5,10.8333,0,13.9,-0.3,0.79,Dropout
1836,1836,1,1,1,9500,1,1,126.0,1,19,37,9,8,118.7,0,0,0,1,1,0,19,0,0,7,8,0,0.0,0,0,7,9,0,0.0,0,13.9,-0.3,0.79,Dropout


In [8]:
# Seeded so the previewed row is the same on every Restart & Run All.
original.sample(random_state=SEED)

Unnamed: 0,Marital Status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
4052,1,1,1,9254,1,1,122.0,1,37,37,9,9,114.3,1,0,0,0,0,1,20,0,0,6,12,4,11.25,0,0,6,12,3,11.3333,0,7.6,2.6,0.32,Dropout


In [9]:
# `pop` removes the `id` column from each frame in place and returns it, so
# re-running this cell on the same frames raises KeyError.
train_index = train.pop('id')
test_index = test.pop('id')

In [10]:
# Element-wise comparison of the column labels of both frames (run after `id`
# has been removed from train); a False marks a label that differs.
train.columns == original.columns

array([False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [11]:
# Align the one differing column label with the competition data's spelling.
original = original.rename(columns={'Marital Status': 'Marital status'})

**Missing values**

In [12]:
# Total number of missing cells in each of the three frames.
train.isna().sum().sum(), test.isna().sum().sum(), original.isna().sum().sum()

(0, 0, 0)

**Duplicates**

In [13]:
# Number of fully duplicated rows inside each frame.
train.duplicated().sum(), test.duplicated().sum(), original.duplicated().sum()

(0, 0, 0)

In [14]:
# Rows repeated across train + original when all features AND the target are compared.
pd.concat([train, original], axis=0).duplicated().sum()

0

In [15]:
# Same check without the target: rows are compared on the columns of `test` only.
pd.concat([train, original], axis=0).duplicated(subset=test.columns).sum()

14

In [16]:
# Rows repeated across train + test, compared on the columns of `test`.
pd.concat([train, test], axis=0).duplicated(subset=test.columns).sum()

0

In [17]:
# Rows repeated across original + test, compared on the columns of `test`.
pd.concat([original, test], axis=0).duplicated(subset=test.columns).sum()

7

**Variable information**

In [18]:
# Variable metadata table exposed by the fetched UCI dataset object.
predict_students_dropout_and_academic_success.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,Marital Status,Feature,Integer,Marital Status,1 – single 2 – married 3 – widower 4 – divorce...,,no
1,Application mode,Feature,Integer,,1 - 1st phase - general contingent 2 - Ordinan...,,no
2,Application order,Feature,Integer,,Application order (between 0 - first choice; a...,,no
3,Course,Feature,Integer,,33 - Biofuel Production Technologies 171 - Ani...,,no
4,Daytime/evening attendance,Feature,Integer,,1 – daytime 0 - evening,,no
5,Previous qualification,Feature,Integer,Education Level,1 - Secondary education 2 - Higher education -...,,no
6,Previous qualification (grade),Feature,Continuous,,Grade of previous qualification (between 0 and...,,no
7,Nacionality,Feature,Integer,Nationality,1 - Portuguese; 2 - German; 6 - Spanish; 11 - ...,,no
8,Mother's qualification,Feature,Integer,Education Level,1 - Secondary Education - 12th Year of Schooli...,,no
9,Father's qualification,Feature,Integer,Education Level,1 - Secondary Education - 12th Year of Schooli...,,no


In [19]:
# Every column of the test frame is used as a model input.
FEATURES = list(test.columns)

# Columns handed to the model as categorical features (see `custom_cv`).
CAT_FEATURES = [
    "Marital status",
    "Application mode",
    "Course",
    "Daytime/evening attendance",
    "Previous qualification",
    "Nacionality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Displaced",
    "Educational special needs",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
    "International",
]

**Target distribution**

In [20]:
# Name of the label column.
TARGET = 'Target'
# Relative class frequencies in the competition training data.
train[TARGET].value_counts(normalize=True)

Target
Graduate    0.4742
Dropout     0.3306
Enrolled    0.1952
Name: proportion, dtype: float64

In [21]:
# Relative class frequencies in the original UCI data, for comparison with train.
original[TARGET].value_counts(normalize=True)

Target
Graduate    0.4993
Dropout     0.3212
Enrolled    0.1795
Name: proportion, dtype: float64

In [22]:
# Integer codes for the three class labels, and the reverse lookup used when
# writing submissions.
target_mapping = {'Graduate': 0, 'Enrolled': 1, 'Dropout': 2}
target_inverse = {v: k for k, v in target_mapping.items()}

# `replace` leaves already-encoded values untouched, so the cell can be re-run.
# The explicit cast guarantees an integer label column instead of relying on
# the implicit object -> int downcast of `replace`, which pandas deprecates;
# it also fails loudly if an unmapped label is present.
train[TARGET] = train[TARGET].replace(target_mapping).astype('int64')
original[TARGET] = original[TARGET].replace(target_mapping).astype('int64')

# Modeling framework

In [23]:
def comp_metric(y_true, y_pred):
    """Score predictions with the metric used throughout this notebook (accuracy).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, in the same order as `y_true`.

    Returns
    -------
    The value returned by `accuracy_score(y_true, y_pred)`.
    """
    score = accuracy_score(y_true, y_pred)
    return score

In [24]:
def custom_cv(features, model, folds=10, extend=False, seed=SEED, verbose=True):
    """Run stratified K-fold CV on `train` and predict `test` with each fold's fit.

    Reads the module-level `train`, `test`, `original`, `TARGET` and
    `CAT_FEATURES`. The same `model` object is re-fitted once per fold.

    Parameters
    ----------
    features : list of str
        Columns used as model inputs.
    model : estimator
        Object whose `fit` accepts `eval_set`, `cat_features` and `verbose`,
        and which exposes `predict` and `best_iteration_`.
    folds : int
        Number of stratified splits.
    extend : bool
        When True, the rows of `original` are appended to every training fold;
        validation folds are never extended.
    seed : int
        Random state of the shuffled splitter.
    verbose : bool
        When True, print the score and best iteration of each fold.

    Returns
    -------
    oof_preds : pd.Series
        Out-of-fold predictions, keyed by row position in `train` and sorted.
    test_preds : pd.DataFrame
        One `fold{i}` column per fold plus a `mode` column with the row-wise
        most frequent prediction.
    """
    X, y = train[features], train[TARGET]
    X_test = test[features]

    # Restrict the categorical list to the columns actually in use.
    cat_feats = [name for name in features if name in CAT_FEATURES]

    oof_by_row = {}
    fold_test_preds = {}

    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (train_ids, val_ids) in enumerate(splitter.split(X, y)):
        X_train, X_val = X.iloc[train_ids], X.iloc[val_ids]
        y_train, y_val = y.iloc[train_ids], y.iloc[val_ids]

        if extend:
            # Original dataset is added to the training part only.
            X_train = pd.concat([X_train, original[features]], axis=0, ignore_index=True)
            y_train = pd.concat([y_train, original[TARGET]], axis=0, ignore_index=True)

        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            cat_features=cat_feats,
            verbose=False,
        )

        val_preds = model.predict(X_val).flatten()
        # Key each validation prediction by its row position in `train`.
        for row, pred in zip(val_ids, val_preds):
            oof_by_row[row] = pred
        fold_test_preds[f'fold{fold}'] = model.predict(X_test).flatten()

        if verbose:
            score = comp_metric(y_val, val_preds)
            print(f'Fold #{fold:>2}: {score:.5f} ({model.best_iteration_:>4} rounds)')

        gc.collect()

    test_preds = pd.DataFrame.from_dict(fold_test_preds)
    # Mode of fold-wise predictions; column 0 of `mode` holds the first modal value.
    test_preds['mode'] = test_preds.mode(axis=1)[0]

    oof_preds = pd.Series(oof_by_row).sort_index()
    print(f'OOF score: {comp_metric(y, oof_preds):.5f}\n')

    return oof_preds, test_preds

In [25]:
def create_submission_files(preds, config, notebook='00'):
    """Write a submission CSV built from the `mode` column of `preds`.

    Parameters
    ----------
    preds : pd.DataFrame
        Must contain a `mode` column of encoded class predictions; values are
        translated back to label names through `target_inverse`.
    config : str
        Run identifier embedded in the output file name.
    notebook : str
        Notebook tag embedded in the output file name.

    The file is written as `nb{notebook}_{config}.csv` without the index,
    starting from a copy of the module-level `sample_sub` template.
    """
    decoded = preds['mode'].replace(target_inverse)
    out_path = f'nb{notebook}_{config}.csv'

    sub = sample_sub.copy()
    sub[TARGET] = decoded
    sub.to_csv(out_path, index=False)

In [26]:
# Shared keyword arguments for every CatBoostClassifier built in this notebook.
BASE_PARAMS = dict(
    loss_function='MultiClass',
    eval_metric='Accuracy',
    metric_period=1,
    iterations=5000,
    learning_rate=0.03,
    early_stopping_rounds=100,
    use_best_model=True,
    task_type=DEVICE,
    thread_count=-1,
)

# NOTE(review): '0:1' looks like a two-device selection in CatBoost's
# id1:id2 format — confirm the runtime really exposes two GPUs.
if DEVICE == 'GPU':
    BASE_PARAMS['devices'] = '0:1'

In [27]:
# Result stores keyed by run config string; re-running this cell clears them.
op = {}  # out-of-fold predictions per config
tp = {}  # test predictions per config

In [28]:
%%time

# Baseline run: competition training data only ('TRN').
dataset = 'TRN'
folds = 10
seed = SEED

# Identifier used as the key in `op`/`tp` and in the submission file name.
config = f'data{dataset}_folds{folds}_seed{seed}'

model = cb.CatBoostClassifier(**BASE_PARAMS, random_seed=seed)

# `seed` is forwarded explicitly so the CV split always matches the seed
# recorded in `config`, even if `seed` is changed away from SEED.
op[config], tp[config] = custom_cv(
    features=FEATURES,
    model=model,
    folds=folds,
    seed=seed)

create_submission_files(tp[config], config)

Fold # 0: 0.82619 ( 510 rounds)
Fold # 1: 0.83926 ( 565 rounds)
Fold # 2: 0.82671 ( 820 rounds)
Fold # 3: 0.82266 ( 150 rounds)
Fold # 4: 0.83089 ( 705 rounds)
Fold # 5: 0.83259 (1050 rounds)
Fold # 6: 0.82972 (1062 rounds)
Fold # 7: 0.83050 ( 301 rounds)
Fold # 8: 0.83440 ( 390 rounds)
Fold # 9: 0.82826 ( 560 rounds)
OOF score: 0.83012

CPU times: user 5min 4s, sys: 1min 36s, total: 6min 41s
Wall time: 2min 26s


In [29]:
%%time

# Extended run ('EXT'): the original UCI rows are appended to each training fold.
dataset = 'EXT'
folds = 10
seed = SEED

# Identifier used as the key in `op`/`tp` and in the submission file name.
config = f'data{dataset}_folds{folds}_seed{seed}'

model = cb.CatBoostClassifier(**BASE_PARAMS, random_seed=seed)

# `seed` is forwarded explicitly so the CV split always matches the seed
# recorded in `config`, even if `seed` is changed away from SEED.
op[config], tp[config] = custom_cv(
    features=FEATURES,
    model=model,
    folds=folds,
    extend=True,
    seed=seed)

create_submission_files(tp[config], config)

Fold # 0: 0.82514 ( 346 rounds)
Fold # 1: 0.83860 ( 417 rounds)
Fold # 2: 0.82684 ( 685 rounds)
Fold # 3: 0.82684 ( 948 rounds)
Fold # 4: 0.82880 ( 368 rounds)
Fold # 5: 0.82815 ( 285 rounds)
Fold # 6: 0.82972 ( 733 rounds)
Fold # 7: 0.82998 ( 219 rounds)
Fold # 8: 0.83518 ( 478 rounds)
Fold # 9: 0.82813 ( 398 rounds)
OOF score: 0.82974

CPU times: user 4min 22s, sys: 1min 19s, total: 5min 41s
Wall time: 2min 6s


In [30]:
# Peek at the first lines of the submission file produced by the 'TRN' run.
!head nb00_dataTRN_folds10_seed2024.csv

id,Target
76518,Dropout
76519,Graduate
76520,Graduate
76521,Enrolled
76522,Enrolled
76523,Graduate
76524,Graduate
76525,Graduate
76526,Dropout
