In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

from catboost import CatBoostClassifier

from tqdm import tqdm

import datetime

# Single random seed; it is passed to both StratifiedKFold and CatBoostClassifier further down.
seed = 42

In [2]:
# Both files share one layout: "ID" is the row index and the birth-date column is parsed as datetime.
read_options = dict(index_col="ID", parse_dates=["Дата_Рождения"])

df = pd.read_csv("./data/students/students_train.csv", **read_options)
df_test = pd.read_csv("./data/students/students_test.csv", **read_options)

In [3]:
# Reference date (1 January 1900) against which the birth date is converted to a day count.
min_date = pd.Timestamp(year=1900, month=1, day=1)

train_birth_dates = df["Дата_Рождения"]
df["year_of_birth"] = train_birth_dates.dt.year
# Whole days elapsed between the reference date and the date of birth.
df["birth"] = train_birth_dates.sub(min_date).dt.days

In [4]:
# Same two birth-date features as for the training frame, reusing the shared min_date.
test_birth_dates = df_test["Дата_Рождения"]
df_test["year_of_birth"] = test_birth_dates.dt.year
df_test["birth"] = test_birth_dates.sub(min_date).dt.days

In [5]:
# Columns of the training frame that are cast to str. With astype(str) a missing
# value turns into the string "nan", so no NaN remains in these columns.
# Each column is listed exactly once (the original cast "Уч_Заведение" twice).
str_columns = [
    "Пособие",
    "Изучаемый_Язык",
    "Уч_Заведение",
    "Где_Находится_УЗ",
    "Страна_ПП",
    "Регион_ПП",
    "Город_ПП",
    "Общежитие",
    "Иностранец",
    "Село",
    "Страна_Родители",
    "Наличие_Матери",
    "Наличие_Отца",
    "Опекунство",
    "Пол",
]
for column in str_columns:
    df[column] = df[column].astype(str)

# The one column imputed numerically in this cell: gaps are filled with the column median.
df["Год_Окончания_УЗ"] = df["Год_Окончания_УЗ"].fillna(df["Год_Окончания_УЗ"].median())

In [6]:
# Columns of the test frame that are cast to str, mirroring the casts applied to
# the training frame. With astype(str) a missing value turns into the string "nan".
# Each column is listed exactly once (the original cast "Уч_Заведение" twice).
str_columns = [
    "Пособие",
    "Изучаемый_Язык",
    "Уч_Заведение",
    "Где_Находится_УЗ",
    "Страна_ПП",
    "Регион_ПП",
    "Город_ПП",
    "Общежитие",
    "Иностранец",
    "Село",
    "Страна_Родители",
    "Наличие_Матери",
    "Наличие_Отца",
    "Опекунство",
    "Пол",
]
for column in str_columns:
    df_test[column] = df_test[column].astype(str)

# Gaps in the test frame are filled with the median computed on the training
# frame `df`, not on df_test itself.
df_test["Год_Окончания_УЗ"] = df_test["Год_Окончания_УЗ"].fillna(df["Год_Окончания_УЗ"].median())

In [7]:
# Sanity check: number of missing values in each column of the training frame.
df.isna().sum()

Код_группы          0
Год_Поступления     0
Пол                 0
Основания           0
Изучаемый_Язык      0
Дата_Рождения       0
Уч_Заведение        0
Где_Находится_УЗ    0
Год_Окончания_УЗ    0
Пособие             0
Страна_ПП           0
Регион_ПП           0
Город_ПП            0
Общежитие           0
Наличие_Матери      0
Наличие_Отца        0
Страна_Родители     0
Опекунство          0
Село                0
Иностранец          0
КодФакультета       0
СрБаллАттестата     0
Статус              0
year_of_birth       0
birth               0
dtype: int64

In [8]:
# Columns removed from both frames before the feature matrices are built.
columns_to_drop = [
    "Дата_Рождения",
]

In [9]:
# Split the training frame into the target vector and the feature matrix.
target_column = "Статус"

y = df[target_column]
X = df.drop(columns=[target_column, *columns_to_drop])

In [10]:
# Build the test feature matrix from exactly the training feature columns, in the
# same order as X, so fitting and prediction use the same column layout.
# A feature missing from the test frame raises KeyError here, at construction time.
X_test = df_test.drop(columns=columns_to_drop)[X.columns]

In [11]:
# Every feature column whose dtype compares equal to "object" is treated as categorical.
cat_features = [
    col_name
    for col_name, dtype in X.dtypes.to_dict().items()
    if dtype == "object"
]

cat_features

['Пол',
 'Основания',
 'Изучаемый_Язык',
 'Уч_Заведение',
 'Где_Находится_УЗ',
 'Пособие',
 'Страна_ПП',
 'Регион_ПП',
 'Город_ПП',
 'Общежитие',
 'Наличие_Матери',
 'Наличие_Отца',
 'Страна_Родители',
 'Опекунство',
 'Село',
 'Иностранец']

In [12]:
# Stratified K-fold cross-validation. Each fold trains a fresh CatBoost model,
# scores it with F1 on the held-out part and writes its positive-class
# probabilities for the test set into one column of `test_preds`.
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
val_scores = list()
# Shape (n_test_rows, n_splits): one column of probabilities per fold.
test_preds = np.zeros((X_test.shape[0], n_splits))

# Hyper-parameters are identical for every fold, so they are defined once.
catboost_params = dict(
    iterations=1500,
    cat_features=cat_features,
    eval_metric="F1",
    learning_rate=0.07,
    verbose=250,
    early_stopping_rounds=200,
    use_best_model=True,
    random_seed=seed,
)

# skf.split() returns a generator without a length, so tqdm is given the number
# of folds explicitly; otherwise it can only show a running "Nit" counter.
for i, (train_index, val_index) in enumerate(tqdm(skf.split(X, y), total=n_splits)):
    print(f"=== FOLD {i + 1} ===")

    X_train = X.iloc[train_index]
    X_val = X.iloc[val_index]
    y_train = y.iloc[train_index]
    y_val = y.iloc[val_index]

    model = CatBoostClassifier(**catboost_params)
    # NOTE(review): the validation fold is also the eval_set that drives early
    # stopping / best-iteration selection, so the F1 reported below is measured
    # on data the stopping rule has already seen and is likely optimistic.
    model.fit(X_train, y_train, eval_set=(X_val, y_val))

    preds = model.predict(X_val)
    # Column 1 of predict_proba is the probability of the second class.
    test_preds[:, i] = model.predict_proba(X_test)[:, 1]

    val_scores.append(f1_score(y_val, preds))

# Mean validation F1 over all folds (displayed as the cell output).
sum(val_scores) / len(val_scores)

0it [00:00, ?it/s]

=== FOLD 1 ===
0:	learn: 0.7346130	test: 0.7375691	best: 0.7375691 (0)	total: 61.5ms	remaining: 1m 32s
250:	learn: 0.8881522	test: 0.8727273	best: 0.8741894 (243)	total: 2.05s	remaining: 10.2s


1it [00:04,  4.16s/it]

Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8771021992
bestIteration = 283

Shrink model to first 284 iterations.
=== FOLD 2 ===
0:	learn: 0.7427386	test: 0.7074468	best: 0.7074468 (0)	total: 9.65ms	remaining: 14.5s
250:	learn: 0.8859080	test: 0.8541935	best: 0.8575130 (227)	total: 1.99s	remaining: 9.9s
500:	learn: 0.9088838	test: 0.8700129	best: 0.8703466 (476)	total: 4s	remaining: 7.99s


2it [00:10,  5.16s/it]

Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8729139923
bestIteration = 508

Shrink model to first 509 iterations.
=== FOLD 3 ===
0:	learn: 0.7048887	test: 0.7147059	best: 0.7147059 (0)	total: 8.07ms	remaining: 12.1s
250:	learn: 0.8897547	test: 0.8678756	best: 0.8690013 (236)	total: 1.99s	remaining: 9.89s
500:	learn: 0.9109501	test: 0.8769231	best: 0.8797954 (488)	total: 4.03s	remaining: 8.04s


3it [00:15,  5.43s/it]

Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8797953964
bestIteration = 488

Shrink model to first 489 iterations.
=== FOLD 4 ===
0:	learn: 0.7299158	test: 0.7359781	best: 0.7359781 (0)	total: 9.71ms	remaining: 14.6s
250:	learn: 0.8825479	test: 0.8601864	best: 0.8632138 (234)	total: 2.02s	remaining: 10.1s


4it [00:19,  4.93s/it]

Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8688741722
bestIteration = 289

Shrink model to first 290 iterations.
=== FOLD 5 ===
0:	learn: 0.7071197	test: 0.7121662	best: 0.7121662 (0)	total: 8.65ms	remaining: 13s
250:	learn: 0.8847926	test: 0.8766404	best: 0.8781127 (243)	total: 1.98s	remaining: 9.87s
500:	learn: 0.9114605	test: 0.8894668	best: 0.8894668 (500)	total: 4.06s	remaining: 8.1s
750:	learn: 0.9247856	test: 0.8860104	best: 0.8923476 (579)	total: 6.21s	remaining: 6.2s


5it [00:26,  5.54s/it]

Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8923476005
bestIteration = 579

Shrink model to first 580 iterations.
=== FOLD 6 ===
0:	learn: 0.7003865	test: 0.7087087	best: 0.7087087 (0)	total: 8.27ms	remaining: 12.4s
250:	learn: 0.8890484	test: 0.8552972	best: 0.8567742 (224)	total: 2.03s	remaining: 10.1s
500:	learn: 0.9128031	test: 0.8633461	best: 0.8652118 (414)	total: 4.07s	remaining: 8.12s
750:	learn: 0.9304543	test: 0.8698980	best: 0.8730964 (657)	total: 6.28s	remaining: 6.26s
1000:	learn: 0.9403567	test: 0.8727735	best: 0.8753181 (864)	total: 8.44s	remaining: 4.21s


6it [00:35,  6.79s/it]

Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8753180662
bestIteration = 864

Shrink model to first 865 iterations.
=== FOLD 7 ===
0:	learn: 0.7011609	test: 0.7161572	best: 0.7161572 (0)	total: 9.26ms	remaining: 13.9s
250:	learn: 0.8873543	test: 0.8567708	best: 0.8590078 (219)	total: 1.97s	remaining: 9.78s
500:	learn: 0.9131113	test: 0.8729140	best: 0.8729140 (473)	total: 4.1s	remaining: 8.17s
750:	learn: 0.9262330	test: 0.8743590	best: 0.8769231 (707)	total: 6.25s	remaining: 6.23s
1000:	learn: 0.9392250	test: 0.8772379	best: 0.8783611 (974)	total: 8.43s	remaining: 4.2s
1250:	learn: 0.9493355	test: 0.8772379	best: 0.8794872 (1149)	total: 10.7s	remaining: 2.13s


7it [00:47,  8.43s/it]

Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8794871795
bestIteration = 1149

Shrink model to first 1150 iterations.
=== FOLD 8 ===
0:	learn: 0.7125220	test: 0.6960352	best: 0.6960352 (0)	total: 8.33ms	remaining: 12.5s
250:	learn: 0.8877082	test: 0.8525896	best: 0.8537234 (238)	total: 2.01s	remaining: 10s
500:	learn: 0.9116979	test: 0.8650066	best: 0.8661417 (472)	total: 4.26s	remaining: 8.5s
750:	learn: 0.9264727	test: 0.8660598	best: 0.8697917 (597)	total: 6.4s	remaining: 6.38s


8it [00:54,  7.96s/it]

Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8697916667
bestIteration = 597

Shrink model to first 598 iterations.
=== FOLD 9 ===
0:	learn: 0.7040131	test: 0.7099567	best: 0.7099567 (0)	total: 8.47ms	remaining: 12.7s
250:	learn: 0.8867137	test: 0.8717949	best: 0.8754814 (228)	total: 1.97s	remaining: 9.81s
500:	learn: 0.9106383	test: 0.8812261	best: 0.8852041 (453)	total: 4.05s	remaining: 8.07s
750:	learn: 0.9233803	test: 0.8948035	best: 0.8959391 (709)	total: 6.15s	remaining: 6.13s


9it [01:02,  7.88s/it]

Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8959390863
bestIteration = 709

Shrink model to first 710 iterations.
=== FOLD 10 ===
0:	learn: 0.7343606	test: 0.7341772	best: 0.7341772 (0)	total: 7.58ms	remaining: 11.4s
250:	learn: 0.8862655	test: 0.8609272	best: 0.8635762 (223)	total: 1.98s	remaining: 9.84s
500:	learn: 0.9103586	test: 0.8699080	best: 0.8740157 (350)	total: 4.01s	remaining: 8.01s


10it [01:06,  6.68s/it]

Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.874015748
bestIteration = 350

Shrink model to first 351 iterations.





0.8785585107282816

0.8784538098941148

In [13]:
# Collapse the per-fold columns into one averaged value per test row.
test_preds = np.mean(test_preds, axis=1)

In [14]:
# Values strictly above the threshold become 1, everything else becomes 0.
decision_threshold = 0.5
test_preds = np.greater(test_preds, decision_threshold).astype(int)

In [15]:
# Load the submission template, overwrite its target column with the predictions
# (assigned positionally) and save the result.
submission_template_path = "./data/students/students_sample_submission.csv"
submission_output_path = "./submissions/students/sub1.csv"

sample_submission = pd.read_csv(submission_template_path, index_col="ID")
sample_submission["Статус"] = test_preds
sample_submission.to_csv(submission_output_path)