## Baseline

In [53]:
import pandas as pd
import numpy as np

In [54]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

train_df.head(10)

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,0,55,165,60,81.0,0.5,0.6,1,1,135,...,40,75,16.5,1,1.0,22,25,27,0,1
1,1,70,165,65,89.0,0.6,0.7,2,2,146,...,57,126,16.2,1,1.1,27,23,37,1,0
2,2,20,170,75,81.0,0.4,0.5,1,1,118,...,45,93,17.4,1,0.8,27,31,53,0,1
3,3,35,180,95,105.0,1.5,1.2,1,1,131,...,38,102,15.9,1,1.0,20,27,30,1,0
4,4,30,165,60,80.5,1.5,1.0,1,1,121,...,44,93,15.4,1,0.8,19,13,17,0,1
5,5,50,170,55,51.0,1.2,1.2,1,1,146,...,31,99,15.9,1,0.7,24,42,119,1,1
6,6,45,160,55,69.0,1.5,1.2,1,1,150,...,69,122,13.0,1,0.7,17,12,16,0,0
7,7,55,155,60,84.5,0.7,0.9,1,1,137,...,51,198,14.5,1,0.7,16,15,16,0,0
8,8,40,165,70,89.0,0.7,1.0,1,1,130,...,59,150,15.7,1,0.9,24,21,31,0,1
9,9,40,155,50,73.0,1.5,1.5,1,1,105,...,55,122,13.2,1,0.7,22,16,14,0,0


In [55]:
target_col = "smoking"

target_col_vals = train_df[target_col]

test_ids_col = test_df["id"]

In [56]:
train_df = train_df.drop("id", axis=1)
test_df = test_df.drop("id", axis=1)

train_df = train_df.drop(target_col, axis=1)


In [57]:
categoric_cols = ["Urine protein", "hearing(left)", "hearing(right)"]
numeric_cols = train_df.select_dtypes(include=[np.number]).columns.difference(categoric_cols)

### 1. Заполнение пропусков

In [58]:
print("Количество строк: ", train_df.shape[0])
print("Количество непустых значений в каждом столбце:")
print(train_df.count())

Количество строк:  159256
Количество непустых значений в каждом столбце:
age                    159256
height(cm)             159256
weight(kg)             159256
waist(cm)              159256
eyesight(left)         159256
eyesight(right)        159256
hearing(left)          159256
hearing(right)         159256
systolic               159256
relaxation             159256
fasting blood sugar    159256
Cholesterol            159256
triglyceride           159256
HDL                    159256
LDL                    159256
hemoglobin             159256
Urine protein          159256
serum creatinine       159256
AST                    159256
ALT                    159256
Gtp                    159256
dental caries          159256
dtype: int64


In [59]:
print("Количество строк: ", test_df.shape[0])
print("Количество непустых значений в каждом столбце:")
print(test_df.count())

Количество строк:  106171
Количество непустых значений в каждом столбце:
age                    106171
height(cm)             106171
weight(kg)             106171
waist(cm)              106171
eyesight(left)         106171
eyesight(right)        106171
hearing(left)          106171
hearing(right)         106171
systolic               106171
relaxation             106171
fasting blood sugar    106171
Cholesterol            106171
triglyceride           106171
HDL                    106171
LDL                    106171
hemoglobin             106171
Urine protein          106171
serum creatinine       106171
AST                    106171
ALT                    106171
Gtp                    106171
dental caries          106171
dtype: int64


Метод `count` выводит только непустые значения, и, так-как эти числа в обоих датасетах совпадают с количеством строк, следует, что пропусков в датасетах нет.

### 2. Обработка выбросов

Было решено для детекции использовать метод межквартального размаха, а обнаруженные выбросы заменить на медиану.

In [60]:
def detect_outliers_and_change_to_median(df, features):
    for feature in features:
        Q1 = df[feature].quantile(0.25)
        Q3 = df[feature].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        outliers_mask = (df[feature] < lower_bound) | (df[feature] > upper_bound)

        median = df[feature].median()
        df[feature] = df[feature].apply(lambda x: median if x < lower_bound or x > upper_bound else x)

        print(f"Количество выбросов в столбце '{feature}': {outliers_mask.sum()}")
        
    return df

train_df = detect_outliers_and_change_to_median(train_df, numeric_cols)
test_df = detect_outliers_and_change_to_median(test_df, numeric_cols)

Количество выбросов в столбце 'ALT': 6746
Количество выбросов в столбце 'AST': 6337
Количество выбросов в столбце 'Cholesterol': 430
Количество выбросов в столбце 'Gtp': 8792
Количество выбросов в столбце 'HDL': 1932
Количество выбросов в столбце 'LDL': 495
Количество выбросов в столбце 'age': 682
Количество выбросов в столбце 'dental caries': 31532
Количество выбросов в столбце 'eyesight(left)': 2324
Количество выбросов в столбце 'eyesight(right)': 2493
Количество выбросов в столбце 'fasting blood sugar': 8160
Количество выбросов в столбце 'height(cm)': 463
Количество выбросов в столбце 'hemoglobin': 1493
Количество выбросов в столбце 'relaxation': 981
Количество выбросов в столбце 'serum creatinine': 5943
Количество выбросов в столбце 'systolic': 1592
Количество выбросов в столбце 'triglyceride': 3087
Количество выбросов в столбце 'waist(cm)': 858
Количество выбросов в столбце 'weight(kg)': 2296
Количество выбросов в столбце 'ALT': 3864
Количество выбросов в столбце 'AST': 4272
Колич

### 3. Генерация признаков

Объединим рост и вес в один признак - индекс массы тела `BMI` 

In [61]:
train_df['BMI'] = train_df['weight(kg)'] / ((train_df['height(cm)']/100) ** 2)
test_df['BMI'] = test_df['weight(kg)'] / ((test_df['height(cm)']/100) ** 2)

numeric_cols = numeric_cols.append(pd.Index(['BMI']))

train_df.head()

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,BMI
0,55.0,165.0,60.0,81.0,0.5,0.6,1,1,135.0,87.0,...,40.0,75.0,16.5,1,1.0,22.0,25.0,27.0,0.0,22.038567
1,70.0,165.0,65.0,89.0,0.6,0.7,2,2,146.0,83.0,...,57.0,126.0,16.2,1,1.1,27.0,23.0,37.0,0.0,23.875115
2,20.0,170.0,75.0,81.0,0.4,0.5,1,1,118.0,75.0,...,45.0,93.0,17.4,1,0.8,27.0,31.0,53.0,0.0,25.951557
3,35.0,180.0,95.0,105.0,1.5,1.2,1,1,131.0,88.0,...,38.0,102.0,15.9,1,1.0,20.0,27.0,30.0,0.0,29.320988
4,30.0,165.0,60.0,80.5,1.5,1.0,1,1,121.0,76.0,...,44.0,93.0,15.4,1,0.8,19.0,13.0,17.0,0.0,22.038567


### 4. Масштабирование

In [62]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_df[numeric_cols] = scaler.fit_transform(train_df[numeric_cols])
test_df[numeric_cols] = scaler.transform(test_df[numeric_cols])

train_df.head()

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,BMI
0,0.936081,-0.037424,-0.560828,-0.21534,-1.724797,-1.382283,1,1,1.05958,1.175633,...,-1.166219,-1.542069,1.237202,1,0.69991,-0.379508,0.064601,-0.222814,0.0,-0.692192
1,2.228313,-0.037424,-0.137391,0.701246,-1.382271,-1.03743,2,2,1.967077,0.716709,...,0.128186,0.457422,1.012941,1,1.347447,0.422286,-0.121778,0.389268,0.0,-0.122192
2,-2.079127,0.536465,0.709483,-0.21534,-2.067324,-1.727136,1,1,-0.342916,-0.201139,...,-0.785511,-0.836366,1.909983,1,-0.595162,0.422286,0.623741,1.3686,0.0,0.522264
3,-0.786895,1.684242,2.403229,2.53442,1.700463,0.686838,1,1,0.729581,1.290364,...,-1.318502,-0.483515,0.788681,1,0.69991,-0.700225,0.250981,-0.039189,0.0,1.568017
4,-1.217639,-0.037424,-0.560828,-0.272627,1.700463,-0.002869,1,1,-0.095416,-0.086408,...,-0.861653,-0.836366,0.414914,1,-0.595162,-0.860584,-1.053678,-0.834897,0.0,-0.692192


### 5. Кодирование категоральных признаков

In [63]:
def one_hot_encode(df, features):
    df_encoded = pd.get_dummies(df, columns=features, drop_first=True)
    return df_encoded

train_df = one_hot_encode(train_df, categoric_cols)
test_df = one_hot_encode(test_df, categoric_cols)

train_df.head()

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),systolic,relaxation,fasting blood sugar,Cholesterol,...,Gtp,dental caries,BMI,Urine protein_2,Urine protein_3,Urine protein_4,Urine protein_5,Urine protein_6,hearing(left)_2,hearing(right)_2
0,0.936081,-0.037424,-0.560828,-0.21534,-1.724797,-1.382283,1.05958,1.175633,-0.22183,-0.845966,...,-0.222814,0.0,-0.692192,False,False,False,False,False,False,False
1,2.228313,-0.037424,-0.137391,0.701246,-1.382271,-1.03743,1.967077,0.716709,-0.01039,-0.05857,...,0.389268,0.0,-0.122192,False,False,False,False,False,True,True
2,-2.079127,0.536465,0.709483,-0.21534,-2.067324,-1.727136,-0.342916,-0.201139,-1.807627,-0.631222,...,1.3686,0.0,0.522264,False,False,False,False,False,False,False
3,-0.786895,1.684242,2.403229,2.53442,1.700463,0.686838,0.729581,1.290364,-0.538989,-0.55964,...,-0.039189,0.0,1.568017,False,False,False,False,False,False,False
4,-1.217639,-0.037424,-0.560828,-0.272627,1.700463,-0.002869,-0.095416,-0.086408,-0.538989,-1.454409,...,-0.834897,0.0,-0.692192,False,False,False,False,False,False,False


### 6. Обучение модели

In [None]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold

In [68]:
X = train_df
y = target_col_vals

def objective(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 100, 2000),
        "depth": trial.suggest_int("depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
        "random_strength": trial.suggest_float("random_strength", 1e-3, 10.0, log=True),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "verbose": 0,
        "eval_metric": "Accuracy",
        "task_type": "CPU",
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    for train_idx, valid_idx in cv.split(X, y):
        X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
        y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

        train_pool = Pool(X_train_fold, y_train_fold)
        valid_pool = Pool(X_valid_fold, y_valid_fold)

        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=50, use_best_model=True)
        
        preds = model.predict(X_valid_fold)
        acc = np.mean(preds == y_valid_fold)
        scores.append(acc)

    return np.mean(scores)

In [69]:
import optuna

In [71]:
study = optuna.create_study(direction="maximize", study_name="baseline_smoking_prediction")
study.optimize(objective, n_trials=3, show_progress_bar=True)

best_trial = study.best_trial
print(f"Best score: {best_trial.value}")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

[I 2024-12-09 22:44:17,554] A new study created in memory with name: baseline_smoking_prediction
Best trial: 0. Best value: 0.77248:  33%|███▎      | 1/3 [03:55<07:51, 235.53s/it]

[I 2024-12-09 22:48:13,078] Trial 0 finished with value: 0.7724795503195245 and parameters: {'iterations': 1699, 'depth': 10, 'learning_rate': 0.014756045377155469, 'l2_leaf_reg': 7.264906420990169, 'random_strength': 8.449462783843709, 'border_count': 249}. Best is trial 0 with value: 0.7724795503195245.


Best trial: 0. Best value: 0.77248:  67%|██████▋   | 2/3 [04:41<02:04, 124.18s/it]

[I 2024-12-09 22:48:59,318] Trial 1 finished with value: 0.7569260338799768 and parameters: {'iterations': 1124, 'depth': 3, 'learning_rate': 0.008459994843350253, 'l2_leaf_reg': 2.268933607030423, 'random_strength': 1.2794073079261121, 'border_count': 67}. Best is trial 0 with value: 0.7724795503195245.


Best trial: 0. Best value: 0.77248: 100%|██████████| 3/3 [05:09<00:00, 103.24s/it]

[I 2024-12-09 22:49:27,260] Trial 2 finished with value: 0.7609886048631888 and parameters: {'iterations': 190, 'depth': 7, 'learning_rate': 0.00624432862897611, 'l2_leaf_reg': 5.623042725999702, 'random_strength': 0.0020388413130804574, 'border_count': 106}. Best is trial 0 with value: 0.7724795503195245.
Best score: 0.7724795503195245
    iterations: 1699
    depth: 10
    learning_rate: 0.014756045377155469
    l2_leaf_reg: 7.264906420990169
    random_strength: 8.449462783843709
    border_count: 249





In [74]:
final_model = CatBoostClassifier(**best_trial.params, verbose=1)
train_pool = Pool(X, y)
final_model.fit(train_pool)

X_test = test_df

test_preds = final_model.predict(X_test)

test_preds = test_preds.astype(int)

submission = pd.DataFrame({
    "id": test_ids_col,
    "smoking": test_preds
})
submission.to_csv("data/submissions.csv", index=False)

print("Submission file created: data/submissions.csv")

0:	learn: 0.6856515	total: 71.7ms	remaining: 2m 1s
1:	learn: 0.6786073	total: 147ms	remaining: 2m 4s
2:	learn: 0.6716087	total: 220ms	remaining: 2m 4s
3:	learn: 0.6661571	total: 247ms	remaining: 1m 44s
4:	learn: 0.6599249	total: 328ms	remaining: 1m 51s
5:	learn: 0.6540012	total: 401ms	remaining: 1m 53s
6:	learn: 0.6483231	total: 469ms	remaining: 1m 53s
7:	learn: 0.6431753	total: 544ms	remaining: 1m 55s
8:	learn: 0.6378188	total: 626ms	remaining: 1m 57s
9:	learn: 0.6328892	total: 712ms	remaining: 2m
10:	learn: 0.6279070	total: 813ms	remaining: 2m 4s
11:	learn: 0.6233105	total: 925ms	remaining: 2m 10s
12:	learn: 0.6181367	total: 1.01s	remaining: 2m 11s
13:	learn: 0.6135306	total: 1.1s	remaining: 2m 12s
14:	learn: 0.6093177	total: 1.18s	remaining: 2m 12s
15:	learn: 0.6055315	total: 1.25s	remaining: 2m 12s
16:	learn: 0.6017954	total: 1.34s	remaining: 2m 12s
17:	learn: 0.5982933	total: 1.41s	remaining: 2m 11s
18:	learn: 0.5947429	total: 1.49s	remaining: 2m 11s
19:	learn: 0.5912710	total: 1.