In [1]:
import os
import gc
import optuna
import warnings
import numpy as np
from utils import *
import pandas as pd
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
warnings.filterwarnings('ignore')
%run skills.ipynb
%run languages.ipynb
%run work_experiences.ipynb
%run education.ipynb

class config:
    train_path = '../../../datasets/garanti-bbva-data-camp/train_users.csv'
    test_path = '../../../datasets/garanti-bbva-data-camp/test_users.csv'
    sub_path = '../../../datasets/garanti-bbva-data-camp/submission.csv'
    skills_path = '../../../datasets/garanti-bbva-data-camp/skills.csv'
    languages_path = '../../../datasets/garanti-bbva-data-camp/languages.csv'
    education_path = '../../../datasets/garanti-bbva-data-camp/education.csv'
    exp_path = '../../../datasets/garanti-bbva-data-camp/work_experiences.csv'
    seed = 42
    n_folds = 6

In [2]:
train_df = pd.read_csv(config.train_path)
test_df = pd.read_csv(config.test_path)
sub = pd.read_csv(config.sub_path)

print(f'train_df shape: {train_df.shape}')
print(f'test_df shape: {test_df.shape}')
df = train_df.append(test_df).reset_index(drop = True)
df.head()

train_df shape: (53019, 4)
test_df shape: (13255, 3)


Unnamed: 0,user_id,industry,location,moved_after_2019
0,1301,Information Technology and Services,"Istanbul, Istanbul, Turkey",1.0
1,6950,Internet,"Istanbul, Istanbul, Turkey",0.0
2,4880,Online Media,Turkey,0.0
3,26046,Telecommunications,"Istanbul, Istanbul, Turkey",0.0
4,11005,Banking,"Istanbul, Turkey",0.0


In [3]:
def fix_location(dataframe: pd.DataFrame) -> pd.DataFrame:

    df_ = dataframe.copy()
    df_.loc[df_['location'].astype(str).str.contains('Kahraman Maras'), 'location'] = 'Kahramanmaras, Turkey'
    df_.loc[df_['location'].astype(str).str.contains('Şanliurfa'), 'location'] = 'Sanliurfa, Turkey'
    df_.loc[df_['location'].astype(str).str.contains('İçel'), 'location'] = 'Mersin, Turkey'
    df_.loc[df_['location'].astype(str).str.contains('Afyon'), 'location'] = 'Afyonkarahisar, Turkey'
    df_['location'] = df_['location'].apply(lambda x: str(x).replace('Türkiye', 'Turkey'))
    df_['location'] = df_['location'].apply(lambda x: x.upper().strip())
    df_['location'] = df_['location'].apply(lambda x: translation(str(x)))
    tr_cities = load_tr_cities()
    for city in tr_cities:
        df_['location'] = df_['location'].apply(lambda x: city if city in x else x)
        df_['based_on_tr'] = df_['location'].apply(lambda x: 1 if x in tr_cities or x == 'TURKEY' else 0)

    return df_

df = fix_location(df)
df.head()

Unnamed: 0,user_id,industry,location,moved_after_2019,based_on_tr
0,1301,Information Technology and Services,ISTANBUL,1.0,1
1,6950,Internet,ISTANBUL,0.0,1
2,4880,Online Media,TURKEY,0.0,1
3,26046,Telecommunications,ISTANBUL,0.0,1
4,11005,Banking,ISTANBUL,0.0,1


In [4]:
def objective(trial):

    df_ = df.copy()
    skill_size = trial.suggest_int('skill_size', 10, 60)
    lang_size = trial.suggest_int('lang_size', 4, 20)
    study_size = trial.suggest_int('study_size', 4, 40)
    degree_size = trial.suggest_int('degree_size', 4, 20)

    skills_df = load_skills(config.skills_path, skill_size)
    lang_df = load_languages(config.languages_path, lang_size)
    edu_df = load_education(config.education_path, study_size, degree_size)
    exp_df = load_work_experiences(config.exp_path)

    df_ = df_.merge(skills_df, on = ['user_id'], how = 'left')
    df_ = df_.merge(lang_df, on = ['user_id'], how = 'left')
    df_ = df_.merge(edu_df, on = ['user_id'], how = 'left')
    df_ = df_.merge(exp_df, on = ['user_id'], how = 'left')
    df_['nunique_company_by_industries'] = df_.groupby(by = 'industry')['company_id'].transform('nunique')

    target = 'moved_after_2019'
    cat_features = ['industry', 'location', 'company_id']
    drop_features = ['user_id']

    for caterogical_col in cat_features:
        df_[caterogical_col] = df_[caterogical_col].astype(str).astype("category")

    train_set = df_.loc[df_[target].notnull()].drop(columns=drop_features, axis = 1)
    test_set = df_.loc[df_[target].isnull()].drop(columns=drop_features, axis = 1)

    X = train_set.drop(columns = [target], axis = 1)
    y = train_set[target]

    params = {
        "one_hot_max_size": trial.suggest_int("one_hot_max_size", 100, 220),
        "depth": trial.suggest_int("depth", 5, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
    }

    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    params['eval_metric'] = 'Accuracy'
    params['cat_features'] = cat_features
    params['random_state'] = config.seed
    params['allow_writing_files'] = False
    params["iterations"] = 5000
    kf = KFold(n_splits=config.n_folds)

    scores = list()
    for idx, (train_ind, val_ind) in enumerate(kf.split(X, y)):
        print(f'fold: {idx+1}')
        model = CatBoostClassifier(**params)
        X_train = X.iloc[train_ind]
        y_train = y.iloc[train_ind]
        X_val = X.iloc[val_ind]
        y_val = y.iloc[val_ind]

        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            early_stopping_rounds=400,
            verbose=False,
        )

        val_pred = model.predict(X_val)
        scores.append(accuracy_score(y_val, val_pred))
        del X_train, y_train, X_val, y_val
        gc.collect()

    return np.mean(scores)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=40)

[32m[I 2023-02-08 03:07:32,231][0m A new study created in memory with name: no-name-ff7ba42e-4ceb-4971-8be3-0af0339461f9[0m


Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 320842


100%|██████████| 52/52 [00:12<00:00,  4.31it/s]
100%|██████████| 7/7 [00:00<00:00,  7.86it/s]
100%|██████████| 20/20 [00:04<00:00,  4.43it/s]
100%|██████████| 13/13 [00:03<00:00,  4.17it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[32m[I 2023-02-08 04:31:31,956][0m Trial 0 finished with value: 0.7672720825724194 and parameters: {'skill_size': 52, 'lang_size': 7, 'study_size': 20, 'degree_size': 13, 'one_hot_max_size': 188, 'depth': 11, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.6217375982247557}. Best is trial 0 with value: 0.7672720825724194.[0m


Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 320842


100%|██████████| 44/44 [00:10<00:00,  4.26it/s]
100%|██████████| 17/17 [00:02<00:00,  7.71it/s]
100%|██████████| 16/16 [00:03<00:00,  4.39it/s]
100%|██████████| 15/15 [00:03<00:00,  4.41it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[32m[I 2023-02-08 04:42:27,033][0m Trial 1 finished with value: 0.751108167234288 and parameters: {'skill_size': 44, 'lang_size': 17, 'study_size': 16, 'degree_size': 15, 'one_hot_max_size': 189, 'depth': 7, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 9.848732673249671}. Best is trial 0 with value: 0.7672720825724194.[0m


Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 320842


100%|██████████| 35/35 [00:08<00:00,  4.27it/s]
100%|██████████| 9/9 [00:01<00:00,  7.70it/s]
100%|██████████| 26/26 [00:05<00:00,  4.39it/s]
100%|██████████| 8/8 [00:01<00:00,  4.40it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[32m[I 2023-02-08 04:47:25,746][0m Trial 2 finished with value: 0.7380185030090519 and parameters: {'skill_size': 35, 'lang_size': 9, 'study_size': 26, 'degree_size': 8, 'one_hot_max_size': 125, 'depth': 5, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.15452537310241796}. Best is trial 0 with value: 0.7672720825724194.[0m


Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 320842


100%|██████████| 50/50 [00:11<00:00,  4.27it/s]
100%|██████████| 20/20 [00:02<00:00,  7.60it/s]
100%|██████████| 15/15 [00:03<00:00,  4.38it/s]
100%|██████████| 11/11 [00:02<00:00,  4.41it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[32m[I 2023-02-08 05:09:28,846][0m Trial 3 finished with value: 0.7558233384644065 and parameters: {'skill_size': 50, 'lang_size': 20, 'study_size': 15, 'degree_size': 11, 'one_hot_max_size': 122, 'depth': 9, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.4939542414842102}. Best is trial 0 with value: 0.7672720825724194.[0m


Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 320842


100%|██████████| 30/30 [00:07<00:00,  4.26it/s]
100%|██████████| 14/14 [00:01<00:00,  7.69it/s]
100%|██████████| 35/35 [00:07<00:00,  4.38it/s]
100%|██████████| 9/9 [00:02<00:00,  4.37it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[32m[I 2023-02-08 05:14:41,817][0m Trial 4 finished with value: 0.7370751993940726 and parameters: {'skill_size': 30, 'lang_size': 14, 'study_size': 35, 'degree_size': 9, 'one_hot_max_size': 202, 'depth': 5, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.7672720825724194.[0m


Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 320842


100%|██████████| 60/60 [00:14<00:00,  4.26it/s]
100%|██████████| 14/14 [00:01<00:00,  7.74it/s]
100%|██████████| 21/21 [00:04<00:00,  4.39it/s]
100%|██████████| 15/15 [00:03<00:00,  4.40it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[32m[I 2023-02-08 05:51:51,367][0m Trial 5 finished with value: 0.7815689060558735 and parameters: {'skill_size': 60, 'lang_size': 14, 'study_size': 21, 'degree_size': 15, 'one_hot_max_size': 156, 'depth': 12, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 5 with value: 0.7815689060558735.[0m


Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 320842


100%|██████████| 33/33 [00:07<00:00,  4.21it/s]
100%|██████████| 14/14 [00:01<00:00,  7.69it/s]
100%|██████████| 19/19 [00:04<00:00,  4.39it/s]
100%|██████████| 12/12 [00:02<00:00,  4.40it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[32m[I 2023-02-08 06:12:54,608][0m Trial 6 finished with value: 0.7387729111103449 and parameters: {'skill_size': 33, 'lang_size': 14, 'study_size': 19, 'degree_size': 12, 'one_hot_max_size': 135, 'depth': 8, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.964156136701722}. Best is trial 5 with value: 0.7815689060558735.[0m


Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 320842


100%|██████████| 19/19 [00:04<00:00,  4.25it/s]
100%|██████████| 15/15 [00:01<00:00,  7.70it/s]
100%|██████████| 29/29 [00:06<00:00,  4.40it/s]
100%|██████████| 12/12 [00:02<00:00,  4.41it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[32m[I 2023-02-08 06:43:24,728][0m Trial 7 finished with value: 0.775665301192315 and parameters: {'skill_size': 19, 'lang_size': 15, 'study_size': 29, 'degree_size': 12, 'one_hot_max_size': 134, 'depth': 12, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.43800154177783046}. Best is trial 5 with value: 0.7815689060558735.[0m


Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 320842


100%|██████████| 10/10 [00:02<00:00,  4.26it/s]
100%|██████████| 9/9 [00:01<00:00,  7.78it/s]
100%|██████████| 13/13 [00:02<00:00,  4.40it/s]
100%|██████████| 17/17 [00:03<00:00,  4.41it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[32m[I 2023-02-08 06:52:36,558][0m Trial 8 finished with value: 0.7447707289400562 and parameters: {'skill_size': 10, 'lang_size': 9, 'study_size': 13, 'degree_size': 17, 'one_hot_max_size': 184, 'depth': 7, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.6523789504611583}. Best is trial 5 with value: 0.7815689060558735.[0m


Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 320842


100%|██████████| 49/49 [00:11<00:00,  4.28it/s]
100%|██████████| 10/10 [00:01<00:00,  7.64it/s]
100%|██████████| 29/29 [00:06<00:00,  4.37it/s]
100%|██████████| 6/6 [00:01<00:00,  4.41it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[32m[I 2023-02-08 07:14:13,056][0m Trial 9 finished with value: 0.7689884003836633 and parameters: {'skill_size': 49, 'lang_size': 10, 'study_size': 29, 'degree_size': 6, 'one_hot_max_size': 156, 'depth': 11, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 9.591776688064094}. Best is trial 5 with value: 0.7815689060558735.[0m


Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 320842


100%|██████████| 60/60 [00:14<00:00,  4.26it/s]
100%|██████████| 5/5 [00:00<00:00,  7.68it/s]
100%|██████████| 4/4 [00:00<00:00,  4.35it/s]
100%|██████████| 18/18 [00:04<00:00,  4.40it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[32m[I 2023-02-08 07:33:01,122][0m Trial 10 finished with value: 0.7768724177613163 and parameters: {'skill_size': 60, 'lang_size': 5, 'study_size': 4, 'degree_size': 18, 'one_hot_max_size': 161, 'depth': 10, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 5 with value: 0.7815689060558735.[0m


Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 320842


100%|██████████| 59/59 [00:13<00:00,  4.26it/s]
100%|██████████| 4/4 [00:00<00:00,  7.70it/s]
100%|██████████| 5/5 [00:01<00:00,  4.36it/s]
100%|██████████| 20/20 [00:04<00:00,  4.41it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[32m[I 2023-02-08 07:49:42,001][0m Trial 11 finished with value: 0.7770610614086255 and parameters: {'skill_size': 59, 'lang_size': 4, 'study_size': 5, 'degree_size': 20, 'one_hot_max_size': 162, 'depth': 10, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 5 with value: 0.7815689060558735.[0m


Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 320842


100%|██████████| 56/56 [00:13<00:00,  4.28it/s]
100%|██████████| 4/4 [00:00<00:00,  7.39it/s]
100%|██████████| 4/4 [00:00<00:00,  4.34it/s]
100%|██████████| 20/20 [00:04<00:00,  4.38it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[32m[I 2023-02-08 08:18:56,797][0m Trial 12 finished with value: 0.7800222864176966 and parameters: {'skill_size': 56, 'lang_size': 4, 'study_size': 4, 'degree_size': 20, 'one_hot_max_size': 101, 'depth': 12, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 5 with value: 0.7815689060558735.[0m


Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 320842


100%|██████████| 41/41 [00:09<00:00,  4.22it/s]
100%|██████████| 11/11 [00:01<00:00,  7.68it/s]
100%|██████████| 39/39 [00:08<00:00,  4.39it/s]
100%|██████████| 20/20 [00:04<00:00,  4.40it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[32m[I 2023-02-08 08:48:39,434][0m Trial 13 finished with value: 0.7777212287957752 and parameters: {'skill_size': 41, 'lang_size': 11, 'study_size': 39, 'degree_size': 20, 'one_hot_max_size': 101, 'depth': 12, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 5 with value: 0.7815689060558735.[0m


Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 320842


100%|██████████| 55/55 [00:12<00:00,  4.29it/s]
100%|██████████| 18/18 [00:02<00:00,  7.61it/s]
100%|██████████| 11/11 [00:02<00:00,  4.38it/s]
100%|██████████| 16/16 [00:03<00:00,  4.40it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[32m[I 2023-02-08 09:18:50,617][0m Trial 14 finished with value: 0.7798336577116132 and parameters: {'skill_size': 55, 'lang_size': 18, 'study_size': 11, 'degree_size': 16, 'one_hot_max_size': 106, 'depth': 12, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 5 with value: 0.7815689060558735.[0m


Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 320842


100%|██████████| 43/43 [00:10<00:00,  4.25it/s]
100%|██████████| 12/12 [00:01<00:00,  7.71it/s]
100%|██████████| 8/8 [00:01<00:00,  4.38it/s]
100%|██████████| 15/15 [00:03<00:00,  4.41it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[32m[I 2023-02-08 09:42:19,900][0m Trial 15 finished with value: 0.7788716583543079 and parameters: {'skill_size': 43, 'lang_size': 12, 'study_size': 8, 'degree_size': 15, 'one_hot_max_size': 219, 'depth': 11, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 5 with value: 0.7815689060558735.[0m


Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 320842


100%|██████████| 25/25 [00:05<00:00,  4.28it/s]
100%|██████████| 7/7 [00:00<00:00,  7.73it/s]
100%|██████████| 25/25 [00:05<00:00,  4.40it/s]
100%|██████████| 18/18 [00:04<00:00,  4.40it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[32m[I 2023-02-08 10:00:24,578][0m Trial 16 finished with value: 0.7759671096834015 and parameters: {'skill_size': 25, 'lang_size': 7, 'study_size': 25, 'degree_size': 18, 'one_hot_max_size': 147, 'depth': 10, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 5 with value: 0.7815689060558735.[0m


Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 320842


100%|██████████| 56/56 [00:13<00:00,  4.26it/s]
100%|██████████| 16/16 [00:02<00:00,  7.73it/s]
100%|██████████| 9/9 [00:02<00:00,  4.36it/s]
100%|██████████| 4/4 [00:00<00:00,  4.40it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[32m[I 2023-02-08 10:29:01,958][0m Trial 17 finished with value: 0.7761746561157365 and parameters: {'skill_size': 56, 'lang_size': 16, 'study_size': 9, 'degree_size': 4, 'one_hot_max_size': 115, 'depth': 12, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 5 with value: 0.7815689060558735.[0m


Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 320842


100%|██████████| 47/47 [00:10<00:00,  4.29it/s]
100%|██████████| 7/7 [00:00<00:00,  7.63it/s]
100%|██████████| 35/35 [00:07<00:00,  4.41it/s]
100%|██████████| 14/14 [00:03<00:00,  4.41it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[32m[I 2023-02-08 10:54:25,201][0m Trial 18 finished with value: 0.7575019488907967 and parameters: {'skill_size': 47, 'lang_size': 7, 'study_size': 35, 'degree_size': 14, 'one_hot_max_size': 148, 'depth': 9, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.9924456461214273}. Best is trial 5 with value: 0.7815689060558735.[0m


Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 320842


100%|██████████| 40/40 [00:09<00:00,  4.23it/s]
100%|██████████| 13/13 [00:01<00:00,  7.43it/s]
100%|██████████| 23/23 [00:05<00:00,  4.21it/s]
100%|██████████| 19/19 [00:04<00:00,  4.43it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[32m[I 2023-02-08 11:16:27,988][0m Trial 19 finished with value: 0.7776832866202313 and parameters: {'skill_size': 40, 'lang_size': 13, 'study_size': 23, 'degree_size': 19, 'one_hot_max_size': 176, 'depth': 11, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 5 with value: 0.7815689060558735.[0m


Frequency of top 20 skills before preprocess: 294433


[33m[W 2023-02-08 11:16:46,433][0m Trial 20 failed because of the following error: KeyboardInterrupt()[0m
Traceback (most recent call last):
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/67/wq_xsymd3jvc5w2fx1ld_18h0000gn/T/ipykernel_86990/660928086.py", line 9, in objective
    skills_df = load_skills(config.skills_path, skill_size)
  File "/var/folders/67/wq_xsymd3jvc5w2fx1ld_18h0000gn/T/ipykernel_86990/3064544183.py", line 245, in load_skills
    df_.loc[df_['skill'] == 'Postgresql', 'skill'] = 'PostgreSQL'
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py", line 723, in __setitem__
    iloc._setitem_with_indexer(indexer, value, self.name)
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py", line 1730, in _setitem_with_indexer
    self._setitem_with_indexer_split_

KeyboardInterrupt: 

In [5]:
study.best_params

{'skill_size': 60,
 'lang_size': 14,
 'study_size': 21,
 'degree_size': 15,
 'one_hot_max_size': 156,
 'depth': 12,
 'boosting_type': 'Plain',
 'bootstrap_type': 'MVS'}

{'skill_size': 48, 'lang_size': 10, 'study_size': 9, 'degree_size': 11}