In [1]:
import os
import gc
import optuna
import warnings
import numpy as np
from utils import *
import pandas as pd
from copy import deepcopy
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from train_models import get_model_scores
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, classification_report
warnings.filterwarnings('ignore')
%run skills.ipynb
%run languages.ipynb
%run work_experiences.ipynb
%run education.ipynb

class config:
    train_path = '../../../datasets/garanti-bbva-data-camp/train_users.csv'
    test_path = '../../../datasets/garanti-bbva-data-camp/test_users.csv'
    sub_path = '../../../datasets/garanti-bbva-data-camp/submission.csv'
    skills_path = '../../../datasets/garanti-bbva-data-camp/clean_skills_v2.csv'
    languages_path = '../../../datasets/garanti-bbva-data-camp/clean_language.csv'
    education_path = '../../../datasets/garanti-bbva-data-camp/clean_education_v2.csv'
    exp_path = '../../../datasets/garanti-bbva-data-camp/work_experiences.csv'
    seed = 42
    n_folds = 8

In [2]:
def fix_location(dataframe: pd.DataFrame, feature: str = 'location') -> pd.DataFrame:

    tr_cities = load_tr_cities()
    df_ = dataframe.copy()
    df_.loc[df_[feature].astype(str).str.contains('Kahraman Maras'), feature] = 'Kahramanmaras, Turkey'
    df_.loc[df_[feature].astype(str).str.contains('Şanliurfa'), feature] = 'Sanliurfa, Turkey'
    df_.loc[df_[feature].astype(str).str.contains('İçel'), feature] = 'Mersin, Turkey'
    df_.loc[df_[feature].astype(str).str.contains('Afyon'), feature] = 'Afyonkarahisar, Turkey'
    df_[feature] = df_[feature].apply(lambda x: str(x).replace('Türkiye', 'Turkey'))
    df_[feature] = df_[feature].apply(lambda x: x.upper().strip())
    df_[feature] = df_[feature].apply(lambda x: translation(str(x)))
    for city in tr_cities:
        df_[feature] = df_[feature].apply(lambda x: city if city in x else x)
    df_[f'{feature}_based_on_tr'] = df_[feature].apply(lambda x: 1 if x in tr_cities or x == 'TURKEY' else 0)
        
    return df_

In [3]:
def objective(trial):

    train_df = pd.read_csv(config.train_path)
    test_df = pd.read_csv(config.test_path)
    df = train_df.append(test_df).reset_index(drop = True)
    
    params = {
              'study_size': trial.suggest_int('study_size', 30, 90),
              'school_size': trial.suggest_int('school_size', 30, 90),
              'language_size': trial.suggest_int('language_size', 6, 15),
              'degree_size': trial.suggest_int('degree_size', 8, 30),
              'skill_size': trial.suggest_int('skill_size', 70, 180),
              'skill_exact_match': trial.suggest_categorical('skill_exact_match', [True, False]),
              'degree_exact_match': trial.suggest_categorical('degree_exact_match', [True, False]),
              'school_exact_match': trial.suggest_categorical('school_exact_match', [True, False]),
              'study_exact_match': trial.suggest_categorical('study_exact_match', [True, False]),
              }

    skills_df = load_skills(config.skills_path, params['skill_size'], exact_match=params['skill_exact_match'])
    lang_df = load_languages(config.languages_path, params['language_size'])
    school_df = load_school(config.education_path, params['school_size'], exact_match = params['school_exact_match'])
    degree_df = load_degree(config.education_path, params['degree_size'], exact_match = params['degree_exact_match'])
    study_df = load_study(config.education_path, params['study_size'], exact_match = params['study_exact_match'])
    exp_df = load_work_experiences(config.exp_path)

    df = fix_location(df)
    df = df.merge(skills_df, on = ['user_id'], how = 'left')
    df = df.merge(lang_df, on = ['user_id'], how = 'left')
    df = df.merge(school_df, on = ['user_id'], how = 'left')
    df = df.merge(degree_df, on = ['user_id'], how = 'left')
    df = df.merge(study_df, on = ['user_id'], how = 'left')
    df = df.merge(exp_df, on = ['user_id'], how = 'left')
    df = add_populations(df)

    df['nunique_company_by_industries'] = df.groupby(by = 'industry')['company_id'].transform('nunique')
    df['active_employees_by_companies'] = df.groupby(by = 'company_id')['user_id'].transform('nunique')
    df['nunique_industries_by_companies'] = df.groupby(by = 'company_id')['industry'].transform('nunique')
    
    target = 'moved_after_2019'
    cat_features = ['industry', 'location', 'company_id',
                    'employee_last_experience_year',
                    'employee_last_experience_month',
                    'employee_first_experience_year']
    drop_features = ['user_id']

    regex = re.compile(r"\[|\]|<", re.IGNORECASE)
    df.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in df.columns.values]

    for caterogical_col in cat_features:
        df[caterogical_col] = df[caterogical_col].astype(str).astype("category")

    train_set = df.loc[df[target].notnull()].drop(columns=drop_features, axis = 1)
    test_set = df.loc[df[target].isnull()].drop(columns=drop_features, axis = 1)

    X = train_set.drop(columns = [target], axis = 1)
    y = train_set[target]
    skf = StratifiedKFold(n_splits=config.n_folds, shuffle=True, random_state=config.seed)

    xgb_params = {'max_depth': 11, 'learning_rate': 0.015238768735012887,
                 'subsample': 0.8023883794058948, 'tree_method': 'hist',
                 'enable_categorical': True, 'random_state': 42, 'n_estimators': 5000,
                 'objective': 'binary:logistic'}

    scores = list()
    for idx, (train_ind, val_ind) in enumerate(skf.split(X, y)):
        print(f"fold: {idx+1}")
        model = XGBClassifier(**xgb_params)
        X_train = X.iloc[train_ind]
        y_train = y.iloc[train_ind]
        X_val = X.iloc[val_ind]
        y_val = y.iloc[val_ind]

        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            early_stopping_rounds=500,
            verbose=False,
        )

        val_pred = model.predict(X_val)
        scores.append(accuracy_score(y_val, val_pred))
        del X_train, y_train, X_val, y_val
        gc.collect()
    del df, train_set, test_set
    gc.collect()

    return np.mean(scores)

In [4]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, gc_after_trial=True)

[32m[I 2023-02-25 07:54:32,463][0m A new study created in memory with name: no-name-76defa3a-6469-45cd-875c-75f886d615a6[0m
100%|██████████| 175/175 [00:52<00:00,  3.35it/s]
100%|██████████| 56/56 [00:01<00:00, 31.72it/s]
100%|██████████| 27/27 [00:00<00:00, 31.92it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-25 08:30:27,469][0m Trial 0 finished with value: 0.7890377675124141 and parameters: {'study_size': 66, 'school_size': 56, 'language_size': 14, 'degree_size': 27, 'skill_size': 175, 'skill_exact_match': True, 'degree_exact_match': True, 'school_exact_match': True, 'study_exact_match': False}. Best is trial 0 with value: 0.7890377675124141.[0m
100%|██████████| 90/90 [00:02<00:00, 31.50it/s]
100%|██████████| 8/8 [00:00<00:00, 31.73it/s]
100%|██████████| 64/64 [00:02<00:00, 31.97it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-25 09:07:20,480][0m Trial 1 finished with value: 0.7887925926689876 and parameters: {'study_size': 64, 'school_size': 90, 'language_size': 11, 'degree_size': 8, 'skill_size': 161, 'skill_exact_match': False, 'degree_exact_match': True, 'school_exact_match': True, 'study_exact_match': True}. Best is trial 0 with value: 0.7890377675124141.[0m
100%|██████████| 8/8 [00:00<00:00, 30.46it/s]
100%|██████████| 56/56 [00:01<00:00, 31.80it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-25 09:39:30,067][0m Trial 2 finished with value: 0.7882645526716796 and parameters: {'study_size': 56, 'school_size': 66, 'language_size': 8, 'degree_size': 8, 'skill_size': 85, 'skill_exact_match': False, 'degree_exact_match': True, 'school_exact_match': False, 'study_exact_match': True}. Best is trial 0 with value: 0.7890377675124141.[0m
100%|██████████| 20/20 [00:00<00:00, 31.49it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-25 10:15:52,444][0m Trial 3 finished with value: 0.7895094086443792 and parameters: {'study_size': 73, 'school_size': 66, 'language_size': 11, 'degree_size': 20, 'skill_size': 165, 'skill_exact_match': False, 'degree_exact_match': True, 'school_exact_match': False, 'study_exact_match': False}. Best is trial 3 with value: 0.7895094086443792.[0m
100%|██████████| 77/77 [00:23<00:00,  3.33it/s]
100%|██████████| 23/23 [00:00<00:00, 31.58it/s]
100%|██████████| 82/82 [00:02<00:00, 31.88it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-25 10:47:37,566][0m Trial 4 finished with value: 0.788245619295399 and parameters: {'study_size': 82, 'school_size': 48, 'language_size': 13, 'degree_size': 23, 'skill_size': 77, 'skill_exact_match': True, 'degree_exact_match': True, 'school_exact_match': False, 'study_exact_match': True}. Best is trial 3 with value: 0.7895094086443792.[0m
100%|██████████| 50/50 [00:01<00:00, 31.53it/s]
100%|██████████| 15/15 [00:00<00:00, 31.78it/s]
100%|██████████| 88/88 [00:02<00:00, 32.00it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-25 11:23:53,101][0m Trial 5 finished with value: 0.7890378585793073 and parameters: {'study_size': 88, 'school_size': 50, 'language_size': 14, 'degree_size': 15, 'skill_size': 166, 'skill_exact_match': False, 'degree_exact_match': True, 'school_exact_match': True, 'study_exact_match': True}. Best is trial 3 with value: 0.7895094086443792.[0m
100%|██████████| 12/12 [00:00<00:00, 30.91it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-25 11:58:39,960][0m Trial 6 finished with value: 0.789679188637693 and parameters: {'study_size': 89, 'school_size': 62, 'language_size': 6, 'degree_size': 12, 'skill_size': 161, 'skill_exact_match': False, 'degree_exact_match': True, 'school_exact_match': False, 'study_exact_match': False}. Best is trial 6 with value: 0.789679188637693.[0m
100%|██████████| 64/64 [00:02<00:00, 31.94it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-25 12:35:12,239][0m Trial 7 finished with value: 0.7899620367165322 and parameters: {'study_size': 64, 'school_size': 79, 'language_size': 14, 'degree_size': 24, 'skill_size': 134, 'skill_exact_match': False, 'degree_exact_match': False, 'school_exact_match': False, 'study_exact_match': True}. Best is trial 7 with value: 0.7899620367165322.[0m
100%|██████████| 97/97 [00:29<00:00,  3.34it/s]
100%|██████████| 85/85 [00:02<00:00, 31.55it/s]
100%|██████████| 28/28 [00:00<00:00, 31.84it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-25 13:10:51,613][0m Trial 8 finished with value: 0.7882645754384028 and parameters: {'study_size': 72, 'school_size': 85, 'language_size': 14, 'degree_size': 28, 'skill_size': 97, 'skill_exact_match': True, 'degree_exact_match': True, 'school_exact_match': True, 'study_exact_match': False}. Best is trial 7 with value: 0.7899620367165322.[0m
100%|██████████| 27/27 [00:00<00:00, 31.56it/s]
100%|██████████| 32/32 [00:01<00:00, 31.93it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-25 13:44:19,539][0m Trial 9 finished with value: 0.7904146534053236 and parameters: {'study_size': 32, 'school_size': 68, 'language_size': 9, 'degree_size': 27, 'skill_size': 146, 'skill_exact_match': False, 'degree_exact_match': True, 'school_exact_match': False, 'study_exact_match': True}. Best is trial 9 with value: 0.7904146534053236.[0m
100%|██████████| 32/32 [00:01<00:00, 31.91it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-25 14:15:56,875][0m Trial 10 finished with value: 0.7881702443661694 and parameters: {'study_size': 32, 'school_size': 30, 'language_size': 9, 'degree_size': 30, 'skill_size': 129, 'skill_exact_match': False, 'degree_exact_match': False, 'school_exact_match': False, 'study_exact_match': True}. Best is trial 9 with value: 0.7904146534053236.[0m
100%|██████████| 49/49 [00:01<00:00, 31.97it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-25 14:49:41,799][0m Trial 11 finished with value: 0.7892075361223662 and parameters: {'study_size': 49, 'school_size': 77, 'language_size': 9, 'degree_size': 23, 'skill_size': 132, 'skill_exact_match': False, 'degree_exact_match': False, 'school_exact_match': False, 'study_exact_match': True}. Best is trial 9 with value: 0.7904146534053236.[0m
100%|██████████| 40/40 [00:01<00:00, 31.53it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-25 15:22:56,680][0m Trial 12 finished with value: 0.7884153224510217 and parameters: {'study_size': 40, 'school_size': 76, 'language_size': 12, 'degree_size': 24, 'skill_size': 143, 'skill_exact_match': False, 'degree_exact_match': False, 'school_exact_match': False, 'study_exact_match': True}. Best is trial 9 with value: 0.7904146534053236.[0m
100%|██████████| 51/51 [00:01<00:00, 31.44it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-25 15:55:49,318][0m Trial 13 finished with value: 0.7897168675647865 and parameters: {'study_size': 51, 'school_size': 76, 'language_size': 6, 'degree_size': 19, 'skill_size': 109, 'skill_exact_match': False, 'degree_exact_match': False, 'school_exact_match': False, 'study_exact_match': True}. Best is trial 9 with value: 0.7904146534053236.[0m
100%|██████████| 31/31 [00:00<00:00, 31.69it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-25 16:29:59,285][0m Trial 14 finished with value: 0.7881513508316547 and parameters: {'study_size': 31, 'school_size': 81, 'language_size': 15, 'degree_size': 26, 'skill_size': 144, 'skill_exact_match': False, 'degree_exact_match': False, 'school_exact_match': False, 'study_exact_match': True}. Best is trial 9 with value: 0.7904146534053236.[0m
100%|██████████| 114/114 [00:34<00:00,  3.34it/s]
100%|██████████| 43/43 [00:01<00:00, 31.79it/s]


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-25 17:02:24,102][0m Trial 15 finished with value: 0.7881891179798012 and parameters: {'study_size': 43, 'school_size': 70, 'language_size': 8, 'degree_size': 18, 'skill_size': 114, 'skill_exact_match': True, 'degree_exact_match': False, 'school_exact_match': False, 'study_exact_match': True}. Best is trial 9 with value: 0.7904146534053236.[0m
[33m[W 2023-02-25 17:02:30,157][0m Trial 16 failed because of the following error: KeyboardInterrupt()[0m
Traceback (most recent call last):
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/67/wq_xsymd3jvc5w2fx1ld_18h0000gn/T/ipykernel_44675/2923264329.py", line 19, in objective
    skills_df = load_skills(config.skills_path, params['skill_size'], exact_match=params['skill_exact_match'])
  File "/var/folders/67/wq_xsymd3jvc5w2fx1ld_18h0000gn/T/ipykernel_44675/2639614005.py", line 34, in load_skills
 

KeyboardInterrupt: 