In [1]:
import os
import gc
import optuna
import warnings
import numpy as np
from utils import *
import pandas as pd
from copy import deepcopy
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from train_models import get_model_scores
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, classification_report
warnings.filterwarnings('ignore')
%run skills.ipynb
%run languages.ipynb
%run work_experiences.ipynb
%run education.ipynb

class config:
    """Central place for dataset locations and experiment constants.

    Every ``*_path`` attribute is derived from ``data_dir`` so that moving the
    dataset only requires changing one line. The resulting path strings are
    identical to the previously hard-coded ones.
    """

    # Directory that holds all competition CSV files (relative to the notebook).
    data_dir = '../../../datasets/garanti-bbva-data-camp'

    # Raw user tables and the submission template.
    train_path = f'{data_dir}/train_users.csv'
    test_path = f'{data_dir}/test_users.csv'
    sub_path = f'{data_dir}/submission.csv'

    # Side tables that are turned into per-user features.
    skills_path = f'{data_dir}/clean_skills_v2.csv'
    languages_path = f'{data_dir}/clean_language.csv'
    education_path = f'{data_dir}/clean_education_v2.csv'
    exp_path = f'{data_dir}/work_experiences.csv'

    # Random seed shared by the CV splitter and the model.
    seed = 42
    # Number of stratified CV folds.
    n_folds = 8

In [2]:
# Load the raw user tables and the submission template.
train_df = pd.read_csv(config.train_path)
test_df = pd.read_csv(config.test_path)
sub = pd.read_csv(config.sub_path)

print(f'train_df shape: {train_df.shape}')
print(f'test_df shape: {test_df.shape}')

# Stack train and test so that feature engineering is applied to both at once.
# Columns present in only one of the frames are filled with NaN in the other.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat produces the same result on every pandas version.
df = pd.concat([train_df, test_df]).reset_index(drop = True)

def fix_location(dataframe: pd.DataFrame, feature: str = 'location') -> pd.DataFrame:
    """Return a copy of ``dataframe`` with a cleaned ``feature`` column and a Turkey flag.

    Steps, in order:
      1. Rewrite a few known spelling variants to a canonical "<City>, Turkey" value.
      2. Stringify every value, replace 'Türkiye' with 'Turkey', upper-case and strip it.
         Non-string values (including missing ones) are stringified via ``str()``.
      3. Pass every value through ``translation`` (helper defined outside this
         block; presumably transliterates Turkish characters - TODO confirm).
      4. For each entry of ``load_tr_cities()``, collapse any value that contains
         the city name to the bare city name.
      5. Add ``f'{feature}_based_on_tr'``: 1 when the final value is one of the
         cities or exactly 'TURKEY', otherwise 0.

    Args:
        dataframe: Input frame; it is not modified.
        feature: Name of the location column to clean.

    Returns:
        A new DataFrame with the cleaned column and the added flag column.
    """
    # (substring to look for, canonical replacement) - applied sequentially.
    spelling_fixes = (
        ('Kahraman Maras', 'Kahramanmaras, Turkey'),
        ('Şanliurfa', 'Sanliurfa, Turkey'),
        ('İçel', 'Mersin, Turkey'),
        ('Afyon', 'Afyonkarahisar, Turkey'),
    )

    known_cities = load_tr_cities()
    cleaned = dataframe.copy()

    for variant, canonical in spelling_fixes:
        has_variant = cleaned[feature].astype(str).str.contains(variant)
        cleaned.loc[has_variant, feature] = canonical

    def _to_upper_english(value):
        # Stringify first so missing / non-string values do not break the string methods.
        return str(value).replace('Türkiye', 'Turkey').upper().strip()

    cleaned[feature] = cleaned[feature].apply(_to_upper_english)
    cleaned[feature] = cleaned[feature].apply(lambda value: translation(str(value)))

    # Sequential on purpose: a value already collapsed to one city can still be
    # collapsed again by a later city name that it contains.
    for city_name in known_cities:
        cleaned[feature] = cleaned[feature].apply(
            lambda value: city_name if city_name in value else value
        )

    flag_column = f'{feature}_based_on_tr'
    cleaned[flag_column] = cleaned[feature].apply(
        lambda value: int(value in known_cities or value == 'TURKEY')
    )

    return cleaned

# Build one feature table per side dataset. The integer argument of each loader
# is presumably a top-N cutoff for the generated indicator columns - TODO confirm
# against the loader definitions in the %run notebooks.
skills_df = load_skills(config.skills_path, 120, exact_match=False)
lang_df = load_languages(config.languages_path)
school_df = load_school(config.education_path, 50, exact_match = True)
degree_df = load_degree(config.education_path, 18, exact_match = True)
study_df = load_study(config.education_path, 55, exact_match = False)
exp_df = load_work_experiences(config.exp_path)

df = fix_location(df)

# Left-join every feature table on user_id so users without a match are kept
# (their feature columns become NaN).
for feature_table in (skills_df, lang_df, school_df, degree_df, study_df, exp_df):
    df = df.merge(feature_table, on = ['user_id'], how = 'left')

df = add_populations(df)
#df = add_employment(df)

# Group-level counts, computed over train and test together.
df['nunique_company_by_industries'] = df.groupby(by = 'industry')['company_id'].transform('nunique')
df['active_employees_by_companies'] = df.groupby(by = 'company_id')['user_id'].transform('nunique')
df['nunique_industries_by_companies'] = df.groupby(by = 'company_id')['industry'].transform('nunique')

# Rows with a known target are train, the rest are test. Take explicit copies so
# label_encode never operates on views of `df`.
train_df = df.loc[df['moved_after_2019'].notnull()].copy()
test_df = df.loc[df['moved_after_2019'].isnull()].copy()
train_df, test_df = label_encode(["company_id", 'location'], train_df, test_df, fillna=True)

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
df = pd.concat([train_df, test_df]).reset_index(drop = True)

df.head()

train_df shape: (53019, 4)
test_df shape: (13255, 3)


100%|██████████| 50/50 [00:01<00:00, 29.39it/s]
100%|██████████| 18/18 [00:00<00:00, 31.10it/s]


Unnamed: 0,user_id,industry,location,moved_after_2019,location_based_on_tr,skill_.net,skill_administration,skill_agile,skill_agile methodologies,skill_ajax,...,company_nunique_employees,company_lifetime,company_last_hire,avg_days_to_quit_diff,avg_days_to_quit_ratio,company_hire_ratio,population,nunique_company_by_industries,active_employees_by_companies,nunique_industries_by_companies
0,1301,Information Technology and Services,ISTANBUL,1.0,1,0.0,0.0,1.0,1.0,0.0,...,1410.0,11902.0,31.0,465.280537,2.040896,8.441135,15907951.0,3285.0,665.0,28.0
1,6950,Internet,ISTANBUL,0.0,1,0.0,0.0,0.0,0.0,0.0,...,66.0,1887.0,61.0,-131.480769,0.74968,28.590909,15907951.0,744.0,53.0,6.0
2,4880,Online Media,TURKEY,0.0,1,0.0,0.0,0.0,0.0,1.0,...,4.0,610.0,610.0,,,152.5,,32.0,4.0,1.0
3,26046,Telecommunications,ISTANBUL,0.0,1,0.0,0.0,0.0,0.0,0.0,...,1410.0,11902.0,31.0,-2496.719463,0.267609,8.441135,15907951.0,680.0,665.0,28.0
4,11005,Banking,ISTANBUL,0.0,1,1.0,0.0,0.0,0.0,0.0,...,678.0,5235.0,31.0,-299.166667,0.736726,7.721239,15907951.0,429.0,402.0,17.0


In [5]:
# Name of the binary label column; it is null for test rows.
target = 'moved_after_2019'

# Columns handed to the model as categoricals. Commented-out entries are
# candidates that are currently disabled.
cat_features = [
    'industry',
    'location',
    'company_id',
    'employee_last_experience_year',
    'employee_last_experience_month',
    #'employee_last_location',
    #'company_2th_id', 'company_3th_id'
    'employee_first_experience_year',
    #'employee_first_experience_month'
]

# Identifier columns that must not be used as features.
drop_features = ['user_id']

# Go through str first so every value (missing ones included) becomes a
# string label before the column is converted to the category dtype.
for column in cat_features:
    as_text = df[column].astype(str)
    df[column] = as_text.astype("category")

# Labelled rows form the training set; unlabelled rows form the test set.
is_labelled = df[target].notnull()
train_set = df.loc[is_labelled].drop(columns=drop_features)
test_set = df.loc[~is_labelled].drop(columns=drop_features)

print(f'train_set: {train_set.shape}')
print(f'test_set: {test_set.shape}')

train_set: (53019, 290)
test_set: (13255, 290)


In [6]:
# Label vector and feature matrix for the labelled rows.
y = train_set[target]
X = train_set.drop(columns=target)

# Shuffled, seeded stratified splitter shared by every Optuna trial, so all
# trials are scored on the same folds.
skf = StratifiedKFold(
    n_splits=config.n_folds,
    shuffle=True,
    random_state=config.seed,
)

def objective(trial):
    """Optuna objective: mean validation accuracy of an XGBClassifier over the CV folds.

    Only ``max_depth``, ``learning_rate`` and ``subsample`` are actually
    searched; the remaining entries are single-choice categoricals so that they
    are recorded with the trial parameters.

    Relies on the notebook globals ``X``, ``y``, ``skf`` and ``config``.

    Args:
        trial: The ``optuna.trial.Trial`` used to sample the hyper-parameters.

    Returns:
        The mean accuracy over all folds of ``skf`` (higher is better).
    """
    params = {
              'max_depth': trial.suggest_int('max_depth', 6, 13),
              'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.03),
              'subsample': trial.suggest_float('subsample', 0.5, 0.99),
              'tree_method': trial.suggest_categorical('tree_method', ['hist']),
              'enable_categorical': trial.suggest_categorical('enable_categorical', [True]),
              'random_state': trial.suggest_categorical('random_state', [config.seed]),
              'n_estimators': trial.suggest_categorical('n_estimators', [5000]),
              "objective": trial.suggest_categorical('objective', ["binary:logistic"]),
              }

    # The fold loop must be active: without it `scores` is undefined and every
    # trial fails with a NameError at the return statement.
    scores = list()
    for idx, (train_ind, val_ind) in enumerate(skf.split(X, y)):
        print(f"fold: {idx+1}")

        # early_stopping_rounds is given to the constructor because passing it
        # to fit() is deprecated and no longer accepted by xgboost >= 2.0.
        model = XGBClassifier(**params, early_stopping_rounds=500)

        X_train, y_train = X.iloc[train_ind], y.iloc[train_ind]
        X_val, y_val = X.iloc[val_ind], y.iloc[val_ind]

        # NOTE(review): the validation fold drives early stopping and is also
        # the fold that is scored, so the reported accuracy is slightly optimistic.
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            verbose=False,
        )

        val_pred = model.predict(X_val)
        scores.append(accuracy_score(y_val, val_pred))

        # Release the fold data before the next iteration to limit peak memory.
        del X_train, y_train, X_val, y_val
        gc.collect()

    return np.mean(scores)

In [7]:
# Seed the sampler so that repeated runs propose the same sequence of
# hyper-parameters; without a seed the search is not reproducible even though
# the CV splitter and the model are seeded. TPESampler is Optuna's default
# sampler, so only the seeding changes.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=config.seed),
)
study.optimize(objective, n_trials=50, gc_after_trial=True)

[32m[I 2023-02-24 02:48:25,258][0m A new study created in memory with name: no-name-58f370d1-4dbb-4aae-baae-f8f154aac031[0m


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-24 03:09:41,028][0m Trial 0 finished with value: 0.7890377817416161 and parameters: {'max_depth': 12, 'learning_rate': 0.023456041389859038, 'subsample': 0.6639195818676509, 'tree_method': 'hist', 'enable_categorical': True, 'random_state': 42, 'n_estimators': 5000, 'objective': 'binary:logistic'}. Best is trial 0 with value: 0.7890377817416161.[0m


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-24 03:29:59,072][0m Trial 1 finished with value: 0.7865482547758439 and parameters: {'max_depth': 9, 'learning_rate': 0.02850759027704462, 'subsample': 0.5571214533980214, 'tree_method': 'hist', 'enable_categorical': True, 'random_state': 42, 'n_estimators': 5000, 'objective': 'binary:logistic'}. Best is trial 0 with value: 0.7890377817416161.[0m


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-24 03:56:00,413][0m Trial 2 finished with value: 0.7786643576200541 and parameters: {'max_depth': 6, 'learning_rate': 0.024714958709652846, 'subsample': 0.7458732190278717, 'tree_method': 'hist', 'enable_categorical': True, 'random_state': 42, 'n_estimators': 5000, 'objective': 'binary:logistic'}. Best is trial 0 with value: 0.7890377817416161.[0m


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-24 04:29:37,864][0m Trial 3 finished with value: 0.7846998017428201 and parameters: {'max_depth': 8, 'learning_rate': 0.012277874607545966, 'subsample': 0.8465991434718548, 'tree_method': 'hist', 'enable_categorical': True, 'random_state': 42, 'n_estimators': 5000, 'objective': 'binary:logistic'}. Best is trial 0 with value: 0.7890377817416161.[0m


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-24 05:00:23,169][0m Trial 4 finished with value: 0.7899621505501488 and parameters: {'max_depth': 11, 'learning_rate': 0.015238768735012887, 'subsample': 0.8023883794058948, 'tree_method': 'hist', 'enable_categorical': True, 'random_state': 42, 'n_estimators': 5000, 'objective': 'binary:logistic'}. Best is trial 4 with value: 0.7899621505501488.[0m


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-24 05:22:54,874][0m Trial 5 finished with value: 0.7884343269733126 and parameters: {'max_depth': 11, 'learning_rate': 0.02111281915838466, 'subsample': 0.9171492031145694, 'tree_method': 'hist', 'enable_categorical': True, 'random_state': 42, 'n_estimators': 5000, 'objective': 'binary:logistic'}. Best is trial 4 with value: 0.7899621505501488.[0m


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-24 05:57:59,995][0m Trial 6 finished with value: 0.7880571079804741 and parameters: {'max_depth': 10, 'learning_rate': 0.013849515737488254, 'subsample': 0.5209863401036159, 'tree_method': 'hist', 'enable_categorical': True, 'random_state': 42, 'n_estimators': 5000, 'objective': 'binary:logistic'}. Best is trial 4 with value: 0.7899621505501488.[0m


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-24 06:24:45,321][0m Trial 7 finished with value: 0.7826815744081631 and parameters: {'max_depth': 7, 'learning_rate': 0.02560079409413906, 'subsample': 0.5647782965347632, 'tree_method': 'hist', 'enable_categorical': True, 'random_state': 42, 'n_estimators': 5000, 'objective': 'binary:logistic'}. Best is trial 4 with value: 0.7899621505501488.[0m


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-24 06:49:28,959][0m Trial 8 finished with value: 0.7787773659429308 and parameters: {'max_depth': 6, 'learning_rate': 0.024895553214203985, 'subsample': 0.8336779189458778, 'tree_method': 'hist', 'enable_categorical': True, 'random_state': 42, 'n_estimators': 5000, 'objective': 'binary:logistic'}. Best is trial 4 with value: 0.7899621505501488.[0m


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-24 07:11:39,305][0m Trial 9 finished with value: 0.7842471139080183 and parameters: {'max_depth': 7, 'learning_rate': 0.028988215816605885, 'subsample': 0.9105408707452315, 'tree_method': 'hist', 'enable_categorical': True, 'random_state': 42, 'n_estimators': 5000, 'objective': 'binary:logistic'}. Best is trial 4 with value: 0.7899621505501488.[0m


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-24 08:11:02,627][0m Trial 10 finished with value: 0.7894527707284414 and parameters: {'max_depth': 13, 'learning_rate': 0.005928973695653176, 'subsample': 0.9868255639429704, 'tree_method': 'hist', 'enable_categorical': True, 'random_state': 42, 'n_estimators': 5000, 'objective': 'binary:logistic'}. Best is trial 4 with value: 0.7899621505501488.[0m


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-24 09:11:26,228][0m Trial 11 finished with value: 0.7889247022727292 and parameters: {'max_depth': 13, 'learning_rate': 0.0054079847513112365, 'subsample': 0.9888178033711534, 'tree_method': 'hist', 'enable_categorical': True, 'random_state': 42, 'n_estimators': 5000, 'objective': 'binary:logistic'}. Best is trial 4 with value: 0.7899621505501488.[0m


fold: 1


[33m[W 2023-02-24 09:15:42,957][0m Trial 12 failed because of the following error: KeyboardInterrupt()[0m
Traceback (most recent call last):
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/67/wq_xsymd3jvc5w2fx1ld_18h0000gn/T/ipykernel_99820/2527391267.py", line 27, in objective
    model.fit(
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/xgboost/core.py", line 620, in inner_f
    return func(**kwargs)
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/xgboost/sklearn.py", line 1516, in fit
    self._Booster = train(
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/xgboost/core.py", line 620, in inner_f
    return func(**kwargs)
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/xgboost/training.py", line 185, in train
    bst.update(dtrain, i, obj)
  File "/Users/sercan

KeyboardInterrupt: 