In [1]:
import os
import gc
import optuna
import warnings
import numpy as np
from utils import *
import pandas as pd
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, classification_report
warnings.filterwarnings('ignore')
%run skills.ipynb
%run languages.ipynb
%run work_experiences.ipynb
%run education.ipynb

class config:
    train_path = '../../../datasets/garanti-bbva-data-camp/train_users.csv'
    test_path = '../../../datasets/garanti-bbva-data-camp/test_users.csv'
    sub_path = '../../../datasets/garanti-bbva-data-camp/submission.csv'
    skills_path = '../../../datasets/garanti-bbva-data-camp/skills.csv'
    languages_path = '../../../datasets/garanti-bbva-data-camp/languages.csv'
    education_path = '../../../datasets/garanti-bbva-data-camp/education.csv'
    exp_path = '../../../datasets/garanti-bbva-data-camp/work_experiences.csv'
    seed = 42
    n_folds = 6

In [2]:
train_df = pd.read_csv(config.train_path)
test_df = pd.read_csv(config.test_path)
sub = pd.read_csv(config.sub_path)

print(f'train_df shape: {train_df.shape}')
print(f'test_df shape: {test_df.shape}')
df = train_df.append(test_df).reset_index(drop = True)
df.head()

train_df shape: (53019, 4)
test_df shape: (13255, 3)


Unnamed: 0,user_id,industry,location,moved_after_2019
0,1301,Information Technology and Services,"Istanbul, Istanbul, Turkey",1.0
1,6950,Internet,"Istanbul, Istanbul, Turkey",0.0
2,4880,Online Media,Turkey,0.0
3,26046,Telecommunications,"Istanbul, Istanbul, Turkey",0.0
4,11005,Banking,"Istanbul, Turkey",0.0


In [3]:
def fix_location(dataframe: pd.DataFrame, feature: str = 'location') -> pd.DataFrame:

    df_ = dataframe.copy()
    df_.loc[df_[feature].astype(str).str.contains('Kahraman Maras'), feature] = 'Kahramanmaras, Turkey'
    df_.loc[df_[feature].astype(str).str.contains('Şanliurfa'), feature] = 'Sanliurfa, Turkey'
    df_.loc[df_[feature].astype(str).str.contains('İçel'), feature] = 'Mersin, Turkey'
    df_.loc[df_[feature].astype(str).str.contains('Afyon'), feature] = 'Afyonkarahisar, Turkey'
    df_[feature] = df_[feature].apply(lambda x: str(x).replace('Türkiye', 'Turkey'))
    df_[feature] = df_[feature].apply(lambda x: x.upper().strip())
    df_[feature] = df_[feature].apply(lambda x: translation(str(x)))
    tr_cities = load_tr_cities()
    for city in tr_cities:
        df_[feature] = df_[feature].apply(lambda x: city if city in x else x)
        df_[f'{feature}_based_on_tr'] = df_[feature].apply(lambda x: 1 if x in tr_cities or x == 'TURKEY' else 0)

    return df_

df = fix_location(df)
df.head()

Unnamed: 0,user_id,industry,location,moved_after_2019,location_based_on_tr
0,1301,Information Technology and Services,ISTANBUL,1.0,1
1,6950,Internet,ISTANBUL,0.0,1
2,4880,Online Media,TURKEY,0.0,1
3,26046,Telecommunications,ISTANBUL,0.0,1
4,11005,Banking,ISTANBUL,0.0,1


In [4]:
def objective(trial):

    df_ = df.copy()
    skill_size = trial.suggest_int('skill_size', 90, 130)
    lang_size = trial.suggest_int('lang_size', 15, 30)
    study_size = trial.suggest_int('study_size', 100, 220)
    degree_size = trial.suggest_int('degree_size', 130, 160)

    skills_df = load_skills(config.skills_path, skill_size)
    lang_df = load_languages(config.languages_path, lang_size)
    edu_df = load_education(config.education_path, study_size, degree_size)
    exp_df = load_work_experiences(config.exp_path)

    df_ = df_.merge(skills_df, on = ['user_id'], how = 'left')
    del skills_df
    gc.collect()
    df_ = df_.merge(lang_df, on = ['user_id'], how = 'left')
    del lang_df
    gc.collect()
    df_ = df_.merge(edu_df, on = ['user_id'], how = 'left')
    del edu_df
    gc.collect()
    df_ = df_.merge(exp_df, on = ['user_id'], how = 'left')
    del exp_df
    gc.collect()
    df_['nunique_company_by_industries'] = df_.groupby(by = 'industry')['company_id'].transform('nunique')

    target = 'moved_after_2019'
    cat_features = ['industry', 'location', 'company_id', 'employee_last_experience_year', 'employee_last_experience_month']
    drop_features = ['user_id']

    for caterogical_col in cat_features:
        df_[caterogical_col] = df_[caterogical_col].astype(str).astype("category")

    train_set = df_.loc[df_[target].notnull()].drop(columns=drop_features, axis = 1)
    del df_
    gc.collect()
    #test_set = df_.loc[df_[target].isnull()].drop(columns=drop_features, axis = 1)

    X = train_set.drop(columns = [target], axis = 1)
    y = train_set[target]
    del train_set
    gc.collect()

    #params = {
    #    "one_hot_max_size": trial.suggest_int("one_hot_max_size", 100, 220),
    #    "depth": trial.suggest_int("depth", 5, 12),
    #    "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
    #    "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
    #}
#
    #if params["bootstrap_type"] == "Bayesian":
    #    params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    #elif params["bootstrap_type"] == "Bernoulli":
    #    params["subsample"] = trial.suggest_float("subsample", 0.1, 1)
#
    #params['eval_metric'] = 'Accuracy'
    #params['cat_features'] = cat_features
    #params['random_state'] = config.seed
    #params['allow_writing_files'] = False
    #params["iterations"] = 5000

    # XGB Parameters
    params = {
              'max_depth': trial.suggest_int('max_depth', 4, 13),
              'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.04),
              #'n_estimators': trial.suggest_int('n_estimators', 50, 4000),
              #'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
              "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
              'subsample': trial.suggest_float('subsample', 0.7, 0.99),
              'tree_method': trial.suggest_categorical('tree_method', ['hist']),
              'enable_categorical': trial.suggest_categorical('enable_categorical', [True]),
              'random_state': trial.suggest_categorical('random_state', [config.seed]),
              'n_estimators': trial.suggest_categorical('n_estimators', [5000]),
              "objective": trial.suggest_categorical('objective', ["binary:logistic"]),
              "eval_metric": trial.suggest_categorical('eval_metric', ["auc"]),
              }

    if params["booster"] == "gbtree" or params["booster"] == "dart":
        params["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        params["eta"] = trial.suggest_loguniform("eta", 1e-8, 1.0)
        params["gamma"] = trial.suggest_loguniform("gamma", 1e-8, 1.0)
        params["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    if params["booster"] == "dart":
        params["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        params["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        params["rate_drop"] = trial.suggest_loguniform("rate_drop", 1e-8, 1.0)
        params["skip_drop"] = trial.suggest_loguniform("skip_drop", 1e-8, 1.0)
#

    #params['enable_categorical'] = True
    #params['random_state'] = config.seed
    #params['tree_method'] = 'hist'
    #params['n_estimators'] = 5000

    #kf = KFold(n_splits=config.n_folds)
    kf = KFold(n_splits=config.n_folds, shuffle =True, random_state=config.seed)

    scores = cross_val_score(XGBClassifier(**params), X, y, cv = kf, scoring='accuracy', n_jobs = -1)

    return np.mean(scores)

    #scores = list()
    #for idx, (train_ind, val_ind) in enumerate(kf.split(X, y)):
    #    print(f'fold: {idx+1}')
    #    model = CatBoostClassifier(**params)
    #    #model = XGBClassifier(**params)
    #    X_train = X.iloc[train_ind]
    #    y_train = y.iloc[train_ind]
    #    X_val = X.iloc[val_ind]
    #    y_val = y.iloc[val_ind]
#
    #    model.fit(
    #        X_train,
    #        y_train,
    #        eval_set=[(X_val, y_val)],
    #        early_stopping_rounds=400,
    #        verbose=False,
    #    )
#
    #    val_pred = model.predict(X_val)
    #    scores.append(accuracy_score(y_val, val_pred))
    #    del X_train, y_train, X_val, y_val
    #    gc.collect()
#
    #return np.mean(scores)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=40, gc_after_trial=True)

[32m[I 2023-02-11 00:19:39,055][0m A new study created in memory with name: no-name-04db221a-6fd8-424b-ada8-aa6bb934198a[0m


Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 325856
Parameters: { "max_depth", "subsample", "tree_method" } are not used.

Parameters: { "max_depth", "subsample", "tree_method" } are not used.

Parameters: { "max_depth", "subsample", "tree_method" } are not used.

Parameters: { "max_depth", "subsample", "tree_method" } are not used.

Parameters: { "max_depth", "subsample", "tree_method" } are not used.



[33m[W 2023-02-11 00:21:00,558][0m Trial 0 failed because of the following error: The value nan is not acceptable.[0m


Parameters: { "max_depth", "subsample", "tree_method" } are not used.

Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 325856
Parameters: { "max_depth", "subsample", "tree_method" } are not used.

Parameters: { "max_depth", "subsample", "tree_method" } are not used.

Parameters: { "max_depth", "subsample", "tree_method" } are not used.

Parameters: { "max_depth", "subsample", "tree_method" } are not used.

Parameters: { "max_depth", "subsample", "tree_method" } are not used.



[33m[W 2023-02-11 00:22:21,972][0m Trial 1 failed because of the following error: The value nan is not acceptable.[0m


Parameters: { "max_depth", "subsample", "tree_method" } are not used.

Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 325856


[33m[W 2023-02-11 00:22:56,804][0m Trial 2 failed because of the following error: KeyboardInterrupt()[0m
Traceback (most recent call last):
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/67/wq_xsymd3jvc5w2fx1ld_18h0000gn/T/ipykernel_93405/1896041416.py", line 11, in objective
    edu_df = load_education(config.education_path, study_size, degree_size)
  File "/var/folders/67/wq_xsymd3jvc5w2fx1ld_18h0000gn/T/ipykernel_93405/622072884.py", line 171, in load_education
    df_.loc[df_['fields_of_study'] == 'elektronik ve haberleşme mühendisi', 'fields_of_study'] = 'Electronics and Communication Engineering'
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py", line 723, in __setitem__
    iloc._setitem_with_indexer(indexer, value, self.name)
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/panda

KeyboardInterrupt: 

Trial 13 finished with value: 0.782455377517048 and parameters: {'skill_size': 106, 'lang_size': 27, 'study_size': 197, 'degree_size': 143, 'max_depth': 14, 'learning_rate': 0.017307888942046504, 'subsample': 0.8412208136691927, 'tree_method': 'hist', 'enable_categorical': True, 'random_state': 42, 'n_estimators': 5000}. Best is trial 13 with value: 0.782455377517048.

In [None]:
study.best_params

{'skill_size': 48, 'lang_size': 10, 'study_size': 9, 'degree_size': 11}