In [7]:
import os
import gc
import optuna
import numpy as np
from utils import *
import pandas as pd
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score
%run skills.ipynb
%run languages.ipynb
%run work_experiences.ipynb
%run education.ipynb

class config:
    """Static notebook settings: dataset file locations, RNG seed and CV fold count."""

    # All competition CSV files live in the same dataset directory.
    _data_dir = '../../../datasets/garanti-bbva-data-camp'

    # User-level tables (train carries the label, test does not) and the submission template.
    train_path = f'{_data_dir}/train_users.csv'
    test_path = f'{_data_dir}/test_users.csv'
    sub_path = f'{_data_dir}/submission.csv'

    # Auxiliary per-user tables consumed by the feature loaders.
    skills_path = f'{_data_dir}/skills.csv'
    languages_path = f'{_data_dir}/languages.csv'
    education_path = f'{_data_dir}/education.csv'
    exp_path = f'{_data_dir}/work_experiences.csv'

    # Seed passed to the models and number of cross-validation folds.
    seed = 42
    n_folds = 6

In [8]:
# Load the labelled training users, the unlabelled test users and the submission template.
train_df = pd.read_csv(config.train_path)
test_df = pd.read_csv(config.test_path)
sub = pd.read_csv(config.sub_path)

print(f'train_df shape: {train_df.shape}')
print(f'test_df shape: {test_df.shape}')

# Stack train and test so the feature engineering below is applied to both at once.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the resulting frame is the same.
df = pd.concat([train_df, test_df]).reset_index(drop=True)
df.head()

train_df shape: (53019, 4)
test_df shape: (13255, 3)


Unnamed: 0,user_id,industry,location,moved_after_2019
0,1301,Information Technology and Services,"Istanbul, Istanbul, Turkey",1.0
1,6950,Internet,"Istanbul, Istanbul, Turkey",0.0
2,4880,Online Media,Turkey,0.0
3,26046,Telecommunications,"Istanbul, Istanbul, Turkey",0.0
4,11005,Banking,"Istanbul, Turkey",0.0


In [9]:
def fix_location(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Normalise the free-text ``location`` column and flag Turkey-based users.

    Works on a copy of ``dataframe``; the input frame is not modified.

    Steps:
      1. Rewrite a few known spelling variants to one canonical "<City>, Turkey" form.
      2. Stringify every value, replace 'Türkiye' with 'Turkey', upper-case, strip
         and pass the result through ``translation``.
      3. For each city returned by ``load_tr_cities()``, replace any location that
         contains the city name with the bare city name.
      4. Add ``based_on_tr``: 1 when the final location is one of those cities or
         exactly 'TURKEY', otherwise 0.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame with a ``location`` column.

    Returns
    -------
    pd.DataFrame
        Copy of the input with ``location`` normalised and ``based_on_tr`` added.
    """
    df_ = dataframe.copy()

    # (substring to look for, canonical replacement) - applied in this order.
    aliases = (
        ('Kahraman Maras', 'Kahramanmaras, Turkey'),
        ('Şanliurfa', 'Sanliurfa, Turkey'),
        ('İçel', 'Mersin, Turkey'),
        ('Afyon', 'Afyonkarahisar, Turkey'),
    )
    for pattern, canonical in aliases:
        # The patterns are plain text, so match them literally.
        matches = df_['location'].astype(str).str.contains(pattern, regex=False)
        df_.loc[matches, 'location'] = canonical

    # One pass instead of three: stringify -> Türkiye/Turkey -> upper/strip -> translation.
    df_['location'] = df_['location'].apply(
        lambda x: translation(str(x).replace('Türkiye', 'Turkey').upper().strip())
    )

    tr_cities = load_tr_cities()
    for city in tr_cities:
        df_['location'] = df_['location'].apply(lambda x: city if city in x else x)

    # Computed once, after every city replacement has been applied. Doing this inside
    # the loop repeated a full pass per city and left the column missing whenever the
    # city list was empty.
    df_['based_on_tr'] = df_['location'].apply(
        lambda x: 1 if x in tr_cities or x == 'TURKEY' else 0
    )

    return df_

# Clean up the location text before any features are attached.
df = fix_location(df)

# Build the per-user feature tables from the auxiliary CSV files.
skills_df = load_skills(config.skills_path, 30)
lang_df = load_languages(config.languages_path)
edu_df = load_education(config.education_path, 12)
exp_df = load_work_experiences(config.exp_path)

# Left-join each feature table onto the user frame, in the same order as they were loaded,
# so users without rows in a table are kept.
for feature_frame in (skills_df, lang_df, edu_df, exp_df):
    df = df.merge(feature_frame, on=['user_id'], how='left')

print(df.shape)
df.head()

Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 320842


100%|██████████| 30/30 [00:07<00:00,  4.22it/s]
100%|██████████| 12/12 [00:01<00:00,  7.64it/s]
100%|██████████| 12/12 [00:02<00:00,  4.23it/s]
100%|██████████| 10/10 [00:02<00:00,  4.31it/s]


(66274, 80)


Unnamed: 0,user_id,industry,location,moved_after_2019,based_on_tr,skill_Java,skill_JavaScript,skill_C#,skill_SQL,skill_Software Development,...,degree_Bachelor of Engineering,degree_Doctor of Philosophy,degree_Master of Business Administration,degree_Mühendislik Fakültesi Mezunu,total_experience,last_experience,nunique_company,employee_avg_days_to_quit,company_id,company_avg_days_to_quit
0,1301,Information Technology and Services,ISTANBUL,1.0,1,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1341.0,306.0,3.0,447.0,26.0,912.280537
1,6950,Internet,ISTANBUL,0.0,1,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,2101.0,699.0,4.0,525.25,1337.0,393.769231
2,4880,Online Media,TURKEY,0.0,1,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1826.0,610.0,4.0,456.5,4366.0,
3,26046,Telecommunications,ISTANBUL,0.0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3409.0,1553.0,1.0,3409.0,26.0,912.280537
4,11005,Banking,ISTANBUL,0.0,1,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3409.0,640.0,3.0,1136.333333,1562.0,837.166667


In [10]:
target = 'moved_after_2019'
cat_features = ['industry', 'location']
drop_features = ['user_id']

# Cast each categorical predictor to str first and then to the pandas category dtype.
for col in cat_features:
    df[col] = df[col].astype(str).astype("category")

# Rows with a label form the training set; rows without one are the test set.
has_label = df[target].notnull()
train_set = df.loc[has_label].drop(columns=drop_features)
test_set = df.loc[~has_label].drop(columns=drop_features)

print(f'train_set: {train_set.shape}')
print(f'test_set: {test_set.shape}')

train_set: (53019, 79)
test_set: (13255, 79)


In [11]:
#def objective(trial):
#
#    df_ = df.copy()
#    skill_size = trial.suggest_int('skill_size', 10, 50)
#    lang_size = trial.suggest_int('lang_size', 4, 20)
#    study_size = trial.suggest_int('study_size', 4, 20)
#    degree_size = trial.suggest_int('degree_size', 4, 20)
#
#    skills_df = load_skills(config.skills_path, skill_size)
#    lang_df = load_languages(config.languages_path, lang_size)
#    edu_df = load_education(config.education_path, study_size, degree_size)
#    exp_df = load_work_experiences(config.exp_path)
#
#    df_ = df_.merge(skills_df, on = ['user_id'], how = 'left')
#    df_ = df_.merge(lang_df, on = ['user_id'], how = 'left')
#    df_ = df_.merge(edu_df, on = ['user_id'], how = 'left')
#    df_ = df_.merge(exp_df, on = ['user_id'], how = 'left')
#
#    target = 'moved_after_2019'
#    cat_features = ['industry', 'location']
#    drop_features = ['user_id']
#
#    for caterogical_col in cat_features:
#        df_[caterogical_col] = df_[caterogical_col].astype(str).astype("category")
#
#    train_set = df_.loc[df_[target].notnull()].drop(columns=drop_features, axis = 1)
#    test_set = df_.loc[df_[target].isnull()].drop(columns=drop_features, axis = 1)
#
#    X = train_set.drop(columns = [target], axis = 1)
#    y = train_set[target]
#
#    params = {'one_hot_max_size': 179,
#              'depth': 12,
#              'boosting_type': 'Plain',
#              'bootstrap_type': 'MVS'}
#
#    params['verbose'] = False
#    params['random_state'] = config.seed
#    params['cat_features'] = cat_features
#    params['eval_metric'] = 'Accuracy'
#    params['allow_writing_files'] = False
#    params["iterations"] = 5000
#
#    model = CatBoostClassifier(**params)
#    kf = KFold(n_splits=config.n_folds)
#
#    scores = list()
#    for idx, (train_ind, val_ind) in enumerate(kf.split(X, y)):
#        print(f'fold: {idx+1}')
#        model = CatBoostClassifier(**params)
#        X_train = X.iloc[train_ind]
#        y_train = y.iloc[train_ind]
#        X_val = X.iloc[val_ind]
#        y_val = y.iloc[val_ind]
#
#        model.fit(
#            X_train,
#            y_train,
#            eval_set=[(X_val, y_val)],
#            early_stopping_rounds=400,
#            verbose=False,
#        )
#
#        val_pred = model.predict(X_val)
#        scores.append(accuracy_score(y_val, val_pred))
#        del X_train, y_train, X_val, y_val
#        gc.collect()
#
#    return np.mean(scores)
#
#study = optuna.create_study(direction="maximize")
#study.optimize(objective, n_trials=20)

{'skill_size': 48, 'lang_size': 10, 'study_size': 9, 'degree_size': 11}

In [12]:
# Separate the label from the predictors; both are read by objective() as globals.
y = train_set[target]
X = train_set.drop(columns=[target])

def objective(trial):
    """Optuna objective: mean K-fold validation accuracy of a CatBoost classifier.

    Samples CatBoost hyperparameters from ``trial``, trains one model per fold on the
    module-level ``X`` / ``y`` and returns the average validation accuracy, which the
    study maximises.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``one_hot_max_size``, ``depth``, ``boosting_type``,
        ``bootstrap_type`` and, depending on the bootstrap type,
        ``bagging_temperature`` or ``subsample``.

    Returns
    -------
    float
        Mean accuracy over the ``config.n_folds`` validation folds.
    """
    params = {
        "one_hot_max_size": trial.suggest_int("one_hot_max_size", 120, 200),
        #"colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 6, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
    }

    # Only sample the extra knob that belongs to the chosen bootstrap type.
    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    # Fixed (non-tuned) settings shared by every trial.
    params['eval_metric'] = 'Accuracy'
    params['cat_features'] = cat_features
    params['random_state'] = config.seed
    params['allow_writing_files'] = False
    params["iterations"] = 5000

    # KFold is used with its defaults, i.e. contiguous, unshuffled folds.
    kf = KFold(n_splits=config.n_folds)

    scores = list()
    for idx, (train_ind, val_ind) in enumerate(kf.split(X, y)):
        print(f'fold: {idx+1}')
        # A fresh model per fold; the extra instance that used to be built before
        # the loop was never used and has been removed.
        model = CatBoostClassifier(**params)
        X_train = X.iloc[train_ind]
        y_train = y.iloc[train_ind]
        X_val = X.iloc[val_ind]
        y_val = y.iloc[val_ind]

        # NOTE(review): the same fold drives early stopping and is then scored,
        # so the reported accuracy may be optimistic.
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            early_stopping_rounds=400,
            verbose=False,
        )

        val_pred = model.predict(X_val)
        scores.append(accuracy_score(y_val, val_pred))

        # Release the fold slices before the next fold is materialised.
        del X_train, y_train, X_val, y_val
        gc.collect()

    return np.mean(scores)


# Seed the sampler so that re-running the notebook proposes the same sequence of
# hyperparameter trials; without a seed each run explores different configurations.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=config.seed),
)
study.optimize(objective, n_trials=8)

[32m[I 2023-02-07 21:00:34,506][0m A new study created in memory with name: no-name-7dfe71e7-65e5-4bdc-92cc-15b4c652f12a[0m


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[32m[I 2023-02-07 21:24:04,892][0m Trial 0 finished with value: 0.7630470838663296 and parameters: {'one_hot_max_size': 133, 'depth': 12, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.959419387658166}. Best is trial 0 with value: 0.7630470838663296.[0m


fold: 1


[33m[W 2023-02-07 21:28:51,509][0m Trial 1 failed because of the following error: KeyboardInterrupt('')[0m
Traceback (most recent call last):
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/67/wq_xsymd3jvc5w2fx1ld_18h0000gn/T/ipykernel_71305/1631085790.py", line 37, in objective
    model.fit(
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/catboost/core.py", line 5128, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline, use_best_model,
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/catboost/core.py", line 2355, in _fit
    self._train(
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/catboost/core.py", line 1759, in _train
    self._object._train(train_pool, test_pool, params, allow_clear_pool, init_model._ob

KeyboardInterrupt: 

In [None]:
# Display the hyperparameters of the best trial recorded by `study`.
study.best_params