In [1]:
import os
import gc
import optuna
import warnings
import numpy as np
from utils import *
import pandas as pd
from copy import deepcopy
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from train_models import get_model_scores
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
warnings.filterwarnings('ignore')
%run skills.ipynb
%run languages.ipynb
%run work_experiences.ipynb
%run education.ipynb

class config:
    """Notebook-wide settings: input CSV locations plus the CV/seed constants.

    All CSVs live in one directory, so that directory is declared once
    (`data_dir`) and every `*_path` is derived from it. The resulting path
    strings are identical to the previously hard-coded ones.
    """
    # Single place to edit when the dataset folder moves.
    data_dir = '../../../datasets/garanti-bbva-data-camp'

    # Files read directly with pd.read_csv in the loading cell.
    train_path = f'{data_dir}/train_users.csv'
    test_path = f'{data_dir}/test_users.csv'
    sub_path = f'{data_dir}/submission.csv'

    # Files handed to the load_* helpers (skills, languages, education, experience).
    skills_path = f'{data_dir}/clean_skills_v2.csv'
    languages_path = f'{data_dir}/clean_language.csv'
    education_path = f'{data_dir}/clean_education_v2.csv'
    exp_path = f'{data_dir}/work_experiences.csv'

    # Shared by the StratifiedKFold splitter and the CatBoost random_state.
    seed = 42
    # Number of stratified CV folds.
    n_folds = 8

In [2]:
# Raw competition tables: labelled users, unlabelled users, and the submission template.
train_df = pd.read_csv(config.train_path)
test_df = pd.read_csv(config.test_path)
sub = pd.read_csv(config.sub_path)

print(f'train_df shape: {train_df.shape}')
print(f'test_df shape: {test_df.shape}')

# Stack train on top of test so feature engineering runs once over both.
# Test rows have no target column, so their target is NaN after the concat;
# that NaN is what later separates the two sets again.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([train_df, test_df], ignore_index=True)

def fix_location(dataframe: pd.DataFrame, feature: str = 'location') -> pd.DataFrame:
    """Return a copy of `dataframe` with a normalised location column.

    Steps, applied to `feature` on the copy (the input frame is not modified):
      1. rows containing a known alternative city spelling are overwritten
         with a canonical '<City>, Turkey' string;
      2. every value is stringified, 'Türkiye' is replaced by 'Turkey', the
         text is upper-cased and stripped, then passed through `translation`;
      3. any value containing one of the names from `load_tr_cities()` is
         collapsed to that name alone;
      4. a 0/1 column `<feature>_based_on_tr` is added, set to 1 when the
         final value is one of those names or exactly 'TURKEY'.

    Missing values are stringified in step 2 rather than kept as NaN.
    """
    known_cities = load_tr_cities()

    # (substring to look for, canonical replacement)
    aliases = (
        ('Kahraman Maras', 'Kahramanmaras, Turkey'),
        ('Şanliurfa', 'Sanliurfa, Turkey'),
        ('İçel', 'Mersin, Turkey'),
        ('Afyon', 'Afyonkarahisar, Turkey'),
    )

    def _normalise(raw):
        # Stringify first so NaN/None do not break the str methods below.
        text = str(raw).replace('Türkiye', 'Turkey')
        return translation(text.upper().strip())

    def _collapse_to_city(text):
        # Plain substring test, checked in the order load_tr_cities() yields;
        # a later match overrides an earlier one.
        # NOTE(review): short city names could also match inside unrelated
        # place names — confirm this is acceptable for the data.
        for name in known_cities:
            if name in text:
                text = name
        return text

    out = dataframe.copy()
    for pattern, canonical in aliases:
        hits = out[feature].astype(str).str.contains(pattern)
        out.loc[hits, feature] = canonical

    out[feature] = out[feature].apply(_normalise)
    out[feature] = out[feature].apply(_collapse_to_city)

    flag_col = f'{feature}_based_on_tr'
    out[flag_col] = out[feature].apply(
        lambda loc: 1 if loc in known_cities or loc == 'TURKEY' else 0
    )

    return out

# Per-user feature tables produced by the helper notebooks run at the top.
# The integer argument is passed straight through to each loader
# (presumably a top-N / frequency cut-off — TODO confirm in the loader notebooks).
skills_df = load_skills(config.skills_path, 120, exact_match=False)
lang_df = load_languages(config.languages_path)
school_df = load_school(config.education_path, 50, exact_match=True)
degree_df = load_degree(config.education_path, 18, exact_match=True)
study_df = load_study(config.education_path, 55, exact_match=False)
exp_df = load_work_experiences(config.exp_path)

df = fix_location(df)

# Left-join every feature table on user_id so users missing from a table are kept.
for user_features in (skills_df, lang_df, school_df, degree_df, study_df, exp_df):
    df = df.merge(user_features, how='left', on=['user_id'])

df = add_populations(df)
#df = add_employment(df)

# Group-level distinct counts: (new column, grouping key, column whose uniques are counted).
group_counts = (
    ('nunique_company_by_industries', 'industry', 'company_id'),
    ('active_employees_by_companies', 'company_id', 'user_id'),
    ('nunique_industries_by_companies', 'company_id', 'industry'),
)
for new_col, group_key, counted_col in group_counts:
    df[new_col] = df.groupby(by=group_key)[counted_col].transform('nunique')

df.head()

train_df shape: (53019, 4)
test_df shape: (13255, 3)


100%|██████████| 50/50 [00:01<00:00, 30.82it/s]
100%|██████████| 18/18 [00:00<00:00, 31.94it/s]


Unnamed: 0,user_id,industry,location,moved_after_2019,location_based_on_tr,skill_.net,skill_administration,skill_agile,skill_agile methodologies,skill_ajax,...,company_nunique_employees,company_lifetime,company_last_hire,avg_days_to_quit_diff,avg_days_to_quit_ratio,company_hire_ratio,population,nunique_company_by_industries,active_employees_by_companies,nunique_industries_by_companies
0,1301,Information Technology and Services,ISTANBUL,1.0,1,0.0,0.0,1.0,1.0,0.0,...,1410.0,11902.0,31.0,465.280537,2.040896,8.441135,15907951.0,3285.0,665.0,28.0
1,6950,Internet,ISTANBUL,0.0,1,0.0,0.0,0.0,0.0,0.0,...,66.0,1887.0,61.0,-131.480769,0.74968,28.590909,15907951.0,744.0,53.0,6.0
2,4880,Online Media,TURKEY,0.0,1,0.0,0.0,0.0,0.0,1.0,...,4.0,610.0,610.0,,,152.5,,32.0,4.0,1.0
3,26046,Telecommunications,ISTANBUL,0.0,1,0.0,0.0,0.0,0.0,0.0,...,1410.0,11902.0,31.0,-2496.719463,0.267609,8.441135,15907951.0,680.0,665.0,28.0
4,11005,Banking,ISTANBUL,0.0,1,1.0,0.0,0.0,0.0,0.0,...,678.0,5235.0,31.0,-299.166667,0.736726,7.721239,15907951.0,429.0,402.0,17.0


In [3]:
target = 'moved_after_2019'
drop_features = ['user_id']

# Columns handed to CatBoost as categorical.
# Commented-out candidates from earlier experiments: 'employee_last_location',
# 'company_2th_id', 'company_3th_id', 'employee_first_experience_month'.
cat_features = [
    'industry',
    'location',
    'company_id',
    'employee_last_experience_year',
    'employee_last_experience_month',
    'employee_first_experience_year',
]

# Going through str first turns missing values into a regular string level
# before the column becomes a pandas category.
for cat_col in cat_features:
    df[cat_col] = df[cat_col].astype(str).astype("category")

# Rows with a target value are training rows; rows without one came from the test file.
has_label = df[target].notnull()
train_set = df.loc[has_label].drop(columns=drop_features)
test_set = df.loc[~has_label].drop(columns=drop_features)

print(f'train_set: {train_set.shape}')
print(f'test_set: {test_set.shape}')

train_set: (53019, 290)
test_set: (13255, 290)


In [4]:
# Label vector and feature matrix for the labelled rows only.
y = train_set[target]
X = train_set.drop(columns=[target])

# One shuffled, seeded splitter shared by every trial so fold membership is identical across trials.
skf = StratifiedKFold(
    n_splits=config.n_folds,
    shuffle=True,
    random_state=config.seed,
)

def objective(trial):
    """Optuna objective: mean cross-validated accuracy of a CatBoostClassifier.

    Samples a CatBoost configuration from `trial`, fits one model per fold of
    the module-level `skf` splitter on the module-level `X` / `y`, and scores
    each fold with `accuracy_score` on its validation part.

    After every fold the running mean accuracy is reported to the trial, and
    the trial is abandoned early if the study's pruner asks for it, so a weak
    configuration does not have to pay for all folds.

    Args:
        trial: the Optuna trial used to sample hyper-parameters and to
            report intermediate values.

    Returns:
        The mean of the per-fold validation accuracies.

    Raises:
        optuna.TrialPruned: when the pruner stops the trial before the last fold.
    """

    params = {
        "one_hot_max_size": trial.suggest_int("one_hot_max_size", 90, 220),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.03),
        "depth": trial.suggest_int("depth", 6, 13),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
    }

    # Only sample the bootstrap-specific knob that belongs to the chosen bootstrap type.
    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    # Fixed (non-searched) settings.
    params["eval_metric"] = "Accuracy"
    params["cat_features"] = cat_features
    params["random_state"] = config.seed
    params["allow_writing_files"] = False
    params["iterations"] = 5000

    last_fold = skf.get_n_splits() - 1
    scores = list()
    for idx, (train_ind, val_ind) in enumerate(skf.split(X, y)):
        print(f"fold: {idx+1}")
        model = CatBoostClassifier(**params)
        X_train = X.iloc[train_ind]
        y_train = y.iloc[train_ind]
        X_val = X.iloc[val_ind]
        y_val = y.iloc[val_ind]

        # The validation fold doubles as the early-stopping set.
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            early_stopping_rounds=500,
            verbose=False,
        )

        val_pred = model.predict(X_val)
        scores.append(accuracy_score(y_val, val_pred))

        # Release the fold's data and the fitted model before the next fold starts.
        del model, X_train, y_train, X_val, y_val
        gc.collect()

        # Let the pruner compare this trial's running CV mean with earlier trials.
        # Skipped after the final fold: by then the full score is already paid for.
        trial.report(float(np.mean(scores)), step=idx)
        if idx < last_fold and trial.should_prune():
            raise optuna.TrialPruned()

    return np.mean(scores)

In [5]:
# Seed the sampler with the notebook-wide seed so the sequence of sampled
# hyper-parameters is repeatable after a kernel restart.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=config.seed),
)
study.optimize(objective, n_trials=50, gc_after_trial=True)

[32m[I 2023-02-24 09:18:03,797][0m A new study created in memory with name: no-name-9e191a6b-6ecf-4966-8212-021368f761e8[0m


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6
fold: 7
fold: 8


[32m[I 2023-02-24 09:32:42,796][0m Trial 0 finished with value: 0.7439788249210746 and parameters: {'one_hot_max_size': 147, 'learning_rate': 0.010086560938945707, 'depth': 7, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.14093818316493373}. Best is trial 0 with value: 0.7439788249210746.[0m


fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
fold: 6


[33m[W 2023-02-24 10:11:13,455][0m Trial 1 failed because of the following error: KeyboardInterrupt('')[0m
Traceback (most recent call last):
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/67/wq_xsymd3jvc5w2fx1ld_18h0000gn/T/ipykernel_6190/2569096181.py", line 37, in objective
    model.fit(
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/catboost/core.py", line 5128, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline, use_best_model,
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/catboost/core.py", line 2355, in _fit
    self._train(
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/catboost/core.py", line 1759, in _train
    self._object._train(train_pool, test_pool, params, allow_clear_pool, init_model._obj

KeyboardInterrupt: 