In [1]:
import os
import gc
import optuna
import warnings
import numpy as np
from utils import *
import pandas as pd
from copy import deepcopy
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
warnings.filterwarnings('ignore')
%run skills.ipynb
%run languages.ipynb
%run work_experiences.ipynb
%run education.ipynb

class config:
    train_path = '../../../datasets/garanti-bbva-data-camp/train_users.csv'
    test_path = '../../../datasets/garanti-bbva-data-camp/test_users.csv'
    sub_path = '../../../datasets/garanti-bbva-data-camp/submission.csv'
    skills_path = '../../../datasets/garanti-bbva-data-camp/skills.csv'
    languages_path = '../../../datasets/garanti-bbva-data-camp/languages.csv'
    education_path = '../../../datasets/garanti-bbva-data-camp/education.csv'
    exp_path = '../../../datasets/garanti-bbva-data-camp/work_experiences.csv'
    seed = 42
    n_folds = 6

(1398443, 2)


  0%|          | 86/51156 [00:25<4:10:37,  3.40it/s]


KeyboardInterrupt: 

KeyboardInterrupt: 

In [None]:
train_df = pd.read_csv(config.train_path)
test_df = pd.read_csv(config.test_path)
sub = pd.read_csv(config.sub_path)

print(f'train_df shape: {train_df.shape}')
print(f'test_df shape: {test_df.shape}')
df = train_df.append(test_df).reset_index(drop = True)
df.loc[df['industry'] == '-1', 'industry'] = np.nan

def fix_location(dataframe: pd.DataFrame, feature: str = 'location') -> pd.DataFrame:

    tr_cities = load_tr_cities()
    df_ = dataframe.copy()
    df_.loc[df_[feature].astype(str).str.contains('Kahraman Maras'), feature] = 'Kahramanmaras, Turkey'
    df_.loc[df_[feature].astype(str).str.contains('Şanliurfa'), feature] = 'Sanliurfa, Turkey'
    df_.loc[df_[feature].astype(str).str.contains('İçel'), feature] = 'Mersin, Turkey'
    df_.loc[df_[feature].astype(str).str.contains('Afyon'), feature] = 'Afyonkarahisar, Turkey'
    df_[feature] = df_[feature].apply(lambda x: str(x).replace('Türkiye', 'Turkey'))
    df_[feature] = df_[feature].apply(lambda x: x.upper().strip())
    df_[feature] = df_[feature].apply(lambda x: translation(str(x)))
    df_[f'{feature}_based_on_tr'] = df_[feature].apply(lambda x: 1 if x in tr_cities or x == 'TURKEY' else 0)
    for city in tr_cities:
        df_[feature] = df_[feature].apply(lambda x: city if city in x else x)
        
    return df_

df = fix_location(df)

skills_df = load_skills(config.skills_path, 120)
lang_df = load_languages(config.languages_path, 10)
edu_df = load_education(config.education_path, 50, 50, 50)
exp_df = load_work_experiences(config.exp_path)

df = df.merge(skills_df, on = ['user_id'], how = 'left')
df = df.merge(lang_df, on = ['user_id'], how = 'left')
df = df.merge(edu_df, on = ['user_id'], how = 'left')
df = df.merge(exp_df, on = ['user_id'], how = 'left')

df['nunique_company_by_industries'] = df.groupby(by = 'industry')['company_id'].transform('nunique')

#df['nunique_employees_by_industries'] = df.groupby(by = 'industry')['user_id'].transform('nunique')
#df['nunique_locations_by_industries'] = df.groupby(by = 'industry')['location'].transform('nunique')

#df['employee_total_last_exp_diff'] = df['employee_total_experience'] - df['employee_last_experience']

#df['nunique_employees_by_industries'] = df.groupby(by = 'industry')['user_id'].transform('nunique')

print(df.shape)
df.head()

train_df shape: (53019, 4)
test_df shape: (13255, 3)
Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 326599


100%|██████████| 120/120 [00:03<00:00, 30.87it/s]
100%|██████████| 50/50 [00:01<00:00, 47.63it/s]


(66274, 311)


Unnamed: 0,user_id,industry,location,moved_after_2019,location_based_on_tr,skill_java,skill_javascript,skill_c#,skill_sql,skill_software development,...,employee_last_experience_month_cos,company_avg_days_to_quit,company_std_days_to_quit,company_max_days_to_quit,company_med_days_to_quit,company_skew_days_to_quit,company_nunique_employees,avg_days_to_quit_diff,avg_days_to_quit_ratio,nunique_company_by_industries
0,1301,Information Technology and Services,ISTANBUL,1.0,0,1.0,0.0,0.0,0.0,1.0,...,6.123234000000001e-17,912.280537,808.719538,5206.0,701.0,1.817045,1410.0,465.280537,2.040896,3285.0
1,6950,Internet,ISTANBUL,0.0,0,1.0,1.0,1.0,1.0,1.0,...,0.5,393.769231,376.905194,1155.0,184.0,1.387537,66.0,-131.480769,0.74968,744.0
2,4880,Online Media,TURKEY,0.0,1,0.0,1.0,1.0,1.0,0.0,...,-0.8660254,,,,,,4.0,,,32.0
3,26046,Telecommunications,ISTANBUL,0.0,0,0.0,0.0,0.0,0.0,0.0,...,-1.83697e-16,912.280537,808.719538,5206.0,701.0,1.817045,1410.0,-2496.719463,0.267609,680.0
4,11005,Banking,ISTANBUL,0.0,0,0.0,0.0,1.0,0.0,0.0,...,-0.5,837.166667,767.031502,3957.0,609.0,1.304231,678.0,-299.166667,0.736726,429.0


In [None]:
#def fill_industry_with_skills(dataframe: pd.DataFrame, skills_dataframe: pd.DataFrame) -> pd.DataFrame:
#
#    df_ = dataframe.copy()
#    skills_df_ = skills_dataframe.copy()
#
#    non_missing_df = df_.loc[df_["industry"].notnull()][
#        [col for col in skills_df_.columns] + ["industry"]
#    ].dropna(subset=[col for col in df_.columns if col.startswith("skill")])
#
#    search_df = df_.loc[df_["industry"].isnull()][
#        [col for col in skills_df_.columns] + ["industry"]
#    ].dropna(subset=[col for col in df_.columns if col.startswith("skill")])[
#        non_missing_df.columns
#    ]
#    
#    match_results = dict()
#    for idx, row in search_df.iterrows():
#        employee = row['user_id']
#        missing_data_point = [row[col] for col in search_df.columns if col not in ['user_id', 'industry']]
#        manhattan_dist = np.abs(non_missing_df.drop(columns = ['user_id', 'industry'], axis = 1) - missing_data_point).sum(axis=1)
#        match_df = non_missing_df.assign(dist=manhattan_dist).copy()
#        #print(f"Employee: {employee}")
#        #print(f"Minimum distance: {manhattan_dist.min()}")
#
#        if manhattan_dist.min() < 3:
#
#            i = 1
#            while i < 10:
#                avg_dist = match_df.sort_values("dist")[:i]["dist"].mean()
#                if avg_dist > 4.25:
#                    #print(avg_dist)
#                    #print(match_industry)
#                    break
#                match_industry = match_df.sort_values("dist")[:i]["industry"].mode().values[0]
#                #print(f"i: {i}")
#                #print(f"Average Distance: {avg_dist}")
#                #print(f"Industry Match: {match_industry}")
#                i += 1
#
#            match_results[employee] = match_industry
#        else:
#            match_industry = match_df.sort_values("dist")["industry"].values[0]
#            match_results[employee] = match_industry
#            continue
#        
#        del match_df
#        gc.collect()
#
#    print(f'industry matches: {len(match_results)}')
#    for key in match_results.keys():
#        df_.loc[df_['user_id'] == key, 'industry'] = match_results[key]
#
#    return df_
#
#print(f"industry missing values: {df['industry'].isnull().sum()}")
#df = fill_industry_with_skills(df, skills_df)
#df['industry'] = df['industry'].fillna('Computer Software')
#print(f"industry missing values: {df['industry'].isnull().sum()}")
#df.head()

In [None]:
target = 'moved_after_2019'
cat_features = ['industry', 'location', 'company_id', 'employee_last_experience_year', 'employee_last_experience_month']
drop_features = ['user_id']

for caterogical_col in cat_features:
    df[caterogical_col] = df[caterogical_col].astype(str).astype("category")

train_set = df.loc[df[target].notnull()].drop(columns=drop_features, axis = 1)
test_set = df.loc[df[target].isnull()].drop(columns=drop_features, axis = 1)

print(f'train_set: {train_set.shape}')
print(f'test_set: {test_set.shape}')

X = train_set.drop(columns = [target], axis = 1)
y = train_set[target]
kf = KFold(n_splits=config.n_folds, shuffle =True, random_state=config.seed)

train_set: (53019, 310)
test_set: (13255, 310)


In [None]:
def objective(trial):

    params = {
        "one_hot_max_size": trial.suggest_int("one_hot_max_size", 90, 220),
        #"learning_rate": trial.suggest_float("learning_rate", 0.001, 0.05),
        "depth": trial.suggest_int("depth", 5, 14),
        "boosting_type": trial.suggest_categorical(
            "boosting_type", ["Ordered", "Plain"]
        ),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
    }

    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float(
            "bagging_temperature", 0, 10
        )
    elif params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    params["eval_metric"] = "Accuracy"
    params["cat_features"] = cat_features
    params["random_state"] = config.seed
    params["allow_writing_files"] = False
    params["iterations"] = 5000

    scores = list()
    for idx, (train_ind, val_ind) in enumerate(kf.split(X, y)):
        print(f"fold: {idx+1}")
        model = CatBoostClassifier(**params)
        X_train = X.iloc[train_ind]
        y_train = y.iloc[train_ind]
        X_val = X.iloc[val_ind]
        y_val = y.iloc[val_ind]

        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            early_stopping_rounds=400,
            verbose=False,
        )

        val_pred = model.predict(X_val)
        scores.append(accuracy_score(y_val, val_pred))
        del X_train, y_train, X_val, y_val
        gc.collect()

    return np.mean(scores)


In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=40, gc_after_trial=True)

[32m[I 2023-02-14 03:17:30,911][0m A new study created in memory with name: no-name-5cd734b3-d20b-415b-98d9-79c001500916[0m


fold: 1


[33m[W 2023-02-14 03:24:48,897][0m Trial 0 failed because of the following error: KeyboardInterrupt('')[0m
Traceback (most recent call last):
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/67/wq_xsymd3jvc5w2fx1ld_18h0000gn/T/ipykernel_87610/2585085564.py", line 37, in objective
    model.fit(
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/catboost/core.py", line 5128, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline, use_best_model,
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/catboost/core.py", line 2355, in _fit
    self._train(
  File "/Users/sercanyesiloz/opt/anaconda3/lib/python3.9/site-packages/catboost/core.py", line 1759, in _train
    self._object._train(train_pool, test_pool, params, allow_clear_pool, init_model._ob

KeyboardInterrupt: 

In [None]:

#
    #if params["bootstrap_type"] == "Bayesian":
    #    params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    #elif params["bootstrap_type"] == "Bernoulli":
    #    params["subsample"] = trial.suggest_float("subsample", 0.1, 1)
#
    #params['eval_metric'] = 'Accuracy'
    #params['cat_features'] = cat_features
    #params['random_state'] = config.seed
    #params['allow_writing_files'] = False
    #params["iterations"] = 5000