In [1]:
# NOTE: statement order is kept as-is on purpose; the two star imports can
# shadow (or be shadowed by) neighbouring names, so regrouping is not safe.
import os
import gc
import re
import optuna
import warnings
import numpy as np
from utils import *
import pandas as pd
from copy import deepcopy
from feature_extraction import *
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from train_models import get_model_scores
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
warnings.filterwarnings('ignore')

class config:
    """Central place for the file locations and tunables used by the notebook."""

    # Every input CSV lives in the same competition data directory.
    _data_dir = '../../../datasets/garanti-bbva-data-camp'
    train_path = f'{_data_dir}/train_users.csv'
    test_path = f'{_data_dir}/test_users.csv'
    sub_path = f'{_data_dir}/submission.csv'
    skills_path = f'{_data_dir}/clean_skills.csv'
    languages_path = f'{_data_dir}/clean_language.csv'
    education_path = f'{_data_dir}/clean_education.csv'
    exp_path = f'{_data_dir}/work_experiences.csv'
    del _data_dir  # helper only; keep the class namespace limited to real settings

    # Seed shared by the models and the fold splitter; number of CV folds.
    seed, n_folds = 42, 8

    # Passed as the second positional argument to the matching load_* helper
    # (presumably the number of most frequent categories to keep - confirm
    # in feature_extraction).
    skill_size = 120
    study_size = 55
    school_size = 50
    degree_size = 18
    language_size = 8

    # Forwarded as `exact_match=` to the matching load_* helper.
    school_exact_match = True
    degree_exact_match = True
    skill_exact_match = False
    study_exact_match = False

In [2]:
# Read the labelled users, the unlabelled users and the submission template.
train_df = pd.read_csv(config.train_path)
test_df = pd.read_csv(config.test_path)
sub = pd.read_csv(config.sub_path)

print(f'train_df shape: {train_df.shape}')
print(f'test_df shape: {test_df.shape}')

# Stack train on top of test so feature engineering runs once over both; the
# target column is NaN for the test rows. pd.concat replaces DataFrame.append,
# which was deprecated in pandas 1.4 and removed in pandas 2.0.
df = pd.concat([train_df, test_df]).reset_index(drop = True)

df.head()

train_df shape: (53019, 4)
test_df shape: (13255, 3)


Unnamed: 0,user_id,industry,location,moved_after_2019
0,1301,Information Technology and Services,"Istanbul, Istanbul, Turkey",1.0
1,6950,Internet,"Istanbul, Istanbul, Turkey",0.0
2,4880,Online Media,Turkey,0.0
3,26046,Telecommunications,"Istanbul, Istanbul, Turkey",0.0
4,11005,Banking,"Istanbul, Turkey",0.0


In [3]:
def fix_location(dataframe: pd.DataFrame, feature: str = "location") -> pd.DataFrame:
    """Normalise a free-text location column and flag Turkey-based rows.

    The input frame is not modified; all work happens on a copy.

    Steps, in order:
      1. Rows whose text contains a known alternative spelling are overwritten
         with a canonical "<city>, Turkey" value.
      2. "Türkiye" is replaced by "Turkey", then the text is upper-cased and
         stripped. Values go through ``str()`` first, so a missing value
         continues as text (NaN becomes "NAN").
      3. Every value is passed through ``translation`` (star-imported helper;
         presumably a character transliteration - confirm in utils).
      4. A value containing one of the names returned by ``load_tr_cities`` is
         collapsed to that name. Names are tried in the order returned, and
         each check sees the result of the previous one.
      5. ``f"{feature}_based_on_tr"`` is added: 1 when the final value is one
         of those names or exactly "TURKEY", else 0.

    Args:
        dataframe: frame holding the column to clean.
        feature: name of the location column.

    Returns:
        A copy of ``dataframe`` with ``feature`` rewritten and the flag column added.
    """
    tr_cities = load_tr_cities()
    df_ = dataframe.copy()

    # (substring to look for, canonical replacement). Applied sequentially, so
    # each mask is computed on the column as left by the previous fix.
    spelling_fixes = (
        ("Kahraman Maras", "Kahramanmaras, Turkey"),
        ("Şanliurfa", "Sanliurfa, Turkey"),
        ("İçel", "Mersin, Turkey"),
        ("Afyon", "Afyonkarahisar, Turkey"),
    )
    for needle, canonical in spelling_fixes:
        df_.loc[df_[feature].astype(str).str.contains(needle), feature] = canonical

    df_[feature] = df_[feature].apply(
        lambda x: str(x).replace("Türkiye", "Turkey").upper().strip()
    )
    df_[feature] = df_[feature].apply(lambda x: translation(str(x)))

    def _collapse_to_city(value):
        # One pass per row instead of one full-column pass per city; the
        # per-row sequence of checks is the same as looping over the column.
        for city in tr_cities:
            if city in value:
                value = city
        return value

    df_[feature] = df_[feature].apply(_collapse_to_city)
    df_[f"{feature}_based_on_tr"] = df_[feature].apply(
        lambda x: 1 if x in tr_cities or x == "TURKEY" else 0
    )

    return df_


# Per-user feature tables, each built by a star-imported loader from its CSV.
skills_df = load_skills(config.skills_path, config.skill_size, exact_match=config.skill_exact_match)
school_df = load_school(config.education_path, config.school_size, exact_match=config.school_exact_match)
degree_df = load_degree(config.education_path, config.degree_size, exact_match=config.degree_exact_match)
study_df = load_study(config.education_path, config.study_size, exact_match=config.study_exact_match)
lang_df = load_languages(config.languages_path, config.language_size)
exp_df = load_work_experiences(config.exp_path)

# Clean the location text, then left-join every feature table onto the users.
# NOTE(review): `df` is reassigned here, so re-running this cell without first
# re-running the load cell would merge the same tables a second time.
df = fix_location(df)
for feature_table in (skills_df, lang_df, school_df, degree_df, study_df, exp_df):
    df = df.merge(feature_table, on=["user_id"], how="left")
df = add_populations(df)
# df = add_employment(df)

# Group-level counts computed over train and test rows together.
by_industry = df.groupby(by="industry")
df["nunique_company_by_industries"] = by_industry["company_id"].transform("nunique")
by_company = df.groupby(by="company_id")
df["active_employees_by_companies"] = by_company["user_id"].transform("nunique")
# df['nunique_industries_by_companies'] = df.groupby(by = 'company_id')['industry'].transform('nunique')

# df['active_employees_by_industries'] = df.groupby(by = 'industry')['user_id'].transform('nunique')
# df['nunique_locations_by_industries'] = df.groupby(by = 'industry')['location'].transform('nunique')

# train_df = df.loc[df['moved_after_2019'].notnull()]
# test_df = df.loc[df['moved_after_2019'].isnull()]
# train_df, test_df = label_encode(["company_id", 'location'], train_df, test_df, fillna=True)
# df = train_df.append(test_df).reset_index(drop = True)

print(df.shape)
df.head()


100%|██████████| 50/50 [00:01<00:00, 30.20it/s]
100%|██████████| 18/18 [00:00<00:00, 31.77it/s]


(66274, 291)


Unnamed: 0,user_id,industry,location,moved_after_2019,location_based_on_tr,skill_.net,skill_administration,skill_agile,skill_agile methodologies,skill_ajax,...,company_skew_days_to_quit,company_nunique_employees,company_lifetime,company_last_hire,avg_days_to_quit_diff,avg_days_to_quit_ratio,company_hire_ratio,population,nunique_company_by_industries,active_employees_by_companies
0,1301,Information Technology and Services,ISTANBUL,1.0,1,0.0,0.0,1.0,1.0,0.0,...,1.817045,1410.0,11902.0,31.0,465.280537,2.040896,8.441135,15907951.0,3285.0,665.0
1,6950,Internet,ISTANBUL,0.0,1,0.0,0.0,0.0,0.0,0.0,...,1.387537,66.0,1887.0,61.0,-131.480769,0.74968,28.590909,15907951.0,744.0,53.0
2,4880,Online Media,TURKEY,0.0,1,0.0,0.0,0.0,0.0,1.0,...,,4.0,610.0,610.0,,,152.5,,32.0,4.0
3,26046,Telecommunications,ISTANBUL,0.0,1,0.0,0.0,0.0,0.0,0.0,...,1.817045,1410.0,11902.0,31.0,-2496.719463,0.267609,8.441135,15907951.0,680.0,665.0
4,11005,Banking,ISTANBUL,0.0,1,1.0,0.0,0.0,0.0,0.0,...,1.304231,678.0,5235.0,31.0,-299.166667,0.736726,7.721239,15907951.0,429.0,402.0


In [4]:
#df.loc[df['industry'] == '-1', 'industry'] = np.nan
#
#def fill_industry_with_skills(dataframe: pd.DataFrame, skills_dataframe: pd.DataFrame) -> pd.DataFrame:
#
#    df_ = dataframe.copy()
#    skills_df_ = skills_dataframe.copy()
#
#    non_missing_df = df_.loc[df_["industry"].notnull()][
#        [col for col in skills_df_.columns] + ["industry"]
#    ].dropna(subset=[col for col in df_.columns if col.startswith("skill")])
#
#    search_df = df_.loc[df_["industry"].isnull()][
#        [col for col in skills_df_.columns] + ["industry"]
#    ].dropna(subset=[col for col in df_.columns if col.startswith("skill")])[
#        non_missing_df.columns
#    ]
#    
#    match_results = dict()
#    for idx, row in search_df.iterrows():
#        employee = row['user_id']
#        missing_data_point = [row[col] for col in search_df.columns if col not in ['user_id', 'industry']]
#        manhattan_dist = np.abs(non_missing_df.drop(columns = ['user_id', 'industry'], axis = 1) - missing_data_point).sum(axis=1)
#        match_df = non_missing_df.assign(dist=manhattan_dist).copy()
#
#        if manhattan_dist.min() < 3:
#            i = 1
#            while i < 20:
#                avg_dist = match_df.sort_values("dist")[:i]["dist"].mean()
#                if avg_dist > 4.25:
#                    break
#                match_industry = match_df.sort_values("dist")[:i]["industry"].mode().values[0]
#                i += 1
#            match_results[employee] = match_industry
#        else:
#            match_industry = match_df.sort_values("dist")["industry"].values[0]
#            match_results[employee] = match_industry
#            continue
#        
#        del match_df
#        gc.collect()
#
#    print(f'industry matches: {len(match_results)}')
#    for key in match_results.keys():
#        df_.loc[df_['user_id'] == key, 'industry'] = match_results[key]
#
#    return df_
#
#print(f"industry missing values: {df['industry'].isnull().sum()}")
#df = fill_industry_with_skills(df, skills_df)
#df['industry'] = df['industry'].fillna('Computer Software')
#df['location'] = df['location'].fillna('ISTANBUL')
#print(f"industry missing values: {df['industry'].isnull().sum()}")
#df.head()

In [5]:
# Display the first 20 rows of the missing-value report for the merged frame.
# check_missing comes from a star import; judging by the rendered output it
# returns one row per feature, ordered by descending n_missing - confirm there.
check_missing(df)[:20]

Unnamed: 0,feature,n_missing,missing_ratio
265,employee_std_days_to_quit,43234,0.652352
281,company_skew_days_to_quit,29310,0.442255
266,employee_med_days_to_quit,29249,0.441334
262,employee_last_days_to_quit,29249,0.441334
263,employee_min_days_to_quit,29249,0.441334
264,employee_max_days_to_quit,29249,0.441334
126,language_arabic,28984,0.437336
134,total_languages,28984,0.437336
133,language_turkish,28984,0.437336
131,language_russian,28984,0.437336


In [6]:
target = "moved_after_2019"
cat_features = [
    "industry",
    "location",
    "company_id",
    "employee_last_experience_year",
    "employee_last_experience_month",
    #'employee_last_location',
    #'company_2th_id', 'company_3th_id'
    "employee_first_experience_year",
    #'employee_first_experience_month'
]
drop_features = ["user_id"]

# Replace "[", "]" and "<" in column names with "_" (presumably because
# XGBoost refuses feature names containing them - confirm). `re` must be
# imported explicitly in the import cell; it is not guaranteed to leak in
# through the star imports. Labels without those characters are left as-is,
# so non-string labels are never passed to the regex.
regex = re.compile(r"[\[\]<]")
df.columns = [
    regex.sub("_", col) if regex.search(str(col)) else col for col in df.columns
]

# Cast through str first so missing values become their own "nan" category
# instead of staying NaN inside the categorical.
for categorical_col in cat_features:
    df[categorical_col] = df[categorical_col].astype(str).astype("category")

# Rows with a label are training data; rows without one are the test users.
train_set = df.loc[df[target].notnull()].drop(columns=drop_features)
test_set = df.loc[df[target].isnull()].drop(columns=drop_features)

print(f"train_set: {train_set.shape}")
print(f"test_set: {test_set.shape}")


train_set: (53019, 290)
test_set: (13255, 290)


In [7]:
#cat1 = CatBoostClassifier(
#    **{
#        "one_hot_max_size": 150,
#        "depth": 12,
#        "learning_rate": 0.02,
#        "boosting_type": "Plain",
#        "bootstrap_type": "MVS",
#        "iterations": 4000,
#        "random_state": config.seed,
#        "cat_features": cat_features,
#        "eval_metric": "Accuracy",
#        "allow_writing_files": False,
#        "verbose": False,
#    }
#)

# CatBoost model used in the blend; categorical columns are handed over by name.
cat1 = CatBoostClassifier(
    one_hot_max_size=138,
    depth=11,
    learning_rate=0.0175,
    boosting_type="Plain",
    bootstrap_type="Bernoulli",
    subsample=0.81,
    iterations=5000,
    random_state=config.seed,
    cat_features=cat_features,
    eval_metric="Accuracy",
    allow_writing_files=False,
    verbose=False,
)

# XGBoost model used in the blend; relies on the pandas "category" dtype.
xgb1 = XGBClassifier(
    max_depth=12,
    subsample=0.84,
    learning_rate=0.012,
    enable_categorical=True,
    random_state=config.seed,
    tree_method="hist",
    n_estimators=5000,
)

# Second XGBoost configuration; defined here but not trained in the cells below.
xgb2 = XGBClassifier(
    max_depth=11,
    learning_rate=0.015238768735012887,
    subsample=0.8023883794058948,
    tree_method="hist",
    enable_categorical=True,
    random_state=config.seed,
    n_estimators=5000,
    objective="binary:logistic",
)

# One shuffled, seeded stratified splitter shared by every model.
skf = StratifiedKFold(shuffle=True, n_splits=config.n_folds, random_state=config.seed)


In [8]:
# Run xgb1 through get_model_scores (from train_models) with the shared splitter.
# The first result is later used to fill the submission and the second is
# scored against train_set[target]; both are indexed with [:, 1] downstream
# (presumably per-class probabilities for test rows and out-of-fold train rows
# - confirm in train_models). The meaning of the trailing False is not visible here.
xgb1_scores, xgb1_train_scores = get_model_scores(xgb1, skf, train_set, test_set, 'moved_after_2019', config.n_folds, False)

-----------------------------------| Fold 1 |-----------------------------------
train: (46391, 289)
val: (6628, 289)
[0]	validation_0-logloss:0.69067
[250]	validation_0-logloss:0.53616
[500]	validation_0-logloss:0.50396
[750]	validation_0-logloss:0.48310
[1000]	validation_0-logloss:0.46778
[1250]	validation_0-logloss:0.45644
[1500]	validation_0-logloss:0.44814
[1750]	validation_0-logloss:0.44294
[2000]	validation_0-logloss:0.43952
[2250]	validation_0-logloss:0.43737
[2500]	validation_0-logloss:0.43648
[2750]	validation_0-logloss:0.43655
[3000]	validation_0-logloss:0.43668
[3137]	validation_0-logloss:0.43742
fold accuracy: 0.7883222691611346
-----------------------------------| Fold 2 |-----------------------------------
train: (46391, 289)
val: (6628, 289)
[0]	validation_0-logloss:0.69068
[250]	validation_0-logloss:0.53611
[500]	validation_0-logloss:0.50571
[750]	validation_0-logloss:0.48506
[1000]	validation_0-logloss:0.46936
[1250]	validation_0-logloss:0.45736
[1500]	validation_0-lo

accuracy: 0.7904336181368943 <br>
folds avg accuracy: 0.7904335839357636

In [9]:
# Run cat1 through get_model_scores (from train_models) with the shared splitter.
# The first result is later used to fill the submission and the second is
# scored against train_set[target]; both are indexed with [:, 1] downstream
# (presumably per-class probabilities for test rows and out-of-fold train rows
# - confirm in train_models). The meaning of the trailing False is not visible here.
cat1_scores, cat1_train_scores = get_model_scores(cat1, skf, train_set, test_set, 'moved_after_2019', config.n_folds, False)

-----------------------------------| Fold 1 |-----------------------------------
train: (46391, 289)
val: (6628, 289)
0:	learn: 0.6577784	test: 0.6374472	best: 0.6374472 (0)	total: 247ms	remaining: 20m 33s
250:	learn: 0.7785777	test: 0.7371756	best: 0.7373265 (248)	total: 48.8s	remaining: 15m 24s
500:	learn: 0.8164730	test: 0.7506035	best: 0.7507544 (498)	total: 1m 38s	remaining: 14m 43s
750:	learn: 0.8565886	test: 0.7626735	best: 0.7643331 (725)	total: 2m 28s	remaining: 14m 1s
1000:	learn: 0.8922851	test: 0.7724804	best: 0.7724804 (1000)	total: 3m 19s	remaining: 13m 15s
1250:	learn: 0.9153068	test: 0.7780628	best: 0.7785154 (1238)	total: 4m 9s	remaining: 12m 26s
1500:	learn: 0.9338234	test: 0.7812311	best: 0.7819855 (1489)	total: 4m 59s	remaining: 11m 37s
1750:	learn: 0.9487832	test: 0.7836451	best: 0.7840978 (1748)	total: 5m 49s	remaining: 10m 48s
2000:	learn: 0.9604018	test: 0.7839469	best: 0.7851539 (1988)	total: 6m 39s	remaining: 9m 58s
2250:	learn: 0.9698864	test: 0.7856065	best:

accuracy: 0.7922631509458873 <br>
folds avg accuracy: 0.7922632031969215

In [10]:
# Decision threshold applied to the blended score, and the candidate CatBoost
# weights to try (the XGBoost weight is always the complement).
thr = 0.499
w_range = np.arange(0.02, 0.98, 0.02)

# Accuracy on the training labels for every candidate weight. Column 1 of each
# score array is presumably the positive-class probability - confirm in train_models.
cat_part = cat1_train_scores[:, 1]
xgb_part = xgb1_train_scores[:, 1]
score_ = [
    accuracy_score(
        train_set[target],
        np.where(cat_part * cat_weight + xgb_part * (1 - cat_weight) >= thr, 1, 0),
    )
    for cat_weight in w_range
]

max_score = np.max(score_)
print(f"max score: {max_score}")

# Keep the first weight that reaches the best accuracy, rounded to the grid step.
w1 = round(w_range[np.argmax(score_)], 2)
w2 = round(1-w1, 2)
print(f"weight w highest accuracy: {w1} - {w2}")

max score: 0.7941869895697768
weight w highest accuracy: 0.54 - 0.46


In [11]:
# Blend the two models' test scores with the tuned weights and threshold.
# NOTE(review): assumes the rows of `sub` are in the same order as `test_set` - confirm.
blended_test_score = (cat1_scores[:, 1] * w1) + (xgb1_scores[:, 1] * w2)
sub[target] = np.where(blended_test_score >= thr, 1, 0)

print(sub[target].value_counts())

sub.head(10)

0    9056
1    4199
Name: moved_after_2019, dtype: int64


Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,1
2,2110,0
3,55082,0
4,37165,0
5,42841,1
6,8407,0
7,17053,0
8,24437,1
9,44394,0


In [12]:
# The file name carries the best blended training accuracy, rounded to 6 decimals.
submission_file = f'../submissions/ensemble_cat_xgb_{round(max_score, 6)}.csv'
sub.to_csv(submission_file, index=False)

In [13]:
#cat1_v2 = CatBoostClassifier(
#    **{
#        "one_hot_max_size": 150,
#        "depth": 12,
#        'learning_rate': 0.02,
#        "boosting_type": "Plain",
#        "bootstrap_type": "MVS",
#        "iterations": 4000,
#        "random_state": config.seed,
#        "cat_features": cat_features,
#        "eval_metric": "Accuracy",
#        "allow_writing_files": False,
#        "verbose": False,
#    }
#)
#
#cat2 = CatBoostClassifier(
#    **{
#        "one_hot_max_size": 180,
#        "depth": 12,
#        'learning_rate': 0.02,
#        "boosting_type": "Plain",
#        "bootstrap_type": "Bernoulli",
#        'subsample': 0.85,
#        "iterations": 5000,
#        "random_state": config.seed,
#        "cat_features": cat_features,
#        "eval_metric": "Accuracy",
#        "allow_writing_files": False,
#        "verbose": False,
#    }
#)
#
#xgb1 = XGBClassifier(
#    **{
#        "max_depth": 12,
#        'subsample': 0.84,
#        'learning_rate': 0.012,
#        #"max_cat_to_onehot": 170,
#        'enable_categorical': True,
#        'random_state': config.seed,
#        'tree_method': 'hist',
#        'n_estimators': 5000
#    }
#)
#
##optuna accuracy: 0.7913200717625333
#xgb2 = XGBClassifier(
#    **{
#        'max_depth': 10,
#        'colsample_bylevel': 0.5278208423543558,
#        'learning_rate': 0.013365198710298768,
#        'subsample': 0.8970592179744503,
#        'tree_method': 'hist',
#        'enable_categorical': True,
#        'random_state': config.seed,
#        'n_estimators': 5000,
#        'objective': 'binary:logistic'
#    }
#)
#
#xgb2 = XGBClassifier(
#    **{
#        'max_depth': 8,
#        'colsample_bytree': 0.9022539835244943,
#        'learning_rate': 0.029665791085370145,
#        'subsample': 0.10503841501004575,
#        'tree_method': 'hist',
#        'enable_categorical': True,
#        'random_state': 42,
#        'n_estimators': 5000,
#        'objective': 'binary:logistic',
#        'gamma': 2.846243837248615e-05
#    }
#)
#
#xgb1_v2 = XGBClassifier(
#    **{
#        "max_depth": 12,
#        'subsample': 0.84,
#        'learning_rate': 0.012,
#        #"max_cat_to_onehot": 170,
#        'enable_categorical': True,
#        'random_state': config.seed,
#        'tree_method': 'hist',
#        'n_estimators': 5000
#    }
#)
#
#xgb4 = XGBClassifier(
#    **{
#        "max_depth": 13,
#        'subsample': 0.88,
#        'learning_rate': 0.018,
#        'colsample_bylevel': 0.72,
#        'enable_categorical': True,
#        'random_state': config.seed,
#        'tree_method': 'hist',
#        'n_estimators': 5000
#    }
#)
#
##optuna accuracy: 0.791263431000755
#xgb7 = XGBClassifier(
#    **{
#        'max_depth': 13,
#        'colsample_bylevel': 0.5015371836125226,
#        'learning_rate': 0.01378678089792197,
#        'subsample': 0.8887196385241469,
#        'tree_method': 'hist',
#        'enable_categorical': True,
#        'random_state': config.seed,
#        'n_estimators': 5000,
#        'objective': 'binary:logistic'
#    }
#)