In [1]:
import os
import gc
import optuna
import numpy as np
from utils import *
import pandas as pd
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score
%run skills.ipynb
%run languages.ipynb
%run work_experiences.ipynb
%run education.ipynb

class config:
    """Notebook-wide settings: dataset locations, random seed and fold count."""

    # Base directory; every CSV path below is this prefix plus a file name.
    path = '../../../datasets/garanti-bbva-data-camp/'
    sub_path = path + 'submission.csv'
    skills_path = path + 'skills.csv'
    languages_path = path + 'languages.csv'
    education_path = path + 'education.csv'
    exp_path = path + 'work_experiences.csv'

    # Seed handed to the model and number of cross-validation splits.
    seed = 42
    n_folds = 6

In [2]:
# Read the train/test user tables from the dataset directory, then the
# submission file, and report the shape of the two user tables.
train_df, test_df = (
    pd.read_csv(os.path.join(config.path, file_name))
    for file_name in ('train_users.csv', 'test_users.csv')
)
sub = pd.read_csv(config.sub_path)

for label, frame in (('train_df', train_df), ('test_df', test_df)):
    print(f'{label} shape: {frame.shape}')

train_df shape: (53019, 4)
test_df shape: (13255, 3)


In [3]:
# Stack the train rows on top of the test rows so later feature steps run on
# both at once; the index is renumbered 0..n-1.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
df = pd.concat([train_df, test_df], ignore_index = True)

df.head()

Unnamed: 0,user_id,industry,location,moved_after_2019
0,1301,Information Technology and Services,"Istanbul, Istanbul, Turkey",1.0
1,6950,Internet,"Istanbul, Istanbul, Turkey",0.0
2,4880,Online Media,Turkey,0.0
3,26046,Telecommunications,"Istanbul, Istanbul, Turkey",0.0
4,11005,Banking,"Istanbul, Turkey",0.0


In [4]:
#df.loc[df['location'].astype(str).str.contains('Afyon')]

In [5]:
def fix_location(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Normalise the ``location`` column and add a ``based_on_tr`` flag.

    Works on a copy, so the input frame is left untouched. Steps, in order:

    1. Rows whose location contains a known alternative spelling are
       overwritten with a canonical ``'<City>, Turkey'`` string.
    2. ``'Türkiye'`` is replaced by ``'Turkey'``, the text is upper-cased and
       stripped, and the result is passed through ``translation``.
    3. Any location containing one of the names returned by
       ``load_tr_cities`` is replaced by that name alone. Cities are applied
       in the order they are returned, so a later match overrides an earlier
       one.
    4. ``based_on_tr`` is 1 when the final location equals one of those names
       or ``'TURKEY'``, otherwise 0.

    ``translation`` and ``load_tr_cities`` are helpers that are not defined in
    this notebook section (presumably provided by ``utils`` - confirm).

    Args:
        dataframe: frame with a ``location`` column. Missing values are
            stringified by ``str`` before processing.

    Returns:
        A new frame with the cleaned ``location`` and the ``based_on_tr``
        column (always present, even if the city list is empty).
    """
    df_ = dataframe.copy()

    # (substring to look for, canonical replacement); applied one after the
    # other so each check sees the result of the previous replacement.
    aliases = (
        ('Kahraman Maras', 'Kahramanmaras, Turkey'),
        ('Şanliurfa', 'Sanliurfa, Turkey'),
        ('İçel', 'Mersin, Turkey'),
        ('Afyon', 'Afyonkarahisar, Turkey'),
    )
    for fragment, canonical in aliases:
        hits = df_['location'].astype(str).str.contains(fragment, regex=False)
        df_.loc[hits, 'location'] = canonical

    df_['location'] = df_['location'].apply(
        lambda x: translation(str(x).replace('Türkiye', 'Turkey').upper().strip())
    )

    tr_cities = list(load_tr_cities())

    def _collapse_to_city(value):
        # Same effect as one pass over the whole column per city: every city
        # is tested in order against the current (possibly replaced) value.
        for city in tr_cities:
            if city in value:
                value = city
        return value

    df_['location'] = df_['location'].apply(_collapse_to_city)

    # Computed once, after all replacements. Previously this sat inside the
    # per-city loop, so it was recomputed for every city and never created
    # when the city list was empty.
    turkish_locations = set(tr_cities) | {'TURKEY'}
    df_['based_on_tr'] = df_['location'].isin(list(turkish_locations)).astype('int64')

    return df_

In [6]:
# Clean the location column, then attach the per-user feature tables.
df = fix_location(df)

# The numeric arguments are passed to the loaders unchanged; their meaning is
# defined by load_skills / load_education in the helper notebooks.
skills_df = load_skills(config.skills_path, 30)
lang_df = load_languages(config.languages_path)
edu_df = load_education(config.education_path, 12)
# Work-experience features are currently disabled:
#exp_df = load_work_experiences(config.exp_path)

# Left joins on user_id keep every user row of df.
for feature_table in (skills_df, lang_df, edu_df):  # add exp_df here to re-enable it
    df = df.merge(feature_table, how = 'left', on = ['user_id'])

print(df.shape)
df.head()

Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 318624


100%|███████████████████████████████████████████| 30/30 [00:07<00:00,  4.12it/s]
100%|███████████████████████████████████████████| 12/12 [00:01<00:00,  7.47it/s]
100%|███████████████████████████████████████████| 12/12 [00:02<00:00,  4.18it/s]
100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  4.19it/s]


(66274, 74)


Unnamed: 0,user_id,industry,location,moved_after_2019,based_on_tr,skill_Java,skill_JavaScript,skill_SQL,skill_C#,skill_Software Development,...,degree_Bachelor's degree,degree_Master's degree,degree_Bachelor of Science,degree_High School,degree_Master of Science,degree_Associate's degree,degree_Bachelor of Engineering,degree_Doctor of Philosophy,degree_Master of Business Administration,degree_Mühendislik Fakültesi Mezunu
0,1301,Information Technology and Services,ISTANBUL,1.0,1,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,6950,Internet,ISTANBUL,0.0,1,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4880,Online Media,TURKEY,0.0,1,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,26046,Telecommunications,ISTANBUL,0.0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11005,Banking,ISTANBUL,0.0,1,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Column roles used by the modelling cells.
target = 'moved_after_2019'
cat_features = ['industry', 'location']
drop_features = ['user_id']

# Stringify first, then cast to pandas' category dtype.
for cat_col in cat_features:
    as_text = df[cat_col].astype(str)
    df[cat_col] = as_text.astype("category")

# Rows with a target value form the training set; rows without one form the test set.
has_target = df[target].notnull()
train_set = df.loc[has_target].drop(columns=drop_features)
test_set = df.loc[~has_target].drop(columns=drop_features)

for label, frame in (('train_set', train_set), ('test_set', test_set)):
    print(f'{label}: {frame.shape}')

train_set: (53019, 73)
test_set: (13255, 73)


In [8]:
#X = train_set.drop(columns=[target], axis=1)
#y = train_set[target]
#
#def objective(trial):
#
#    params = {
#        "one_hot_max_size": trial.suggest_int("one_hot_max_size", 120, 200),
#        #"colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
#        "depth": trial.suggest_int("depth", 6, 12),
#        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
#        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
#    }
#
#    if params["bootstrap_type"] == "Bayesian":
#        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
#    elif params["bootstrap_type"] == "Bernoulli":
#        params["subsample"] = trial.suggest_float("subsample", 0.1, 1)
#
#    params['eval_metric'] = 'Accuracy'
#    params['cat_features'] = cat_features
#    params['random_state'] = config.seed
#    params['allow_writing_files'] = False
#    params["iterations"] = 5000
#
#    model = CatBoostClassifier(**params)
#    kf = KFold(n_splits=config.n_folds)
#
#    scores = list()
#    for idx, (train_ind, val_ind) in enumerate(kf.split(X, y)):
#        model = CatBoostClassifier(**params)
#        X_train = X.iloc[train_ind]
#        y_train = y.iloc[train_ind]
#        X_val = X.iloc[val_ind]
#        y_val = y.iloc[val_ind]
#
#        model.fit(
#            X_train,
#            y_train,
#            eval_set=[(X_val, y_val)],
#            early_stopping_rounds=400,
#            verbose=False,
#        )
#
#        val_pred = model.predict(X_val)
#        scores.append(accuracy_score(y_val, val_pred))
#
#    return np.mean(scores)
#
#
#study = optuna.create_study(direction="maximize")
#study.optimize(objective, n_trials=20)

In [15]:
# CatBoost Parameters (inactive alternative, kept for reference)
#params = {'subsample': 0.8,
#          'bootstrap_type':'Bernoulli',
#          'depth': 9,
#          'one_hot_max_size': 150}
#
#params['verbose'] = False
#params['random_state'] = config.seed
#params['cat_features'] = cat_features
#params['eval_metric'] = 'Accuracy'
#params['allow_writing_files'] = False
#params["iterations"] = 5000

# XGB Parameters (active); keys are listed in the order they were previously assigned.
params = dict(
    max_depth=9,
    min_child_weight=12,
    subsample=0.8,
    #eval_metric='Accuracy',
    random_state=config.seed,
    tree_method='hist',
    n_estimators=5000,
    enable_categorical=True,
)

# Unshuffled K-fold splitter shared by the training loop.
kf = KFold(n_splits=config.n_folds)



In [16]:
# Separate the target from the feature columns of the labelled rows.
y = train_set[target]
X = train_set.drop(columns=[target])
features = X.columns

# Test features plus zero-filled buffers for out-of-fold and test predictions.
X_test = test_set.drop(columns=[target])
y_oof = np.zeros(len(X))
y_pred = np.zeros(len(X_test))

In [None]:
# Cross-validated training: one model per fold, out-of-fold labels collected
# in y_oof and test-set labels averaged over the folds in y_pred.

# Start from zeroed buffers so that re-running this cell on its own does not
# add onto the predictions accumulated by a previous run.
y_oof = np.zeros(X.shape[0])
y_pred = np.zeros(X_test.shape[0])

for idx, (train_ind, val_ind) in enumerate(kf.split(X, y)):
    print(f"| Fold {idx+1} |".center(80, "-"))
    # Early stopping is configured on the estimator: XGBoost deprecated the
    # fit(early_stopping_rounds=...) keyword in 1.6 and removed it later.
    #model = CatBoostClassifier(**params, early_stopping_rounds=400)
    model = XGBClassifier(**params, early_stopping_rounds=400)
    X_train = X.iloc[train_ind]
    y_train = y.iloc[train_ind]
    X_val = X.iloc[val_ind]
    y_val = y.iloc[val_ind]

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],  # validation fold drives early stopping
        verbose=200,
    )

    plot_importances(model, features)

    # Validation folds are disjoint, so each position is written exactly once.
    val_pred = model.predict(X_val)
    y_oof[val_ind] = val_pred

    # NOTE: predict() returns class labels, so y_pred ends up as the share of
    # folds voting 1 for each test row rather than an averaged probability.
    test_pred = model.predict(X_test)
    y_pred += test_pred / config.n_folds

    print(f'fold accuracy: {accuracy_score(y_val, val_pred)}')
    # Release the fold slices before the next iteration.
    del X_train, y_train, X_val, y_val
    gc.collect()

val_score = accuracy_score(y, y_oof)
print(f'accuracy: {val_score}')

-----------------------------------| Fold 1 |-----------------------------------
[0]	validation_0-logloss:0.66567
[200]	validation_0-logloss:0.56470
[400]	validation_0-logloss:0.57328
[600]	validation_0-logloss:0.59317
[635]	validation_0-logloss:0.59694


In [139]:
# Label a test row 1 when its fold-averaged prediction is at least 0.5, else 0.
# NOTE(review): assumes the rows of `sub` are in the same order as the test rows - confirm.
sub[target] = (y_pred >= 0.5).astype(int).tolist()

sub.head()

Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,0
2,2110,0
3,55082,0
4,37165,0


In [140]:
#sub.to_csv(f'../submissions/submission_{round(val_score, 6)}.csv', index = False)