In [8]:
import os
import gc
import optuna
import warnings
import numpy as np
from utils import *
import pandas as pd
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
warnings.filterwarnings('ignore')
%run skills.ipynb
%run languages.ipynb
%run work_experiences.ipynb
%run education.ipynb

class config:
    """Notebook-wide settings: dataset CSV locations, random seed and CV fold count."""

    # All CSVs live in the same directory (path is relative to this notebook).
    # The helper name is removed at the end of the class body so that only the
    # settings below remain as attributes.
    _root = '../../../datasets/garanti-bbva-data-camp'

    # User tables and the submission template.
    train_path = _root + '/train_users.csv'
    test_path = _root + '/test_users.csv'
    sub_path = _root + '/submission.csv'

    # Per-user side tables that are joined on `user_id` further down.
    skills_path = _root + '/skills.csv'
    languages_path = _root + '/languages.csv'
    education_path = _root + '/education.csv'
    exp_path = _root + '/work_experiences.csv'

    # Seed used for the model and the K-fold shuffle; number of CV folds.
    seed = 42
    n_folds = 6

    del _root

In [9]:
# Load the training users, the test users and the submission template.
train_df = pd.read_csv(config.train_path)
test_df = pd.read_csv(config.test_path)
sub = pd.read_csv(config.sub_path)

print(f'train_df shape: {train_df.shape}')
print(f'test_df shape: {test_df.shape}')

# Stack train and test so that feature engineering runs once over both.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0;
# `pd.concat` is the supported equivalent and produces the same frame.
# Columns present in only one of the inputs are filled with NaN for the rows
# of the other (test_df has one column fewer than train_df, see the shapes).
df = pd.concat([train_df, test_df]).reset_index(drop = True)
df.head()

train_df shape: (53019, 4)
test_df shape: (13255, 3)


Unnamed: 0,user_id,industry,location,moved_after_2019
0,1301,Information Technology and Services,"Istanbul, Istanbul, Turkey",1.0
1,6950,Internet,"Istanbul, Istanbul, Turkey",0.0
2,4880,Online Media,Turkey,0.0
3,26046,Telecommunications,"Istanbul, Istanbul, Turkey",0.0
4,11005,Banking,"Istanbul, Turkey",0.0


In [10]:
def fix_location(dataframe: pd.DataFrame, feature: str = 'location') -> pd.DataFrame:
    """Normalise a free-text location column and flag rows located in Turkey.

    Steps, in order:
      1. Rewrite a few alternative province spellings to a canonical
         ``'<Province>, Turkey'`` string.
      2. Replace ``'Türkiye'`` with ``'Turkey'``, upper-case, strip surrounding
         whitespace and pass the value through ``translation``.
      3. Collapse any value that contains a name from ``load_tr_cities()`` to
         that name alone.
      4. Add ``f'{feature}_based_on_tr'``: 1 if the final value is one of the
         city names or exactly ``'TURKEY'``, otherwise 0.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Input frame; it is copied, never modified in place.
    feature : str, default 'location'
        Name of the column holding the location text.

    Returns
    -------
    pd.DataFrame
        A copy of ``dataframe`` with ``feature`` normalised and the indicator
        column added. Missing values are stringified (``str(x)``) like any
        other value.

    Notes
    -----
    ``translation`` and ``load_tr_cities`` are defined outside this cell.
    NOTE(review): presumably the city names are upper-case ASCII strings that
    match the output of ``translation`` -- confirm against the helpers.
    """
    df_ = dataframe.copy()

    # The rules run one after another on the current column state, so if a
    # value matches several needles the last matching rule wins.
    spelling_fixes = (
        ('Kahraman Maras', 'Kahramanmaras, Turkey'),
        ('Şanliurfa', 'Sanliurfa, Turkey'),
        ('İçel', 'Mersin, Turkey'),
        ('Afyon', 'Afyonkarahisar, Turkey'),
    )
    for needle, canonical in spelling_fixes:
        # Needles are plain text, so a literal (non-regex) search is enough.
        hits = df_[feature].astype(str).str.contains(needle, regex=False)
        df_.loc[hits, feature] = canonical

    def _clean(value):
        # One pass instead of three separate `.apply` calls; same order of operations.
        text = str(value).replace('Türkiye', 'Turkey').upper().strip()
        return translation(text)

    df_[feature] = df_[feature].apply(_clean)

    tr_cities = load_tr_cities()

    def _collapse_to_city(place):
        # Check every city in order and keep replacing, so the last match wins.
        # This is the same result as looping over cities and re-applying to
        # the whole column, but it walks the column once.
        # NOTE(review): this is a substring test, so a short city name that
        # happens to appear inside an unrelated place name would match too.
        for city in tr_cities:
            if city in place:
                place = city
        return place

    df_[feature] = df_[feature].apply(_collapse_to_city)

    # Computed once, after all replacements. Previously this assignment sat
    # inside the city loop: it was recomputed for every city and the column
    # was never created when the city list was empty.
    df_[f'{feature}_based_on_tr'] = df_[feature].apply(
        lambda place: 1 if place in tr_cities or place == 'TURKEY' else 0
    )

    return df_

# Clean the location text before any feature tables are joined.
df = fix_location(df)

# Load the per-user feature tables. The loaders come from the notebooks
# pulled in with %run; the numeric arguments are passed through unchanged.
skills_df = load_skills(config.skills_path, 120)
lang_df = load_languages(config.languages_path, 20)
edu_df = load_education(config.education_path, 80, 80)
exp_df = load_work_experiences(config.exp_path)

# Left-join each table on `user_id` so every user row is kept even when a
# table has no entry for that user.
for user_features in (skills_df, lang_df, edu_df, exp_df):
    df = df.merge(user_features, how = 'left', on = ['user_id'])

# Number of distinct companies seen within each industry, broadcast to rows.
company_by_industry = df.groupby(by = 'industry')['company_id']
df['nunique_company_by_industries'] = company_by_industry.transform('nunique')

# Disabled feature experiments, kept for reference:
#df['employee_total_last_exp_diff'] = df['employee_total_experience'] - df['employee_last_experience']
#df['nunique_employees_by_industries'] = df.groupby(by = 'industry')['user_id'].transform('nunique')

print(df.shape)
df.head()

Frequency of top 20 skills before preprocess: 294433
Frequency of top 20 skills after preprocess: 325856
(66274, 319)


Unnamed: 0,user_id,industry,location,moved_after_2019,location_based_on_tr,skill_agile,skill_agile methodologies,skill_ajax,skill_analizi,skill_analysis,...,employee_last_experience_year,nunique_company,employee_avg_days_to_quit,employee_last_quit,company_id,company_avg_days_to_quit,company_std_days_to_quit,company_max_days_to_quit,company_med_days_to_quit,nunique_company_by_industries
0,1301,Information Technology and Services,ISTANBUL,1.0,1,1.0,1.0,0.0,0.0,0.0,...,2018.0,3.0,447.0,882.0,26.0,912.280537,808.719538,5206.0,701.0,3260.0
1,6950,Internet,ISTANBUL,0.0,1,0.0,0.0,0.0,0.0,0.0,...,2017.0,4.0,525.25,184.0,1337.0,393.769231,376.905194,1155.0,184.0,743.0
2,4880,Online Media,TURKEY,0.0,1,0.0,0.0,1.0,0.0,0.0,...,2017.0,4.0,456.5,273.0,4366.0,,,,,32.0
3,26046,Telecommunications,ISTANBUL,0.0,1,0.0,0.0,0.0,0.0,1.0,...,2014.0,1.0,3409.0,,26.0,912.280537,808.719538,5206.0,701.0,660.0
4,11005,Banking,ISTANBUL,0.0,1,0.0,0.0,0.0,0.0,0.0,...,2017.0,3.0,1136.333333,2435.0,1562.0,837.166667,767.031502,3957.0,609.0,424.0


In [11]:
#from kmodes.kmodes import KModes
#
#cluster = KModes(n_clusters=6, n_jobs=-1)
#
##df['kmodes_skill_label'] = cluster.fit_predict(df[[col for col in df.columns if col.startswith('skill')]].fillna(0))
##df['kmodes_degree_label'] = cluster.fit_predict(df[[col for col in df.columns if col.startswith('degree')]].fillna(0))
##df['kmodes_study_label'] = cluster.fit_predict(df[[col for col in df.columns if col.startswith('study')]].fillna(0))
##df['kmodes_language_label'] = cluster.fit_predict(df[[col for col in df.columns if col.startswith('language')]].fillna(0))
#
#df['kmodes_label'] = cluster.fit_predict(df[[col for col in df.columns if 
#                        col.startswith('skill') or
#                        col.startswith('degree') or
#                        col.startswith('study') or
#                        col.startswith('language')]].fillna(0))

#df[[col for col in df.columns if col.startswith('skill')]] = df[[col for col in df.columns if col.startswith('skill')]].fillna(0)
#df[[col for col in df.columns if col.startswith('degree')]] = df[[col for col in df.columns if col.startswith('degree')]].fillna(0)
#df[[col for col in df.columns if col.startswith('study')]] = df[[col for col in df.columns if col.startswith('study')]].fillna(0)
#df[[col for col in df.columns if col.startswith('language')]] = df[[col for col in df.columns if col.startswith('language')]].fillna(0)


In [12]:
target = 'moved_after_2019'
cat_features = [
    'industry',
    'location',
    'company_id',
    'employee_last_experience_year',
    'employee_last_experience_month',
]
drop_features = ['user_id']

# Cast through `str` first: every value, including missing ones, becomes a
# string label before the column is turned into a pandas categorical.
for cat_col in cat_features:
    df[cat_col] = df[cat_col].astype(str).astype("category")

# Rows with a target value form the training set; rows without one are the
# test set.
has_label = df[target].notnull()
train_set = df.loc[has_label].drop(columns=drop_features)
test_set = df.loc[~has_label].drop(columns=drop_features)

print(f'train_set: {train_set.shape}')
print(f'test_set: {test_set.shape}')

train_set: (53019, 318)
test_set: (13255, 318)


## XGBoost

In [13]:
# Keyword arguments for XGBClassifier, written as one literal.
# Options that are currently disabled, kept for reference:
#   'eta': 0.02, 'reg_alpha': 2, 'reg_lambda': 24, 'max_cat_to_onehot': 160,
#   'eval_metric': 'accuracy' / "auc", 'objective': "binary:logistic"
xgb_params = dict(
    max_depth=14,
    subsample=0.8412208136691927,
    learning_rate=0.017307888942046504,
    enable_categorical=True,   # the frame carries pandas `category` columns
    random_state=config.seed,
    tree_method='hist',
    n_estimators=5000,
    booster='dart',
)

#xgb_params = {'max_depth': 12,
#              'learning_rate': 0.02,
#              'n_estimators': 2960,
#              #'min_child_weight': 2,
#              'subsample': 0.9,
#              'tree_method': 'hist',
#              'enable_categorical': True,
#              'random_state': 42}

In [14]:
# Shuffled K-fold split, seeded so the folds are the same on every run.
# (Unshuffled alternative: KFold(n_splits=config.n_folds))
kf = KFold(n_splits=config.n_folds, shuffle=True, random_state=config.seed)

X = train_set.drop(columns=[target])
y = train_set[target]
features = X.columns

X_test = test_set.drop(columns=[target])
y_oof = np.zeros(X.shape[0])        # out-of-fold predictions for the training rows
y_pred = np.zeros(X_test.shape[0])  # test predictions averaged over the folds

# Early stopping is configured on the estimator rather than in `fit()`:
# passing `early_stopping_rounds` to `fit()` has been deprecated since
# XGBoost 1.6 and is no longer accepted by later releases. Any value already
# present in `xgb_params` takes precedence over this default.
fold_params = {'early_stopping_rounds': 400, **xgb_params}

scores = list()
for idx, (train_ind, val_ind) in enumerate(kf.split(X, y)):
    print(f"| Fold {idx+1} |".center(80, "-"))
    model = XGBClassifier(**fold_params)
    X_train = X.iloc[train_ind]
    y_train = y.iloc[train_ind]
    X_val = X.iloc[val_ind]
    y_val = y.iloc[val_ind]

    # The validation fold is the evaluation set that drives early stopping.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        verbose = 200
    )

    plot_importances(model, features)

    val_pred = model.predict(X_val)
    y_oof[val_ind] += val_pred

    # `predict` returns class labels, so `y_pred` accumulates the share of
    # folds that voted for the positive class (not an averaged probability).
    test_pred = model.predict(X_test)
    y_pred += test_pred / config.n_folds

    # Score the fold once and reuse the value for both the log and the list.
    fold_score = accuracy_score(y_val, val_pred)
    print(f'fold accuracy: {fold_score}')
    scores.append(fold_score)

    # Release the fold slices before the next iteration.
    del X_train, y_train, X_val, y_val
    gc.collect()

val_score = accuracy_score(y, y_oof)
print(f'accuracy: {val_score}')
print(f'folds avg accuracy: {np.mean(scores)}')

-----------------------------------| Fold 1 |-----------------------------------
[0]	validation_0-logloss:0.68952


KeyboardInterrupt: 

fold accuracy: 0.7873472159348122 <br>
accuracy: 0.7856994662290877 <br>
folds avg accuracy: 0.785699399374678 <br>

fold accuracy: 0.7887052965142598 <br>
accuracy: 0.7843226013315981 <br>
folds avg accuracy: 0.7843225760947662 <br>

In [None]:
#sub[target] = np.where(y_pred >= 0.5, 1, 0).tolist()

#sub.head()

In [None]:
#sub.to_csv(f'../submissions/submission_xgb_countvect_{round(val_score, 6)}.csv', index = False)

In [None]:
#from sklearn.ensemble import VotingClassifier
#
#cat_params = {'one_hot_max_size': 156,
#              'depth': 12,
#              'boosting_type': 'Plain',
#              'bootstrap_type': 'MVS'}
#
#cat_params["iterations"] = 2500
##cat_params["early_stopping_rounds"] = 400
#cat_params['random_state'] = config.seed
#cat_params['cat_features'] = cat_features
#cat_params['eval_metric'] = 'Accuracy'
#cat_params['allow_writing_files'] = False
#cat_params['verbose'] = False
#
#xgb_params = {'max_depth': 12,
#              'subsample': 0.8,
#              'learning_rate': 0.01}
#
##xgb_params["early_stopping_rounds"] = 400
#xgb_params['enable_categorical'] = True
#xgb_params['random_state'] = config.seed
#xgb_params['tree_method'] = 'hist'
#xgb_params['n_estimators'] = 2500
#
#cat = CatBoostClassifier(**cat_params)
#xgb = XGBClassifier(**xgb_params)


In [None]:
#kf = KFold(n_splits=config.n_folds)
#
#X = train_set.drop(columns=[target], axis=1)
#y = train_set[target]
#features = X.columns
#
#X_test = test_set.drop(columns=[target], axis=1)
#y_oof = np.zeros(X.shape[0])
#y_pred = np.zeros(X_test.shape[0])
#
#scores = list()
#for idx, (train_ind, val_ind) in enumerate(kf.split(X, y)):
#    print(f"| Fold {idx+1} |".center(80, "-"))
#    model = VotingClassifier(estimators=[('xgb', xgb), ('cat', cat)], voting='soft', weights=[0.54, 0.46])
#    X_train = X.iloc[train_ind]
#    y_train = y.iloc[train_ind]
#    X_val = X.iloc[val_ind]
#    y_val = y.iloc[val_ind]
#
#    model.fit(X_train, y_train,)
#
#    val_pred = model.predict(X_val)
#    y_oof[val_ind] += val_pred
#
#    test_pred = model.predict(X_test)
#    y_pred += test_pred / config.n_folds
#
#    print(f'fold accuracy: {accuracy_score(y_val, val_pred)}')
#    scores.append(accuracy_score(y_val, val_pred))
#    del X_train, y_train, X_val, y_val
#    gc.collect()
#
#val_score = accuracy_score(y, y_oof)
#print(f'accuracy: {val_score}')
#print(f'folds avg accuracy: {np.mean(scores)}')

In [None]:
#sub1 = pd.read_csv('../submissions/submission_xgb_0.782946.csv')
#sub2 = pd.read_csv('../submissions/submission_xgb_0.785756.csv')
#sub3 = pd.read_csv('../submissions/submission_xgb_0.784323.csv')
#sub4 = pd.read_csv('../submissions/submission_xgb_0.782493.csv')

#sub1

In [None]:
#sub['moved_after_2019'] = np.where((sub1['moved_after_2019'] + sub2['moved_after_2019']) / 4 >= 0.5, 1, 0).tolist()
#
#sub

In [None]:
#sub.to_csv(f'../submissions/ensemble_3.csv', index = False)