In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler as MMS
from catboost import CatBoostClassifier as CBC
from sklearn.metrics import roc_auc_score

In [None]:
# Input locations for the two CSV splits and the separately stored test labels.
train_csv = '../input/hr-analytics-job-change-of-data-scientists/aug_train.csv'
test_csv = '../input/hr-analytics-job-change-of-data-scientists/aug_test.csv'
test_target_npy = '../input/job-change-dataset-answer/jobchange_test_target_values.npy'

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)
test_target = np.load(test_target_npy)

# Submission skeleton: ids are copied from the test split before the id column
# is dropped from `test`; the 'target' column is left unfilled in this cell.
submission = pd.DataFrame(columns=['enrollee_id','target'])
submission['enrollee_id'] = test['enrollee_id'].to_numpy()

In [None]:
# The identifier carries no signal for the model. errors='ignore' keeps this
# cell re-runnable after the column has already been removed.
train = train.drop(columns=['enrollee_id'], errors='ignore')
test = test.drop(columns=['enrollee_id'], errors='ignore')
columns = train.columns

# Split columns into categorical vs. numeric by dtype. Inspecting a single cell
# (e.g. the row labelled 1) misclassifies a text column whenever that cell is
# NaN, and fails outright if that index label is absent.
# Column order is preserved, so `num_columns` keeps the frame's ordering.
cat_columns, num_columns = [], []
for col in columns:
    if pd.api.types.is_numeric_dtype(train[col]):
        num_columns.append(col)
    else:
        cat_columns.append(col)

# Missing categorical values get an explicit placeholder token.
missing_fill = {col: '#other' for col in cat_columns}
train = train.fillna(missing_fill)
test = test.fillna(missing_fill)

# Normalise open-ended range tokens to plain digit strings (exact whole-value
# matches only, applied across every column of the frame).
range_tokens = ['>20','<1','>4','never']
range_values = ['21','0','5','0']
train = train.replace(to_replace=range_tokens, value=range_values)
test = test.replace(to_replace=range_tokens, value=range_values)

In [None]:

# Per-column stand-in for the '#other' missing-value placeholder. Keeping the
# mapping in one place guarantees train and test are imputed identically.
placeholder_defaults = {
    'gender': 'Other',
    'enrolled_university': 'no_enrollment',
    'education_level': 'Primary School',
    'major_discipline': 'Other',
    'experience': '0',
    'company_size': '<10',
    'company_type': 'Other',
    'last_new_job': '0',
}

for frame in (train, test):
    for col, default in placeholder_defaults.items():
        # Exact whole-value match; other values in the column are untouched.
        frame[col] = frame[col].replace('#other', default)

In [None]:
# Display the first rows of the training frame to eyeball the cleaned values.
train.head()

In [None]:
# Display the first rows of the test frame for comparison with the training frame.
test.head()

In [None]:
# Print column dtypes and non-null counts to confirm no missing values remain.
train.info()

In [None]:
# Cast the hours column to float in both splits so the scaled (fractional)
# values written back later fit the column's dtype.
for frame in (train, test):
    frame['training_hours'] = frame['training_hours'].astype(float)

In [None]:
# Min-max scale the numeric feature columns. The label is excluded by name
# rather than by list position, so it is never scaled even if it stops being
# the last numeric column.
scaled_columns = [col for col in num_columns if col != 'target']

scaler = MMS()
# Fit on the training split only; the same transform is then applied to test.
scaler.fit(train[scaled_columns])
train[scaled_columns] = scaler.transform(train[scaled_columns])
test[scaled_columns] = scaler.transform(test[scaled_columns])

In [None]:
# Re-display the training frame to check the numeric columns after scaling.
train.head()

In [None]:
# Re-display the test frame to check the numeric columns after scaling.
test.head()

In [None]:
# Features are every column except the label; the label is selected by name so
# the split does not depend on 'target' being the last column of the frame.
x = train.drop(columns=['target'])
y = train['target']

In [None]:
# Hyper-parameters for a small CatBoost classifier (100 shallow trees, log-loss
# objective, Logloss and AUC reported during training).
catboost_params = dict(
    iterations=100,
    depth=2,
    learning_rate=1,
    loss_function='Logloss',
    verbose=True,
    custom_metric=['Logloss', 'AUC'],
)
model = CBC(**catboost_params)

# Categorical columns are passed by name so CatBoost encodes them itself.
# NOTE(review): fit() presumably returns the fitted model itself, so `history`
# would just alias `model` rather than hold a training log — confirm.
history = model.fit(x, y, cat_features=cat_columns)

In [None]:
# Score both splits with the second predict_proba column (the class at index 1)
# and report ROC AUC as a percentage.
preds_train = model.predict_proba(x)[:,1]
preds_test = model.predict_proba(test)[:,1]
print(f'ROC AUC score on train set- {roc_auc_score(y, preds_train)*100:.2f}%')
# This line evaluates the held-out test labels, so it is labelled "test set"
# (it previously repeated the "train set" label).
print(f'ROC AUC score on test set- {roc_auc_score(test_target, preds_test)*100:.2f}%')