In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import f1_score, log_loss, roc_auc_score, average_precision_score

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

%matplotlib inline
plt.style.use('seaborn')


In [None]:
# Load the training split, remove exact duplicate rows and preview the result.
train = pd.read_csv('/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
train = train.drop_duplicates()
# Treat the 'Other' gender label as missing. The result is assigned back to the
# column: calling replace(inplace=True) on train['gender'] writes to a temporary
# selection, which pandas warns about and which leaves `train` untouched once
# Copy-on-Write is enabled.
train['gender'] = train['gender'].replace('Other', np.nan)
train.head()

In [None]:
# Summarise column dtypes and non-null counts to see where values are missing.
train.info()

In [None]:
# 2x2 grid of bar charts: one categorical feature per panel, split by target.
fig, ax = plt.subplots(2, 2, figsize=(20, 20))

targetGroup = train.groupby('target')
rowTotal = len(train)

# (feature, divide the counts by the number of rows?, panel to draw on)
panelSpecs = [
    ('gender', True, ax[0, 0]),
    ('relevent_experience', False, ax[0, 1]),
    ('enrolled_university', False, ax[1, 0]),
    ('education_level', True, ax[1, 1]),
]

for feature, asShare, panel in panelSpecs:
    counts = targetGroup[feature].value_counts()
    if asShare:
        counts = counts / rowTotal
    counts.unstack().plot(kind='bar', ax=panel)

In [None]:
# One pie per target value showing how the rows are spread across cities.
fig, ax = plt.subplots(1, 2, figsize=(16, 12))

# Per-target city counts, computed once and sliced by target value below.
cityCounts = targetGroup['city'].value_counts()
for targetValue in (0, 1):
    slices = cityCounts[targetValue]
    ax[targetValue].pie(slices, labels=slices.index)

ax[0].set_title('Percentage Candidate Not Looking for a job change based on City')
ax[1].set_title('Percentage Candidate Looking for a job change based on City ')

plt.show()

In [None]:
# Strip the '<' / '>' markers from the experience labels, then parse both
# columns as numbers. errors='coerce' turns anything that still is not a plain
# number into NaN.
# NOTE(review): non-numeric last_new_job labels therefore become missing values
# and are mean-imputed later on - confirm that this is intended.
experienceText = train['experience'].replace(to_replace='[<>]', value='', regex=True)
train['experience'] = pd.to_numeric(experienceText, errors='coerce')
train['last_new_job'] = pd.to_numeric(train['last_new_job'], errors='coerce')


In [None]:
# Distribution of each continuous feature, one panel per feature.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept
# here so the figures stay unchanged.
fig, ax = plt.subplots(1, 3, figsize=(18, 8))
for panel, feature in zip(ax, ('training_hours', 'experience', 'city_development_index')):
    sns.distplot(train[feature], ax=panel)

plt.show()

In [None]:
# Mean years of experience per category, split by target, for two categoricals.
fig, ax = plt.subplots(1, 2, figsize=(20, 8))

for panel, category in zip(ax, ('relevent_experience', 'major_discipline')):
    sns.barplot(data=train, x=category, y='experience', hue='target', ax=panel)

In [None]:
# List the distinct raw company_size labels before they are bucketed below.
train.company_size.unique()

In [None]:
#https://www.sangoma.com/articles/smb-sme-large-enterprise-size-business-matters/

# Collapse the raw head-count ranges into three business-size buckets.
companySizeBuckets = {
    '50-99': 'SMB', '<10': 'SMB', '10/49': 'SMB',
    '100-500': 'SME', '500-999': 'SME',
    '10000+': 'Large enterprise', '5000-9999': 'Large enterprise', '1000-4999': 'Large enterprise',
}
# Assign the result back instead of calling replace(inplace=True) on the column
# selection: that form writes to a temporary object, triggers a pandas warning
# and does not modify `train` once Copy-on-Write is enabled.
train['company_size'] = train['company_size'].replace(companySizeBuckets)

In [None]:
# Bar chart of how many rows fall into each target value, per company_size group.
train.groupby('company_size')['target'].value_counts().unstack().plot(kind='bar');

In [None]:
# Top: row counts per years of experience, one bar series per relevance group.
# Bottom: target value counts for every (relevance, years of experience) pair.
fig, ax = plt.subplots(2, 1, figsize=(14, 18))
byRelevanceAndYears = train.groupby(['relevent_experience', 'experience'])

rowsPerGroup = byRelevanceAndYears['training_hours'].count()
rowsPerGroup.unstack(0).plot(kind='bar', ax=ax[0])

targetPerGroup = byRelevanceAndYears['target'].value_counts()
targetPerGroup.unstack(1).plot(kind='bar', ax=ax[1])


In [None]:
# Spread of years of experience per company type, coloured by company size.
plt.figure(figsize=(15, 15))

sns.boxplot(data=train, x='company_type', y='experience', hue='company_size');

In [None]:
# Features are every column except the row identifier and the label.
X = train.drop(columns=['enrollee_id', 'target']).copy()
y = train['target'].to_numpy()

# Hold out 20% of the rows, keeping the class balance the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

In [None]:
# Continuous features: fill gaps with the column mean, then standardise.
numericalPipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaling', StandardScaler())
])

# Ordinal features: fill gaps with the column mean and keep the original scale.
ordinalPipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean'))
])

# Categorical features: fill gaps with the most frequent label, then one-hot
# encode. handle_unknown='ignore' encodes a label that was not present at fit
# time as an all-zero row instead of raising, so data transformed later (for
# example the competition test file) cannot crash the pipeline.
categoricalPipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# Route each group of columns through its matching pipeline.
numericColumns = ['city_development_index', 'training_hours', 'experience']
ordinalColumns = ['last_new_job']
categoricalColumns = [
    'city', 'gender', 'relevent_experience',
    'enrolled_university', 'education_level', 'major_discipline',
    'company_size', 'company_type',
]

preprocessor = ColumnTransformer(transformers=[
    ('numeric', numericalPipeline, numericColumns),
    ('ordinal', ordinalPipeline, ordinalColumns),
    ('categorical', categoricalPipeline, categoricalColumns),
])

In [None]:
# Fit the preprocessing on the full feature frame, then encode the full data
# and both splits with it.
# NOTE(review): fitting on X (which contains the X_test rows) lets hold-out
# statistics leak into the transforms used for evaluation - confirm this is
# acceptable for the reported test metrics.
preprocessor.fit(X)
X_prep, X_train_prep, X_test_prep = (
    preprocessor.transform(part) for part in (X, X_train, X_test)
)

# The transformer output is converted to dense arrays for the PCA step.
X_array, X_train_array, X_test_array = (
    encoded.toarray() for encoded in (X_prep, X_train_prep, X_test_prep)
)


In [None]:
# Keep as many principal components as needed to explain 95% of the variance.
pca = PCA(n_components=0.95)
X_red = pca.fit_transform(X_array)

# Project both splits onto the components learned above.
X_train_red, X_test_red = (
    pca.transform(dense) for dense in (X_train_array, X_test_array)
)

In [None]:
def FunctionLoss(model,
                X=X_train_red,
                y=y_train,
                X_test=X_test_red,
                y_test=y_test):
    """Fit ``model`` and print train/test log loss, F1 score and ROC AUC.

    Parameters
    ----------
    model : estimator
        Unfitted classifier exposing ``fit`` and ``predict`` plus either
        ``predict_proba`` or ``decision_function``. It is fitted in place.
    X, y : array-like
        Training features and labels. The defaults are the notebook's reduced
        training split, captured when this cell is executed.
    X_test, y_test : array-like
        Hold-out features and labels, with defaults captured the same way.

    Returns
    -------
    None
        The metrics are printed, not returned.

    Notes
    -----
    Log loss is computed from predicted probabilities. Feeding it hard 0/1
    predictions only measures a scaled error rate. For models without
    ``predict_proba`` the log loss is reported as ``nan`` and the AUC is
    computed from ``decision_function`` scores.
    """
    model.fit(X, y)
    trainPred = model.predict(X)
    testPred = model.predict(X_test)

    trainF1 = f1_score(y, trainPred)
    testF1 = f1_score(y_test, testPred)

    try:
        trainProba = model.predict_proba(X)[:, 1]
        testProba = model.predict_proba(X_test)[:, 1]
    except AttributeError:
        # Margin-only models (no probability estimates): rank by decision
        # score for the AUC and leave the log loss undefined.
        trainLoss = testLoss = float('nan')
        trainScore = roc_auc_score(y, model.decision_function(X))
        testScore = roc_auc_score(y_test, model.decision_function(X_test))
    else:
        trainLoss = log_loss(y, trainProba)
        testLoss = log_loss(y_test, testProba)
        trainScore = roc_auc_score(y, trainProba)
        testScore = roc_auc_score(y_test, testProba)

    print(f'Train Loss: {trainLoss} -- Test Loss: {testLoss}\t')
    print(f'Train F1 Score: {trainF1} -- Test F1 Score: {testF1}\t')
    print(f'Train AUC: {trainScore} -- Test AUC: {testScore}\t')
    
    
def FunctionCV(model, X=X_red, y=y):
    """Print the per-fold and mean ROC AUC of ``model`` under 5-fold stratified CV.

    ``X`` and ``y`` default to the notebook's full reduced feature matrix and
    labels, captured when this cell is executed. Nothing is returned.
    """
    folds = StratifiedKFold(n_splits=5)
    roc_auc = cross_val_score(model, X, y, scoring='roc_auc', cv=folds)
    print(f'AUC Scoring: {roc_auc}\t')
    print(f'AUC Mean:{roc_auc.mean()}')

In [None]:
# Seed for the estimators that shuffle or sample internally, so the comparison
# below is reproducible across kernel restarts (same value as the split seed).
MODEL_SEED = 0

# Candidate classifiers, all with default hyper-parameters.
models = {'Logistic Regression':LogisticRegression(),
        'SGD':SGDClassifier(random_state=MODEL_SEED),
        'SVC':SVC(),
        'KNN': KNeighborsClassifier(),
        'Decision Tree':DecisionTreeClassifier(random_state=MODEL_SEED),
        'Random Forest':RandomForestClassifier(random_state=MODEL_SEED),
        'Xgboost':XGBClassifier(),
        'LightGBM':LGBMClassifier()}

# Fit every candidate on the training split and print its train/test metrics.
for name, model in models.items():
    print(f'Model Name : {name}')
    FunctionLoss(model=model)
    print('\t')

In [None]:
# Cross-validated ROC AUC for every candidate model.
for label in models:
    print(f'Model Name : {label}')
    FunctionCV(model=models[label])
    print('\t')

In [None]:
# Hyper-parameter grid for the LightGBM classifier.
param_grid = [
    {'learning_rate':[0.1, 1e-2, 1e-3],
     # LightGBM treats every max_depth <= 0 as "no limit", so searching both
     # 0 and -1 trained each candidate twice for identical scores. A single
     # unlimited-depth setting halves the search cost.
     'max_depth':[-1],
     'n_estimators':[1, 10, 100],
    'boosting_type':('gbdt', 'dart'),
    'reg_alpha':[0, 0.1, 0.5],
    'reg_lambda':[0, 0.1, 0.5],
    'class_weight':('balanced', None)}
]

# Exhaustive search scored by ROC AUC with 5-fold stratified CV, run on the
# training split only.
grid = GridSearchCV(models['LightGBM'], param_grid,
                    cv=StratifiedKFold(n_splits=5),
                    scoring='roc_auc')

grid.fit(X_train_red, y_train)

In [None]:
# Best mean cross-validated score found by the search (scoring='roc_auc').
grid.best_score_

In [None]:
# Hyper-parameter combination that achieved that best score.
grid.best_params_

In [None]:
print('Test Set AUC: {:.3f}'.format(roc_auc_score(y_test, grid.predict_proba(X_test_red)[:,1])))
# GridSearchCV.score() applies the search's own scorer ('roc_auc'), so it would
# print the AUC a second time under the "Accuracy" label. The refitted
# classifier's own score() is mean accuracy, which is what the label promises.
print('Test Set Accuracy: {:.3f}'.format(grid.best_estimator_.score(X_test_red, y_test)))

In [None]:
# Second, narrower search: every setting is pinned to a single value except
# the two regularisation strengths, which are swept over a finer range.
regCandidates = [0.1, 0.01, 0.5, 0.6, 0.7, 0.8, 0.9]

params_grid2 = [{
    'boosting_type': ['dart'],
    'class_weight': ['balanced'],
    'learning_rate': [0.1],
    'max_depth': [0],
    'n_estimators': [1],
    'reg_alpha': list(regCandidates),
    'reg_lambda': list(regCandidates),
}]

grid2 = GridSearchCV(
    models['LightGBM'],
    params_grid2,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5),
)

grid2.fit(X_train_red, y_train)

In [None]:
print(f'Best Scores: {grid2.best_score_}')
print(f'Best Params: {grid2.best_params_}')
print('Test Set AUC: {:.3f}'.format(roc_auc_score(y_test, grid2.predict_proba(X_test_red)[:,1])))
# GridSearchCV.score() would return ROC AUC here (the search's scorer), not
# accuracy. Ask the refitted classifier for its mean accuracy instead.
print('Test Set Accuracy: {:.3f}'.format(grid2.best_estimator_.score(X_test_red, y_test)))

In [None]:
# Train the final model on ALL labelled rows using the best hyper-parameters.
# A fresh estimator is built from those parameters rather than refitting
# grid.best_estimator_ itself: refitting in place would make any later
# grid.predict*/score call on the hold-out set use a model that has already
# seen those rows.
bestFound = grid.best_estimator_
SelectedModel = type(bestFound)(**bestFound.get_params())
SelectedModel.fit(X_red, y)

In [None]:
def Testing(df, submit):
    """Score the competition test rows and write ``submission.csv``.

    Applies the same cleaning steps used on the training frame, runs the
    notebook-level ``preprocessor`` and ``pca`` transforms, and stores the
    positive-class probability from ``SelectedModel`` in ``submit['target']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw test rows, including the ``enrollee_id`` column. Not modified.
    submit : pandas.DataFrame
        Submission template with one row per row of ``df``, in the same
        order. Its ``target`` column is overwritten in place.

    Raises
    ------
    ValueError
        If ``df`` and ``submit`` do not have the same number of rows.
    """
    # Work on a copy so the caller's frame is left as loaded. Rows are NOT
    # de-duplicated: every test row needs exactly one prediction, and dropping
    # any would break the positional match with `submit`.
    features = df.copy()

    features['gender'] = features['gender'].replace('Other', np.nan)
    features['experience'] = pd.to_numeric(
        features['experience'].replace(to_replace='[<>]', value='', regex=True),
        errors='coerce')
    features['last_new_job'] = pd.to_numeric(features['last_new_job'], errors='coerce')
    features['company_size'] = features['company_size'].replace({
        '50-99': 'SMB', '<10': 'SMB', '10/49': 'SMB',
        '100-500': 'SME', '500-999': 'SME',
        '10000+': 'Large enterprise', '5000-9999': 'Large enterprise',
        '1000-4999': 'Large enterprise',
    })

    X = features.drop(['enrollee_id'], axis=1)
    X = preprocessor.transform(X)
    X = X.toarray()
    X = pca.transform(X)

    proba = SelectedModel.predict_proba(X)[:,1]
    if len(proba) != len(submit):
        raise ValueError(
            f'Got {len(proba)} predictions for {len(submit)} submission rows')
    submit['target'] = proba
    submit.to_csv('submission.csv', index=False)
    
    
# Read the competition test split and the sample submission, then let
# Testing() fill in the predictions and write 'submission.csv'.
test= pd.read_csv('/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_test.csv')
sample_submission = pd.read_csv('/kaggle/input/hr-analytics-job-change-of-data-scientists/sample_submission.csv')
Testing(test, sample_submission)
    

In [None]:
# Reload the file written by Testing() and preview its first 10 rows.
submission = pd.read_csv('submission.csv')
submission.head(10)