# Business Understanding

A company active in Big Data and Data Science wants to hire data scientists from among the people who successfully pass courses conducted by the company. Many people sign up for its training. The company wants to know which of these candidates really want to work for the company after training or are looking for new employment, because this helps reduce cost and time, improves the quality of training, and supports course planning and the categorization of candidates. Information on demographics, education, and experience is available from each candidate's signup and enrollment.

# Data Understanding

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
df.head()

In [None]:
# Summarize column dtypes and non-null counts of the training frame.
df.info()

In [None]:
# Missing-value screening: report the number of nulls per column.

print("Number of Missing Values in Each Category:", df.isnull().sum(), sep="\n")

In [None]:
# Heatmap of the null mask: one row per record, one column per feature.
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(df.isnull(), cbar=False, ax=ax)
ax.set_xlabel('Feature')
ax.set_ylabel('Index')
ax.set_title('Missing Value Visualization')

plt.show()

In [None]:
# Class-balance check: count how many rows carry each target label.

df['target'].value_counts()

In [None]:
# Bar chart of the number of rows per target label.
plt.subplots(figsize=(10,5))
# Pass the series as `x=`: recent seaborn releases accept only `data`
# positionally, so a bare positional series is no longer read as the x variable.
sns.countplot(x=df['target'])
plt.xlabel('Target Label')
plt.ylabel('Count')
plt.title('Target Label Comparison')

plt.show()

# Data Preparation

## Extract City Code 

In [None]:
def city_code(x):
    """Return the integer code that follows the first underscore in each value.

    Parameters
    ----------
    x : pandas.Series of str
        Values shaped like ``"city_103"``.

    Returns
    -------
    pandas.Series
        The second underscore-separated field, cast to ``int64``.
    """
    pieces = x.str.split("_", expand=True)
    numeric_part = pieces[1]
    return numeric_part.astype('int64')

In [None]:
# Replace the 'city' strings with their integer codes.
# NOTE: not idempotent — re-running this cell on the already-converted column will fail in city_code.
df['city'] = city_code(df['city'])

In [None]:
# Preview the frame after the city-code conversion.
df.head()

## Missing Value Treatment

In [None]:
def null_fill(x):
    """Replace every missing value in ``x`` with the string ``'Unknown'``, in place.

    The frame passed in is modified directly; whatever the in-place
    ``fillna`` call returns is handed back unchanged.
    """
    outcome = x.fillna(value='Unknown', inplace=True)
    return outcome

In [None]:
# Fill the missing values of the training frame in place, then re-check the non-null counts.
null_fill(df)
df.info()

## Encoding

In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Ordinal encoding with an explicit category order per column. The four lists
# line up, in order, with the columns education_level, experience,
# company_size and last_new_job; 'Unknown' (the missing-value filler) is
# always the lowest rank.
# `categories` is passed by keyword: OrdinalEncoder's constructor arguments
# are keyword-only in current scikit-learn releases.
# NOTE(review): 'never' is ranked above '>4' for last_new_job — confirm that
# ordering is intended.
ord_pipe = Pipeline([
    ('ord_encode',OrdinalEncoder(categories=[
                                    ['Unknown', 'Primary School',  'High School', 'Graduate', 'Masters', 'Phd'],
                                    'Unknown,<1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,>20'.split(','),
                                    ['Unknown','<10', '10/49', '50-99', '100-500', '500-999' ,'1000-4999', '5000-9999', '10000+'],
                                    ['Unknown','1', '2', '3', '4', '>4', 'never']
                                ]))
])

In [None]:
# Columns with a natural order (ordinal-encoded).
ord_column = [
    'education_level',
    'experience',
    'company_size',
    'last_new_job',
]
# Nominal columns (one-hot encoded).
card_column = [
    'gender',
    'relevent_experience',
    'enrolled_university',
    'major_discipline',
    'company_type',
]

In [None]:
def encoder(df,ord_column,card_column):
    """Ordinal-encode ``ord_column`` and one-hot encode ``card_column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all listed columns. It is not modified; a copy is
        encoded and returned.
    ord_column : list of str
        Columns run through the module-level ``ord_pipe``.
    card_column : list of str
        Columns expanded with ``pd.get_dummies``.

    Returns
    -------
    pandas.DataFrame
        The encoded copy of ``df``.
    """
    # Work on a copy so the caller's frame is not silently altered.
    df = df.copy()
    df[ord_column] = ord_pipe.fit_transform(df[ord_column])
    # The dummy columns produced depend on the values present in this frame.
    df = pd.get_dummies(df,columns=card_column)
    return df

In [None]:
# Encode the training frame (ordinal + one-hot) and rebind df to the result.
df = encoder(df,ord_column,card_column)

In [None]:
# Preview the encoded training frame.
df.head()

## Balancing Dataset 

In [None]:
# Features: everything except the row identifier and the label.
X = df.drop(['enrollee_id', 'target'], axis=1)
# Label column.
y = df['target']

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Helper function
def balancing(X,y,smote_ratio,rus_ratio,random_state=None):
    """Oversample with SMOTE, then undersample with RandomUnderSampler.

    Parameters
    ----------
    X, y
        Features and labels to resample.
    smote_ratio
        ``sampling_strategy`` passed to ``SMOTE``.
    rus_ratio
        ``sampling_strategy`` passed to ``RandomUnderSampler``.
    random_state : optional
        Seed forwarded to both samplers; the default ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    tuple
        The resampled ``(X, y)``.
    """
    # fit_resample is the current sampler API; the older fit_sample alias
    # has been removed from imbalanced-learn.
    smote = SMOTE(sampling_strategy=smote_ratio, random_state=random_state)
    X_over, y_over = smote.fit_resample(X, y)
    rus = RandomUnderSampler(sampling_strategy=rus_ratio, random_state=random_state)
    X_bal, y_bal = rus.fit_resample(X_over, y_over)
    return X_bal, y_bal

In [None]:
# Resample with both sampling ratios set to 1.
# NOTE(review): this runs before the train/test split further down, so resampled rows
# can land on both sides of the split — consider resampling only the training part.
X,y = balancing(X,y,1,1)

In [None]:
# Label counts after resampling.
y.value_counts()

In [None]:
# Bar chart of the number of rows per target label after resampling.
plt.subplots(figsize=(10,5))
# Pass the labels as `x=`: recent seaborn releases accept only `data`
# positionally, so a bare positional series is no longer read as the x variable.
sns.countplot(x=y)
plt.xlabel('Target Label')
plt.ylabel('Count')
plt.title('Target Label Comparison')

plt.show()

## Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

In [None]:
# Score every feature against the label with f_classif and keep the top 10.
selector = SelectKBest(score_func=f_classif, k=10).fit(X, y)

# Positional indices of the retained columns, used to slice the feature frame.
cols = selector.get_support(indices=True)
X_new = X.iloc[:, cols]

In [None]:
# Names of the selected feature columns.
X_new.columns

In [None]:
# Preview the reduced feature frame.
X_new.head()

# Modeling

## Dataset Splitting 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# Shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

## Numerical Feature Treatment

In [None]:
from sklearn.preprocessing import StandardScaler, PowerTransformer

In [None]:
# Pipeline for numeric columns: a single standardization step.
num_pipe = Pipeline(steps=[('scaler', StandardScaler())])

In [None]:
# Columns passed through the numeric scaling pipeline.
num_column = [
    'city',
    'city_development_index',
]

In [None]:
def transform_scaling(X,num_column,fit=True):
    """Scale ``num_column`` of ``X`` with the module-level ``num_pipe``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing the columns to scale. It is not modified; a scaled
        copy is returned.
    num_column : list of str
        Columns to scale.
    fit : bool, default True
        When True (the original behaviour) ``num_pipe`` is re-fitted on ``X``
        before transforming. Pass False to reuse statistics learned from an
        earlier call, e.g. when scaling held-out data.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with ``num_column`` scaled.
    """
    # Copy first: X may be a slice of another frame, and the caller's data
    # should not change as a side effect.
    X = X.copy()
    if fit:
        num_pipe.fit(X[num_column])
    X[num_column] = num_pipe.transform(X[num_column])
    return X

In [None]:
# Learn the scaling statistics from the training split only and apply the
# same statistics to the test split; re-fitting on the test split would scale
# the two splits inconsistently.
# Copies avoid writing into frames returned by train_test_split.
X_train = X_train.copy()
X_test = X_test.copy()
num_pipe.fit(X_train[num_column])
X_train[num_column] = num_pipe.transform(X_train[num_column])
X_test[num_column] = num_pipe.transform(X_test[num_column])

In [None]:
# Preview the scaled training features.
X_train.head()

## Model Selection 

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# One-step pipeline; the 'clf' slot is filled in by the grid search below.
models = Pipeline(steps=[('clf', None)])

# Compare four classifiers with their default settings using 3-fold CV.
# Three metrics are recorded; the best model by F1 is refitted.
search = GridSearchCV(
    estimator=models,
    param_grid={
        'clf': [GaussianNB(), RandomForestClassifier(), LogisticRegression(), XGBClassifier()],
    },
    scoring=['accuracy', 'recall', 'f1'],
    refit='f1',
    cv=3,
    return_train_score=False,
)

In [None]:
# Run the comparison on the training split.
search.fit(X_train, y_train)

best_choice = search.best_params_
print('Best model:', best_choice)

# Per-candidate cross-validation results as a table.
model_comparison = pd.DataFrame(search.cv_results_)
model_comparison

In [None]:
# Human-readable label per result row.
# Assumes cv_results_ rows come in the same order as the 'clf' candidates in the grid.
model_comparison['model'] = ['Naive Bayes', 'Random Forest', 'Logistic Regression', 'XGB Classifier']

fig, ax = plt.subplots(figsize=(10,5))
# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally.
ax = sns.barplot(x='model', y='mean_test_accuracy', data=model_comparison, capsize=.05, palette='Blues', ci=None, ax=ax)
ax.set_xlabel("Models",fontsize=12)
# mean_test_accuracy is a fraction in [0, 1], not a percentage.
ax.set_ylabel("Mean CV Accuracy",fontsize=12)
ax.tick_params(labelsize=12)
ax.axes.set_title("Accuracy Between Models", fontsize=12)

plt.show()

## Hyperparameter Tuning 

In [None]:
# Search space for the XGBoost tuning step.
param_xgb = dict(
    max_depth=np.arange(1, 4),
    n_estimators=np.arange(169, 171),
    gamma=np.arange(8, 11),
    reg_alpha=np.linspace(0.33, 0.34, 100),
    reg_lambda=np.linspace(0.15, 0.16, 100),
)

# Base estimator to tune: seeded, using all available cores.
model_selected = XGBClassifier(random_state=42, n_jobs=-1)

In [None]:
# Randomized search: 100 sampled configurations, 3-fold CV, fixed seed.
model_xgb = RandomizedSearchCV(
    estimator=model_selected,
    param_distributions=param_xgb,
    n_iter=100,
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
model_xgb.fit(X_train, y_train)

print(model_xgb.best_params_)
# Report the refitted best model's score on both splits.
for split_name, features, labels in (("Train", X_train, y_train), ("Test", X_test, y_test)):
    print(split_name + " data accuracy score: ", model_xgb.score(features, labels))

# Evaluation

In [None]:
from sklearn.metrics import plot_confusion_matrix, classification_report, roc_auc_score, roc_curve

In [None]:
# Predicted labels for the held-out test split.
y_pred = model_xgb.predict(X_test)

In [None]:
# Confusion Matrix
# Built from confusion_matrix + ConfusionMatrixDisplay: the
# plot_confusion_matrix helper was deprecated in scikit-learn 1.0 and removed
# in 1.2, so this cell no longer relies on it.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

fig,ax = plt.subplots(figsize=(10,5))
# normalize='true' divides each row by the number of actual samples of that class.
cm = confusion_matrix(y_test, model_xgb.predict(X_test), normalize='true')
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model_xgb.classes_).plot(cmap=plt.cm.Blues, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix (Normalized)')

In [None]:
# Classification report for the test split, comparing true labels with predicted labels.

print(classification_report(y_test,y_pred))

In [None]:
# ROC-AUC Score
# AUC measures ranking quality, so it is computed from the positive-class
# probabilities. Hard 0/1 predictions reduce the ROC curve to a single
# operating point and give a value inconsistent with the plotted curve.
roc_auc = roc_auc_score(y_test, model_xgb.predict_proba(X_test)[:, 1])
print("ROC-AUC score is",roc_auc)

In [None]:
# Predicted class probabilities for the test split (one column per class).
y_pred_proba = model_xgb.predict_proba(X_test)

In [None]:
# Second column of the probability matrix, kept as a plain list.
a = list(y_pred_proba[:, 1])

In [None]:
# False-positive and true-positive rates for the ROC curve; the third return value (thresholds) is discarded.
fpr,tpr,_ = roc_curve(y_test, a)

In [None]:
# ROC curve with the diagonal drawn as a reference line.
# plt.subplots returns a (figure, axes) pair, so unpack it instead of
# binding the whole tuple to `fig`.
fig, ax = plt.subplots(figsize=(7,7))

ax.plot([0, 1],[0, 1],color='navy',linestyle='--')
ax.plot(fpr,tpr,color='orange',label=f'ROC curve (AUC = {round(roc_auc, 3)})')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc='lower right')
ax.set_title('ROC Curve')

plt.show()

# Prediction

In [None]:
# Load the prediction CSV from the Kaggle input directory and preview the first rows.
df_test = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_test.csv')
df_test.head()

In [None]:
# Summarize column dtypes and non-null counts of the prediction frame.
df_test.info()

## Preparation

In [None]:
# Same city-code extraction as applied to the training frame.
df_test['city'] = city_code(df_test['city'])

In [None]:
# Fill missing values in the prediction frame in place.
null_fill(df_test)

In [None]:
# Apply the same ordinal + one-hot encoding to the prediction frame.
# NOTE(review): get_dummies only creates columns for values present in this file,
# so the dummy columns may differ from those of the training frame.
df_test = encoder(df_test,ord_column,card_column)

In [None]:
# Align the prediction frame to exactly the feature columns (and column
# order) the model was trained on, instead of a hand-copied list that can
# drift from the feature selection result. A dummy column that does not occur
# in this file is created and filled with 0 rather than raising a KeyError.
df_test = df_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Reuse the scaler that has already been fitted instead of re-fitting it on
# the prediction data; re-fitting would scale these rows with different
# statistics than the data the model was trained and evaluated on.
df_test = df_test.copy()
df_test[num_column] = num_pipe.transform(df_test[num_column])

In [None]:
# Preview the first six prepared prediction rows.
df_test.head(6)

In [None]:
# Predict labels for the prepared prediction frame with the tuned model.

y_test_pred = model_xgb.predict(df_test)

In [None]:
# Class probabilities for the same prediction rows.
y_test_pred_proba = model_xgb.predict_proba(df_test)

In [None]:
# Display the probabilities as a table.
pd.DataFrame(y_test_pred_proba)

## Submission 

In [None]:
# Write the predicted labels to submission.csv without the row index.
# NOTE(review): the file contains no enrollee_id, so predictions can only be matched
# to candidates by row order — confirm this is the intended submission format.
pd.DataFrame(y_test_pred).to_csv('submission.csv',index=False)