In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score as cv
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Introduction**

In [None]:
# Read the train and test CSV files of the job-change dataset into DataFrames.
train_path = '/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv'
test_path = '/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_test.csv'
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# Column dtypes and non-null counts of the training set.
train_df.info()

In [None]:
# Column dtypes and non-null counts of the test set.
test_df.info()

In [None]:
# Missing-value summary for the training set: count and percentage of missing rows per column.
train_na = pd.DataFrame({'na_count': train_df.isna().sum()})
# isna().mean() is the fraction of missing rows per column, so the percentage stays
# correct for any number of rows without a hard-coded row count.
train_na['percentage'] = train_df.isna().mean() * 100
train_na

In [None]:
# Visualise where values are missing in the training set (one cell per row/column pair).
train_missing_mask = train_df.isna()
sns.heatmap(train_missing_mask, cbar=False)

In [None]:
# Missing-value summary for the test set: count and percentage of missing rows per column.
test_na = pd.DataFrame({'na_count': test_df.isna().sum()})
# isna().mean() is the fraction of missing rows per column, so the percentage stays
# correct for any number of rows without a hard-coded row count.
test_na['percentage'] = test_df.isna().mean() * 100
test_na

In [None]:
# Visualise where values are missing in the test set (one cell per row/column pair).
test_missing_mask = test_df.isna()
sns.heatmap(test_missing_mask, cbar=False)

In [None]:
# Preview the first rows of the training set.
train_df.head()

**Preprocessing**

In [None]:
# Distinct values of last_new_job, inspected before the column is converted to numbers.
train_df.last_new_job.unique()

In [None]:
# Distinct values of company_type.
train_df.company_type.unique()

In [None]:
# Distinct values of company_size.
train_df.company_size.unique()

In [None]:
# Distinct values of city.
train_df.city.unique()

In [None]:
# First, the city column has a very large number of unique values, which can cause problems during one-hot encoding.

In [None]:
# Remove the 'city' column from both the training and the test set.
train_df = train_df.drop(columns=['city'])
test_df = test_df.drop(columns=['city'])

In [None]:
# Rule of thumb used here: a column with more than 15% missing values is a candidate
# for removal. company_size and company_type are dropped from both splits.
sparse_columns = ['company_size', 'company_type']
train_df = train_df.drop(columns=sparse_columns)
test_df = test_df.drop(columns=sparse_columns)


In [None]:
def distribution_plot(data, column):
    """Draw and show a count plot of one column.

    Parameters
    ----------
    data : DataFrame passed to seaborn as the plot's data source.
    column : name of the column whose value counts are plotted on the x axis.
    """
    sns.countplot(x=column, data=data)
    plt.show()

In [None]:
# Plot the value distribution of every object-typed column of the training set.
object_list = train_df.select_dtypes(include=['object']).columns.tolist()
for column_name in object_list:
    distribution_plot(train_df, column_name)

In [None]:
# Encode relevent_experience as 1 (has experience) / 0 (no experience) in both splits.
relevent_experience_codes = {'Has relevent experience':1,'No relevent experience':0}
for frame in (train_df, test_df):
    frame['relevent_experience'] = frame['relevent_experience'].replace(relevent_experience_codes)

In [None]:
# Turn last_new_job into a float column: 'never' becomes 0 and '>4' becomes 5.
last_new_job_codes = {'never':0,'>4':5}
for frame in (train_df, test_df):
    frame['last_new_job'] = frame['last_new_job'].replace(last_new_job_codes).astype('float')

In [None]:
# Convert experience to a float column ('<1' -> 0, '>20' -> 21), then derive
# experience_per_job = experience / (last_new_job + 1) as a rough indicator of how
# often a candidate changes jobs.
experience_codes = {'<1':0,'>20':21}
for frame in (train_df, test_df):
    frame['experience'] = frame['experience'].replace(experience_codes).astype('float')
    # The +1 keeps the divisor non-zero when last_new_job is 0.
    frame['experience_per_job'] = frame['experience'] / (frame['last_new_job'] + 1)

**Pipeline**

In [None]:
# Feature columns are selected from test_df, which has no 'target' column.
# The identifier is removed by name: a positional slice such as [1:-1] would also discard
# the last numeric column, which is the engineered 'experience_per_job' feature.
numerical_cols = (
    test_df.select_dtypes(exclude=['object'])
    .columns
    .drop('enrollee_id', errors='ignore')
)
categorical_cols = test_df.select_dtypes(include=['object']).columns

# Numeric gaps are filled with the column mean, categorical gaps with the most frequent value.
imp_mean_numerical = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_most_frequent_categorical = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

numerical_transformer_imputer = imp_mean_numerical

# Categorical columns are imputed first and then one-hot encoded; categories that were not
# seen during fitting are ignored at transform time instead of raising an error.
categorical_transformer_simple = Pipeline(steps=[
    ('imputer', imp_most_frequent_categorical),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Apply each transformer to its column group; columns listed in neither group are not
# passed through (ColumnTransformer's default remainder is 'drop').
data_transformer_simple = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer_imputer, numerical_cols),
        ('cat', categorical_transformer_simple, categorical_cols)
    ])



In [None]:
# Remove the identifier column by name. Unlike a positional slice, this is safe to re-run:
# a second execution is a no-op instead of stripping another feature column.
test_df = test_df.drop(columns=['enrollee_id'], errors='ignore')
# Preview only the first rows rather than displaying the whole frame.
test_df.head()

In [None]:
# Separate the features (every column except 'target') from the label.
# y is kept as a single-column DataFrame.
X = train_df.drop(columns='target')
y = train_df[['target']]

In [None]:
# The enrollee identifier is not used as a feature.
X = X.drop(columns=['enrollee_id'])

In [None]:
# Hold out 33% of the labelled rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

**XGBOOST**

In [None]:
# Chain the shared preprocessing with an XGBoost classifier using default settings.
xgb_steps = [
    ('preprocessor', data_transformer_simple),
    ('model', XGBClassifier()),
]
target_pipeline_xgbclas = Pipeline(steps=xgb_steps)

In [None]:
# Search space for the XGBoost step; the 'model__' prefix routes each setting to the
# pipeline step named 'model'.
params = dict(
    model__colsample_bytree=[0.3, 0.7],
    model__n_estimators=[25, 50, 100],
    model__max_depth=range(4, 8),
)

In [None]:
# Randomized hyper-parameter search over the XGBoost pipeline with 3-fold cross-validation.
# random_state fixes which parameter combinations are sampled, so the reported best
# parameters and score are reproducible between runs.
randomized_model = RandomizedSearchCV(
    target_pipeline_xgbclas,
    params,
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

In [None]:
# Run the hyper-parameter search on the training split.
randomized_model.fit(X_train,y_train)

In [None]:
print("Best parameters found: ", randomized_model.best_params_)
# best_score_ is the mean cross-validated score of the best candidate. The search was
# created without a `scoring` argument, so this is the classifier's default score
# (accuracy) and is reported as such rather than as a square-rooted "RMSE".
print("Best mean CV accuracy found: ", randomized_model.best_score_)

In [None]:
# 5-fold cross-validation of the tuned XGBoost pipeline on all labelled data
# (`cv` is the imported alias of cross_val_score).
best_xgb_pipeline = randomized_model.best_estimator_
cv_results = cv(best_xgb_pipeline, X, y, cv=5)

In [None]:
# Average score over the cross-validation folds.
mean_cv_score = cv_results.mean()
print('XGBOOST Mean of Cross validation is = ' + str(mean_cv_score))

In [None]:
# Predict the held-out split with the tuned XGBoost pipeline and summarise the
# result as a confusion matrix.
best_xgb_pipeline = randomized_model.best_estimator_
test_predictions = best_xgb_pipeline.predict(X_test)
cm = confusion_matrix(y_test, test_predictions)
print(cm)

# cm[1, 1] counts the rows whose actual label is 1 and whose predicted label is 1.
true_positives = cm[1,1]
print("The XGBOOST number of true positives is: {}".format(true_positives))

In [None]:
# Precision of the XGBoost predictions on the held-out split.
score = precision_score(y_test, test_predictions)

# Print the result with five decimal places. The format spec has to be '.5f'
# (a precision); '5f' would only set a minimum field width of 5.
print("The XGBOOST precision value is {0:.5f}".format(score))

In [None]:
# Compare the mean absolute error between true and predicted labels on the training and the
# held-out split. A training error far below the testing error points to overfitting;
# similar values suggest the model generalises to unseen rows.

print('The training error is {0:.5f}'.format(
  mae(y_train, randomized_model.best_estimator_.predict(X_train))))
print('The XGBOOST testing error is {0:.5f}'.format(
  mae(y_test, randomized_model.best_estimator_.predict(X_test))))

In [None]:
# F1 score of the XGBoost predictions on the held-out split.
xgb_f1 = f1_score(y_test, test_predictions)
print('The XGBOOST f1_score  is {0:.5f}'.format(xgb_f1))

**RandomForest**

In [None]:
# Chain the shared preprocessing with a random forest classifier.
# random_state seeds the forest's bootstrap sampling and feature selection, so the
# scores reported below are reproducible between runs.
target_pipeline_rf = Pipeline(steps=[
    ('preprocessor', data_transformer_simple),
    ('model', RandomForestClassifier(random_state=42))
])



In [None]:
# Search space for the random forest step; the 'model__' prefix routes each setting to the
# pipeline step named 'model'.
rf_params = dict(
    model__max_depth=[2, 5, 8, 19],
    model__max_features=[2, 5, 8],
    model__n_estimators=[10, 500, 1000],
    model__min_samples_split=[2, 5, 10],
)

In [None]:
# Randomized hyper-parameter search over the random forest pipeline with 3-fold
# cross-validation. random_state fixes which parameter combinations are sampled, so the
# reported best parameters and score are reproducible between runs.
random_forest_randomized = RandomizedSearchCV(
    target_pipeline_rf,
    rf_params,
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

In [None]:
# Run the hyper-parameter search on the training split.
random_forest_randomized.fit(X_train,y_train)

In [None]:
print("Best parameters found: ", random_forest_randomized.best_params_)
# best_score_ is the mean cross-validated score of the best candidate. The search was
# created without a `scoring` argument, so this is the classifier's default score
# (accuracy) and is reported as such rather than as a square-rooted "rmse".
print("Best mean CV accuracy found: ", random_forest_randomized.best_score_)

In [None]:
# 10-fold cross-validation of the tuned random forest pipeline on all labelled data
# (`cv` is the imported alias of cross_val_score).
# NOTE(review): the XGBoost pipeline is cross-validated with 5 folds, so the two means
# are not computed on identical fold layouts.
best_rf_pipeline = random_forest_randomized.best_estimator_
cv_results = cv(best_rf_pipeline, X, y, cv=10)

In [None]:
# Average score over the cross-validation folds.
mean_cv_score = cv_results.mean()
print('RandomForest Mean of Cross validation is = ' + str(mean_cv_score))

In [None]:
# Predict the held-out split with the tuned random forest pipeline and summarise the
# result as a confusion matrix.
best_rf_pipeline = random_forest_randomized.best_estimator_
test_predictions = best_rf_pipeline.predict(X_test)
cm = confusion_matrix(y_test, test_predictions)
print(cm)

# cm[1, 1] counts the rows whose actual label is 1 and whose predicted label is 1.
true_positives = cm[1,1]
print("The RandomForest number of true positives is: {}".format(true_positives))

In [None]:
# Precision of the random forest predictions on the held-out split, shown with five decimals.
score = precision_score(y_test, test_predictions)
precision_message = "The RandomForest precision value is {0:.5f}".format(score)
print(precision_message)

In [None]:
# Compare the mean absolute error between true and predicted labels on the training and the
# held-out split. A training error far below the testing error points to overfitting;
# similar values suggest the model generalises to unseen rows.

print('The training error is {0:.5f}'.format(
  mae(y_train, random_forest_randomized.best_estimator_.predict(X_train))))
print('The RandomForest testing error is {0:.5f}'.format(
  mae(y_test, random_forest_randomized.best_estimator_.predict(X_test))))

In [None]:
# F1 score of the random forest predictions on the held-out split.
rf_f1 = f1_score(y_test, test_predictions)
print('The RandomForest f1_score  is {0:.5f}'.format(rf_f1))

In [None]:
# XGBoost and RandomForest produce very similar results.

In [None]:
# Predict the unlabelled test set with the tuned XGBoost pipeline and cast the labels to int.
test_set_predictions = randomized_model.best_estimator_.predict(test_df)
target = test_set_predictions.astype('int')

In [None]:
# Re-read the raw test file; this replaces the preprocessed test_df with the original columns.
test_df = pd.read_csv('/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_test.csv')

In [None]:
# Attach the predicted labels to the test rows as a new 'target' column.
test_df['target'] = target

In [None]:
# Keep only the first column and the last column (the 'target' column added to test_df).
# presumably the first column is the enrollee identifier — verify against the raw file.
submission_csv =test_df.iloc[:,[0,-1]]

In [None]:
# Persist the submission without the index column; the '.csv' extension lets other
# tools recognise the file type.
submission_csv.to_csv("/kaggle/working/submission.csv", index=False)

In [None]:
# Sanity-check the submission: its dimensions and the number of missing values per column.
submission_shape = submission_csv.shape
missing_per_column = submission_csv.isnull().sum()
print(submission_shape)
print(missing_per_column)

In [None]:
# Display the submission frame.
submission_csv


In [None]:
# Feature importances of the tuned XGBoost model, labelled with the features it was trained on.
best_pipeline = randomized_model.best_estimator_
feature_importance = best_pipeline.named_steps['model'].feature_importances_
# The model is fitted on the output of the 'preprocessor' step (imputed numeric columns plus
# one-hot encoded categories), so the importances have one entry per transformed feature.
# Pairing them with X.columns would attach the wrong labels and silently truncate the list.
feature_names = best_pipeline.named_steps['preprocessor'].get_feature_names_out()
feature_imp = pd.DataFrame(sorted(zip(feature_importance, feature_names)), columns=['Value','Feature'])

plt.figure(figsize=(50, 40))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
# The importances come from the single refitted best estimator, not from an average over folds.
plt.title('XGBoost feature importances')
plt.tight_layout()
plt.show()

In [None]:
# Rename the importance column for display.
# NOTE(review): the values are not multiplied by 100 anywhere in this notebook — confirm
# that 'Percentage' is the intended label.
feature_imp = feature_imp.rename(columns={'Value':'Percentage'})

In [None]:
# Display the feature-importance table.
feature_imp

In [None]:
# Recall that we created the new column 'experience_per_job'; it turns out to be our 3rd most important feature.