In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Context and Content
A company which is active in Big Data and Data Science wants to hire data scientists from among the people who successfully pass courses conducted by the company. Many people sign up for the training. The company wants to know which of these candidates really want to work for the company after training and which are looking for new employment, because this helps reduce cost and time and improves the quality of training, course planning and the categorization of candidates. Information related to demographics, education and experience is available from the candidates' signup and enrollment.

This dataset is also designed to help HR research understand the factors that lead a person to leave their current job. Using model(s) built on the current credentials, demographics and experience data, you will predict the probability that a candidate will look for a new job or will work for the company, and interpret the factors that affect the employee's decision.

The whole dataset is divided into train and test sets. The target isn't included in the test set, but a file with the test target values is available for related tasks. A sample submission corresponding to the enrollee_id values of the test set is also provided, with columns: enrollee_id, target

## Note:

- The dataset is imbalanced.
- Most features are categorical (Nominal, Ordinal, Binary), some with high cardinality.
- Missing imputation can be a part of your pipeline as well.

## Features

- enrollee_id : Unique ID for candidate

- city: City code

- city_development_index : Development index of the city (scaled)

- gender: Gender of candidate

- relevent_experience: Relevant experience of candidate

- enrolled_university: Type of University course enrolled if any

- education_level: Education level of candidate

- major_discipline :Education major discipline of candidate

- experience: Candidate total experience in years

- company_size: No of employees in current employer's company

- company_type : Type of current employer

- last_new_job: Difference in years between previous job and current job

- training_hours: training hours completed

- target: 0 – Not looking for job change, 1 – Looking for a job change

## Inspiration
- Predict the probability of a candidate will work for the company
- Interpret model(s) such a way that illustrate which features affect candidate decision

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the training split (path is relative to the notebook's working directory)
# and preview its first rows.
train_csv_path = "../input/hr-analytics-job-change-of-data-scientists/aug_train.csv"
df_train = pd.read_csv(train_csv_path)
df_train.head()

In [None]:
# Summarise every column's dtype and non-null count to see where values are missing.
df_train.info()

In [None]:
# Class balance of the target: raw counts first, then each count as a share of all rows.
target_counts = df_train["target"].value_counts()
n_rows = df_train.shape[0]
print("Absolute Observation:\n{}".format(target_counts))
print("\nRelative Observation:\n{}".format(target_counts / n_rows))

### The target classes are imbalanced.

## Basic EDA to understand which transformations to apply

In [None]:
# Frequency of each city code, to inspect the raw label format before encoding it.
df_train.city.value_counts()

In [None]:
# Keep only the part after the first underscore of each city label and store it as an integer.
city_label_parts = df_train["city"].str.split("_")
df_train['city'] = list(map(lambda parts: int(parts[1]), city_label_parts))

In [None]:
# Missing gender becomes its own explicit category, then the category sizes are plotted.
df_train['gender'] = df_train["gender"].fillna("Unknown_gender")
gender_counts = df_train["gender"].value_counts()
plt.barh(gender_counts.index, gender_counts.values)
plt.show()

In [None]:
# Binary flag: 1.0 when the candidate has relevant experience, 0.0 otherwise.
relevant_experience_flag = {"Has relevent experience":1., "No relevent experience":0.,}
df_train["relevent_experience"] = df_train["relevent_experience"].map(relevant_experience_flag)
relevant_experience_counts = df_train["relevent_experience"].value_counts()
plt.barh(relevant_experience_counts.index, relevant_experience_counts.values)

In [None]:
# Missing enrolment info becomes the "Unknown" level before the numeric encoding is applied.
enrollment_codes = {"Unknown":1., "Part time course":2., "Full time course":3., "no_enrollment":0.}
df_train["enrolled_university"] = (
    df_train["enrolled_university"].fillna("Unknown").map(enrollment_codes)
)
enrollment_counts = df_train["enrolled_university"].value_counts()
plt.barh(enrollment_counts.index, enrollment_counts.values)

In [None]:
# Ordinal encoding of the education level; missing values take the "Unknown" code (0).
education_codes = {"Unknown":0., "Primary School":1., "High School":2., "Graduate":3., "Masters":4., "Phd":5.}
df_train["education_level"] = (
    df_train["education_level"].fillna("Unknown").map(education_codes)
)
education_counts = df_train["education_level"].value_counts()
plt.barh(education_counts.index, education_counts.values)

In [None]:
# Missing major discipline becomes its own explicit category; plot the category sizes.
df_train["major_discipline"] = df_train["major_discipline"].fillna("Unknown_discipline")
discipline_counts = df_train["major_discipline"].value_counts()
plt.barh(discipline_counts.index, discipline_counts.values)

In [None]:
##  Experience:
##  - ">20" is encoded as 21 and "<1" as 0; every other label is parsed as a number.
##  - Missing (or unparseable) values become -1 so that "unknown" stays a distinct level.
experience_raw = df_train["experience"]
# errors="coerce" turns anything non-numeric into NaN explicitly, replacing the former
# bare `except:` that silently swallowed every exception.
experience_years = pd.to_numeric(experience_raw, errors="coerce").astype(float)
experience_years = experience_years.mask(experience_raw.isin([">20"]), 21.)
experience_years = experience_years.mask(experience_raw.isin(["<1"]), 0.)
# Assign the filled column back to the frame: calling fillna(inplace=True) on the selected
# column is a chained in-place update, which does not reach df_train under pandas Copy-on-Write.
df_train["experience"] = experience_years.fillna(-1.)
experience_counts = df_train["experience"].value_counts()
plt.barh(experience_counts.index, experience_counts.values)

In [None]:
# Replace missing company_size values with the label "Unknown".
# The result is assigned back to the frame: fillna(inplace=True) on the selected column is a
# chained in-place update, which does not reach df_train under pandas Copy-on-Write.
df_train["company_size"] = df_train["company_size"].fillna("Unknown")
company_size_counts = df_train["company_size"].value_counts()
plt.barh(company_size_counts.index, company_size_counts.values)

In [None]:
# Ordinal encoding of the company-size bands, from 0 ("Unknown") up to 8 ("10000+").
company_size_codes = {"Unknown": 0., "<10": 1., "10/49":2., "50-99": 3., "100-500":4. , "500-999":5., "1000-4999":6., "5000-9999": 7., "10000+": 8.}
df_train["company_size"] = df_train["company_size"].map(company_size_codes)
encoded_size_counts = df_train["company_size"].value_counts()
plt.barh(encoded_size_counts.index, encoded_size_counts.values)

In [None]:
# Replace missing company_type values with the label "Unknown_company_type".
# The result is assigned back to the frame: fillna(inplace=True) on the selected column is a
# chained in-place update, which does not reach df_train under pandas Copy-on-Write.
df_train["company_type"] = df_train["company_type"].fillna("Unknown_company_type")
company_type_counts = df_train["company_type"].value_counts()
plt.barh(company_type_counts.index, company_type_counts.values)

In [None]:
## Replace missing last_new_job values with the label "never", then encode the labels
## as floats ("never" -> 0, "1".."4" -> 1..4, ">4" -> 5).
# The filled column is assigned back to the frame: fillna(inplace=True) on the selected column
# is a chained in-place update, which does not reach df_train under pandas Copy-on-Write.
last_new_job_codes = {"4":4., "3":3., "2":2., "1":1., ">4":5., "never":0.}
df_train["last_new_job"] = df_train["last_new_job"].fillna("never").map(last_new_job_codes)
last_new_job_counts = df_train["last_new_job"].value_counts()
plt.barh(last_new_job_counts.index, last_new_job_counts.values)

In [None]:
# Preview the frame after the column-by-column cleaning above.
df_train.head()

In [None]:
# Re-check dtypes and non-null counts after the cleaning steps.
df_train.info()

## Cast to dummy following features:
- company type
- major discipline
- gender


In [None]:
# One-hot encode the three nominal columns. Each entry maps the source column to the
# prefix used for its indicator columns.
nominal_prefixes = {
    "gender": "gender_",
    "major_discipline": "major_discipl_",
    "company_type": "company_type_",
}
for nominal_col, column_prefix in nominal_prefixes.items():
    indicator_cols = pd.get_dummies(df_train[nominal_col]).add_prefix(column_prefix)
    df_train = pd.concat([df_train, indicator_cols], axis=1)

# The original text columns are dropped once all indicator columns have been appended.
df_train = df_train.drop(columns=['gender', "company_type", "major_discipline"])
df_train.head()

## We'll define the training and validation dataset:

In [None]:
# Features: every column except the identifier and the label. Label: the target column.
non_feature_columns = ['enrollee_id', "target"]
y = df_train['target']
X = df_train.drop(columns=non_feature_columns)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the rows for validation, stratified on the target so that both parts keep
# the same class ratio. The split is seeded: without random_state every run produces a
# different split, so the fitted trees, the scores and the selected model are not reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=.33, stratify=y, random_state=0
)

print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

### Considering that the dataset is a dichotomous variable matrix, we cannot apply linear models because the assumptions will not be respected.
### Usually the models that work best with this type of matrix are trees and forests. 
### I usually prefer trees because they are easy to represent to the general manager.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, f1_score, classification_report

In [None]:
# Fit an unpruned tree with a fixed seed; the last line displays the fitted estimator.
model_tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
model_tree

### I use the technique of cost complexity pruning to prune the tree. This technique allows you to prune the tree according to the importance of the nodes.

In [None]:
# Compute the cost-complexity pruning path on the training data and keep its two arrays:
# the candidate alphas and the impurity values reported alongside them.
path = model_tree.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities

In [None]:
# Total leaf impurity as a function of the effective pruning alpha.
fig, ax = plt.subplots(figsize=(10,5))
step_style = dict(marker='o', drawstyle="steps-post")
ax.plot(ccp_alphas[:-1], impurities[:-1], **step_style)
# The last alpha is drawn separately in red; its legend label is Italian for
# "tree with a single node".
ax.plot(ccp_alphas[-1], impurities[-1], color="red", label="Albero con un solo nodo", **step_style)
ax.legend()
ax.set(xlabel="effective alpha", ylabel="total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")

In [None]:
# Fit one seeded tree per candidate alpha, in the same order as ccp_alphas.
clfs = [
    DecisionTreeClassifier(random_state=0, ccp_alpha=alpha).fit(X_train, y_train)
    for alpha in ccp_alphas
]
# Report the size of the tree fitted with the last alpha.
last_tree_node_count = clfs[-1].tree_.node_count
print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
      last_tree_node_count, ccp_alphas[-1]))

In [None]:
 # show when decrease the nodes/depth as the parameter increases

# Drop the tree fitted with the last alpha (expected to be the trivial single-node tree)
# together with its alpha. The guard makes this cell safe to re-run: an unconditional
# `[:-1]` would discard one more real model on every execution.
if len(clfs) > 1 and clfs[-1].tree_.node_count == 1:
    clfs = clfs[:-1]
    ccp_alphas = ccp_alphas[:-1]

# Size and depth of every remaining tree, in the same order as ccp_alphas.
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
fig, ax = plt.subplots(2, 1, figsize=(10, 7))
ax[0].plot(ccp_alphas, node_counts, marker='o', drawstyle="steps-post")
ax[0].set_xlabel("alpha")
ax[0].set_ylabel("number of nodes")
ax[0].set_title("Number of nodes vs alpha")
ax[1].plot(ccp_alphas, depth, marker='o', drawstyle="steps-post")
ax[1].set_xlabel("alpha")
ax[1].set_ylabel("depth of tree")
ax[1].set_title("Depth vs alpha")
fig.tight_layout()
# plt.show() renders the figure; the former plt.plot() only added an empty artist
# and printed `[]` as the cell output.
plt.show()

In [None]:
# Macro-averaged F1 of every candidate tree on the training and validation parts.
train_scores = []
test_scores = []
for clf in clfs:
    train_scores.append(f1_score(y_train, clf.predict(X_train), average='macro'))
    test_scores.append(f1_score(y_valid, clf.predict(X_valid), average='macro'))

fig, ax = plt.subplots()
ax.set(xlabel="alpha", ylabel="F1", title="F1 vs alpha for training and testing sets")
for curve_label, curve_scores in (("train", train_scores), ("test", test_scores)):
    ax.plot(ccp_alphas, curve_scores, marker='o', label=curve_label,
            drawstyle="steps-post")
ax.legend()
plt.show()
# Best validation F1 and the position of the first tree that reaches it.
best_valid_f1 = max(test_scores)
print("Max F1: {}. Index: {}".format(round(best_valid_f1,4), test_scores.index(best_valid_f1)))

In [None]:
# Accuracy of every candidate tree on the training and validation parts
# (clf.score is the classifier's default scoring method).
train_scores = [clf.score(X_train, y_train) for clf in clfs]
test_scores = [clf.score(X_valid, y_valid) for clf in clfs]

fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label="train",
        drawstyle="steps-post")
ax.plot(ccp_alphas, test_scores, marker='o', label="test",
        drawstyle="steps-post")
ax.legend()
plt.show()
# The values printed here are accuracies, so the label says so (it previously read "Max F1").
best_valid_accuracy = max(test_scores)
print("Max Accuracy: {}. Index: {}".format(round(best_valid_accuracy,4), test_scores.index(best_valid_accuracy)))

In [None]:
# Select the best trees from their validation scores instead of hard-coded list positions:
# fixed indices are only valid for the single run they were read from and silently pick an
# arbitrary tree (or raise IndexError) whenever the split or the pruning path changes.
f1_valid_scores = [f1_score(y_valid, clf.predict(X_valid), average='macro') for clf in clfs]
accuracy_valid_scores = [clf.score(X_valid, y_valid) for clf in clfs]
# np.argmax returns the first position of the maximum, matching list.index(max(...)).
model_bestF1 = clfs[int(np.argmax(f1_valid_scores))]
model_bestAcu = clfs[int(np.argmax(accuracy_valid_scores))]

## I prefer best F1 Score model (model_bestF1).

In [None]:
# Results of the model selected on F1 score: predictions on both parts of the data,
# followed by a confusion matrix and classification report for each part.
yh_train_bestF1 = model_bestF1.predict(X_train)
yh_test_bestF1 = model_bestF1.predict(X_valid)

train_confusion = confusion_matrix(y_train, yh_train_bestF1)
train_report = classification_report(y_train, yh_train_bestF1)
valid_confusion = confusion_matrix(y_valid, yh_test_bestF1)
valid_report = classification_report(y_valid, yh_test_bestF1)

print("---"*30, "\nTRAINING DATA:\n")
print("Confusion Matrix (train data):\n{}\n".format(train_confusion))
print("Classification Report (train data):\n{}\n".format(train_report))

print("---"*30, "\nVALIDATION DATA:\n")
print("Confusion Matrix (valid data):\n{}\n".format(valid_confusion))
print("Classification Report (valid data):\n{}\n".format(valid_report))

## The model selected through cost complexity pruning gives us an accuracy of 80%, a weighted accuracy of 74%, a recall of 75% and an F1 score of 74%. Most importantly, the selected model does not appear to suffer from overfitting, so it should generalize well.

## I show the most significant variables:

In [None]:
# Feature importances reported by the selected tree.
importance = model_bestF1.feature_importances_
# One printed line per feature position with its importance score.
for feature_position, importance_score in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (feature_position, importance_score))
# Horizontal bar chart labelled with the training column names.
feature_labels = [X.columns[position] for position in range(len(importance))]
plt.figure(figsize=(7,10))
plt.barh(feature_labels, importance)
plt.show()

## Features such as city_development_index, company_size and education_level are the most important to help the model classify.

## Tree representation:

In [None]:
from sklearn import tree

In [None]:
# Draw the selected tree on a large canvas so that the nodes stay readable.
tree_fig, tree_ax = plt.subplots(figsize=(20,15))
tree.plot_tree(model_bestF1, ax=tree_ax)
plt.show()

# Defining my pipeline encoder:

In [None]:
def encodeGendere(X):
    """One-hot encode the ``gender`` column.

    Missing values are first labelled "Unknown_gender"; this fill is written to the
    frame that was passed in. Returns a new frame in which ``gender`` is replaced by
    one ``gender_<label>`` indicator column per label, appended after the other columns.
    """
    X['gender'] = X['gender'].fillna("Unknown_gender")
    indicator_cols = pd.get_dummies(X["gender"]).add_prefix("gender_")
    return pd.concat([X, indicator_cols], axis=1).drop(columns=['gender'])

def encodeRelevent_Experience(X):
    """Encode ``relevent_experience`` in place as 1.0 (has) / 0.0 (has not).

    Labels outside the mapping become NaN. Returns None.
    """
    experience_flag = {"Has relevent experience":1., "No relevent experience":0.,}
    X["relevent_experience"] = X["relevent_experience"].map(experience_flag)

def encodeEnrolled_university(X):
    """Encode ``enrolled_university`` in place as a float code.

    Missing values are treated as "Unknown" (code 1.0). Returns None.
    """
    enrollment_codes = {"Unknown":1., "Part time course":2., "Full time course":3., "no_enrollment":0.}
    filled = X["enrolled_university"].fillna("Unknown")
    X["enrolled_university"] = filled.map(enrollment_codes)

def encodeEducation_level(X):
    """Encode ``education_level`` in place as an ordinal float code (0.0 - 5.0).

    Missing values are treated as "Unknown" (code 0.0). Returns None.
    """
    education_codes = {"Unknown":0., "Primary School":1., "High School":2., "Graduate":3., "Masters":4., "Phd":5.}
    filled = X["education_level"].fillna("Unknown")
    X["education_level"] = filled.map(education_codes)
    
def encodeMajor_discipline(X):
    """One-hot encode the ``major_discipline`` column.

    Missing values are first labelled "Unknown_discipline"; this fill is written to the
    frame that was passed in. Returns a new frame in which ``major_discipline`` is
    replaced by one ``major_discipl_<label>`` indicator column per label.
    """
    X["major_discipline"] = X["major_discipline"].fillna("Unknown_discipline")
    indicator_cols = pd.get_dummies(X["major_discipline"]).add_prefix("major_discipl_")
    return pd.concat([X, indicator_cols], axis=1).drop(columns=["major_discipline"])

    
def encodeExperience(X):
    """Encode ``experience`` in place as a float number of years.

    ">20" becomes 21.0, "<1" becomes 0.0 and every other label is parsed as a number.
    Missing values, and labels that cannot be parsed, become -1.0 so that "unknown"
    remains a distinct level. Returns None.
    """
    raw = X["experience"]
    # errors="coerce" makes the "not a number" case explicit (NaN) instead of relying
    # on a bare `except:` that swallowed every exception.
    years = pd.to_numeric(raw, errors="coerce").astype(float)
    years = years.mask(raw.isin([">20"]), 21.)
    years = years.mask(raw.isin(["<1"]), 0.)
    # Assign the filled column back to the frame: fillna(inplace=True) on X["experience"]
    # is a chained in-place update, which does not reach X under pandas Copy-on-Write.
    X["experience"] = years.fillna(-1.)

def encodeCompany_size(X):
    """Encode ``company_size`` in place as an ordinal float code.

    Missing values are treated as "Unknown" (code 0.0); the size bands map to 1.0 - 8.0
    in increasing order. Labels outside the mapping become NaN. Returns None.
    """
    size_codes = {"Unknown": 0., "<10": 1., "10/49":2., "50-99": 3., "100-500":4. , "500-999":5., "1000-4999":6., "5000-9999": 7., "10000+": 8.}
    # Fill and map in one assignment: fillna(inplace=True) on X["company_size"] is a chained
    # in-place update, which does not reach X under pandas Copy-on-Write and would leave NaN codes.
    X["company_size"] = X["company_size"].fillna("Unknown").map(size_codes)

def encodeCompany_type(X):
    """One-hot encode the ``company_type`` column.

    Missing values are first labelled "Unknown_company_type"; this fill is written to the
    frame that was passed in. Returns a new frame in which ``company_type`` is replaced
    by one ``company_type_<label>`` indicator column per label.
    """
    # Assign the filled column back: fillna(inplace=True) on X["company_type"] is a chained
    # in-place update, which does not reach X under pandas Copy-on-Write. The missing rows
    # would then get no indicator column at all.
    X["company_type"] = X["company_type"].fillna("Unknown_company_type")
    company_type_dummies = pd.get_dummies(X["company_type"]).add_prefix("company_type_")
    return pd.concat([X, company_type_dummies], axis=1).drop(columns=["company_type"])
    
def encodeLast_new_job(X):
    """Encode ``last_new_job`` in place as a float code.

    Missing values are treated as "never" (0.0); "1".."4" map to 1.0 - 4.0 and ">4" to 5.0.
    Labels outside the mapping become NaN. Returns None.
    """
    job_gap_codes = {"4":4., "3":3., "2":2., "1":1., ">4":5., "never":0.}
    # Fill and map in one assignment: fillna(inplace=True) on X["last_new_job"] is a chained
    # in-place update, which does not reach X under pandas Copy-on-Write and would leave NaN codes.
    X["last_new_job"] = X["last_new_job"].fillna("never").map(job_gap_codes)

def encodeCity(X):
    """Replace each ``city`` label, in place, by the integer after its first underscore.

    Returns None.
    """
    def _code_after_underscore(label_parts):
        return int(label_parts[1])

    X['city'] = list(map(_code_after_underscore, X["city"].str.split("_")))

def MyPipiline_encoder(X):
    """Apply every column encoder to a raw frame and split off the identifier.

    The encoders run in the same order as before; the one-hot encoders return a new
    frame, the others update the frame they are given.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw frame containing ``enrollee_id`` and the columns handled by the encoders.

    Returns
    -------
    tuple
        ``(features, enrollee_id)``: the encoded frame without ``enrollee_id``, and the
        ``enrollee_id`` column.
    """
    # Work on a copy so the caller's frame is left untouched (the first encoder would
    # otherwise overwrite its gender column).
    X = X.copy()
    X = encodeGendere(X)
    encodeRelevent_Experience(X)
    encodeEnrolled_university(X)
    encodeEducation_level(X)
    X = encodeMajor_discipline(X)
    encodeExperience(X)
    encodeCompany_size(X)
    X = encodeCompany_type(X)
    encodeLast_new_job(X)
    encodeCity(X)
    return X.drop(['enrollee_id'], axis=1), X.enrollee_id

# Predict on the test set:

In [None]:
# Load the test split (path is relative to the notebook's working directory)
# and preview its first rows.
test_csv_path = "../input/hr-analytics-job-change-of-data-scientists/aug_test.csv"
df_test = pd.read_csv(test_csv_path)
df_test.head()

## Encoding:

In [None]:
# Run the encoder pipeline on the test frame; it returns the encoded features and the
# enrollee_id column separately.
X_test, enrollee_id = MyPipiline_encoder(df_test)

In [None]:
# Preview the encoded test features.
X_test.head()

In [None]:
# Preview the identifiers that were split off from the test features.
enrollee_id.head()

## Predict:

In [None]:
# Predict a label for every test row with the selected tree and pair it with its enrollee_id.
test_predictions = model_bestF1.predict(X_test)
submission_columns = {"enrollee_id": enrollee_id, "target": test_predictions}
submission_df = pd.DataFrame(submission_columns)
submission_df.head()

In [None]:
# Write the predictions to submission.csv without the index column.
# NOTE(review): the file is ';'-separated rather than comma-separated; confirm that
# whatever consumes it expects that delimiter.
submission_df.to_csv("submission.csv", index=False, sep=";")