# Why Do Employees Leave? An Attrition Analysis

## This analysis consists of two parts.
1. Explore the data distribution.
2. Predict Attrition using machine learning methods.

In [None]:
# Import basic packages
import pandas as pd
import numpy as np
pd.options.display.max_columns = None

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
# Output plots in notebook
% matplotlib inline
% config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings("ignore")

# Print the contents of ../input (via the external `ls` command) so the
# available data files are visible in the notebook output.
from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

# 1. Explore data distribution

In [None]:
# Load the HR attrition dataset and preview the first rows.
Data = pd.read_csv("../input/WA_Fn-UseC_-HR-Employee-Attrition.csv")
Data.head()

#### Target  is "Attrition".

In [None]:
# Count how many rows fall into each Attrition class.
Data["Attrition"].value_counts()

In [None]:
# Bar chart of the class balance of the target.
sns.set(style="whitegrid", font_scale=1.3)
sns.countplot(x="Attrition", data=Data, palette="hls")
# seaborn no longer re-exports pyplot as `sns.plt`; use the `plt` alias
# imported at the top of the notebook instead.
plt.title("Attrition Counts")

#### Roughly 200 of about 1,400 employees (around one in seven) left the company.

### First of all, check the distribution of data.

In [None]:
# Show column dtypes and non-null counts.
Data.info()

#### As there are numeric type attributes and character string attributes, we should separate them.

In [None]:
# Split the column names into numeric and non-numeric (categorical) groups.
# select_dtypes is the public API; "bool" is included so the selection matches
# what the private DataFrame._get_numeric_data() helper used to return.
cols = Data.columns
num_cols = Data.select_dtypes(include=["number", "bool"]).columns
cat_cols = cols.drop(num_cols.tolist())

### Numeric type attributes

In [None]:
# List the numeric column names.
print(num_cols)

In [None]:
# Summary statistics (count, mean, std, quantiles) for the numeric columns.
Data[num_cols].describe()

#### EmployeeCount, EmployeeNumber and StandardHours don't make sense. So we delete them.

In [None]:
# Remove the three columns judged uninformative in the note above (in place).
Data.drop(["EmployeeCount", "EmployeeNumber", "StandardHours"], axis=1, inplace=True)

### Character string attributes

#### check each category values

In [None]:
# Show the frequency of every distinct value for each categorical column.
for cat_col in cat_cols:
    display(Data[cat_col].value_counts())

* "Attrition", "Gender" and "OverTime" are binary variables. We convert them into 0 or 1 numerical data by using dummy variables.
* "MaritalStatus" is three value variable. We classify it as "Married" and others, and convert it into 0 or 1 numerical data.
* "BusinessTravel" represent the frequency of three stages. We convert it into numerical ordinal data.
* "Department" and "JobRole" are similar variables. We unify them to "Department_JobRole" variable.
* The value of "Over18" is all "Y". As it doesn't make sense, we delete it.

In [None]:
# Business_Travel: encode travel frequency as an ordinal 0 / 1 / 2 scale.
Data["Business_Travel"] = Data["BusinessTravel"].map({"Non-Travel":0, "Travel_Rarely":1, "Travel_Frequently":2})

# Department_JobRole: combine the two related columns into one label.
Data["Department_JobRole"] = Data["Department"] + " : " + Data["JobRole"]
# Binary indicators: "Married" vs. every other marital status, plus one dummy
# per binary column (drop_first=True keeps a single column for each of them).
Data["MaritalStatus_Married"] = pd.get_dummies(Data["MaritalStatus"])["Married"]
Data = pd.concat([Data, pd.get_dummies(Data[["Gender", "OverTime", "Attrition"]], drop_first=True)], axis=1)

# Drop the source columns that were just re-encoded, and the constant "Over18".
Data.drop(["BusinessTravel", "Department", "JobRole", "MaritalStatus", "Gender", "OverTime", "Attrition", "Over18"], axis=1, inplace=True)

#### Again, classify each attribute.

In [None]:
# Preview the frame after re-encoding.
Data.head()

In [None]:
# Recompute the numeric / categorical split after the re-encoding above.
# select_dtypes is the public API; "bool" is included because get_dummies may
# produce boolean indicator columns, which the private
# DataFrame._get_numeric_data() helper also treated as numeric.
cols = Data.columns
num_cols = Data.select_dtypes(include=["number", "bool"]).columns
cat_cols = cols.drop(num_cols.tolist())

In [None]:
# Show which columns ended up in each group.
print("Numeric data\n", num_cols)
print("Categorical data\n", cat_cols)

### Data conversion has been completed. We observe the correlation coefficient of each numerical data.

#### Checking the correlation coefficient, we can find what kind of variables are related deeply.

In [None]:
# Lower-triangle heatmap of pairwise correlations between all numeric columns.
sns.set(style="whitegrid", font_scale=0.8)
plt.figure(figsize=(13,13))
# Correlate numeric/boolean columns only: Data still contains string columns
# (EducationField, Department_JobRole), and recent pandas raises on those
# instead of silently skipping them.
corr = Data.select_dtypes(include=["number", "bool"]).corr().round(2)
# The np.bool alias was removed from NumPy; the builtin bool is equivalent.
mask = np.zeros_like(corr, dtype=bool)
# Hide the upper triangle (and diagonal), which only mirrors the lower one.
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr, annot=True, cmap="RdBu", mask=mask)
plt.title("Correlation between features", fontdict={"fontsize":20})

### As there are many variables, we extract and visualize only strong correlation coefficient attributes.

In [None]:
# Same heatmap restricted to the hand-picked, strongly correlated columns.
extract_cols  = ["Age", "JobLevel", "MonthlyIncome", "PercentSalaryHike", "PerformanceRating", "TotalWorkingYears", "YearsAtCompany", "YearsInCurrentRole","YearsSinceLastPromotion", "OverTime_Yes", "Attrition_Yes"]
sns.set(style="whitegrid", font_scale=1.2)
plt.figure(figsize=(10,7))
corr = Data[extract_cols].corr().round(2)
# The np.bool alias was removed from NumPy; the builtin bool is equivalent.
mask = np.zeros_like(corr, dtype=bool)
# Hide the upper triangle (and diagonal), which only mirrors the lower one.
mask[np.triu_indices_from(mask)] = True
# Fix the colour scale to the full [-1, 1] correlation range.
sns.heatmap(corr, annot=True, cmap="RdBu", vmin=-1, vmax=1, mask=mask)
plt.title("Correlation between features / important features", fontdict={"fontsize":20})

### Employees whose Age, JobLevel, MonthlyIncome, TotalWorkingYears, YearsAtCompany and YearsInCurrentRole are low tend to leave.
* Young and low income employee
* As "OverTime_Yes" is positive correlation, busy employees tend to cause Attrition.

### Long-time employee has high JobLevel and MonthlyIncome.
* On the other hand, "PerformanceRating" is not related with JobLevel and MonthlyIncome.
* Although employees should be evaluated on the content of their work, MonthlyIncome is correlated with working years rather than with PerformanceRating.
* So young employees at this company tend to cause Attrition.

### Next we observe the difference in data distribution between employees who left (Attrition_Yes = 1) and those who stayed.

#### Standardize the data so features can be compared easily.

In [None]:
# Work on a copy so the original frame keeps its raw values.
Data_copy = Data.copy()
# Every column except the two string columns and the target gets z-scored.
scale_cols = Data_copy.columns.drop(["Department_JobRole", "EducationField", "Attrition_Yes"])
to_scale = Data_copy[scale_cols]
Data_copy[scale_cols] = (to_scale - to_scale.mean()) / to_scale.std()

#### check how variables are dispersed by Attrition_Yes

In [None]:
# Mean of each standardized feature per Attrition_Yes group, transposed so
# that features are rows and groups are columns.
# numeric_only=True skips the string columns (EducationField,
# Department_JobRole); recent pandas raises on them instead of dropping them.
Att = Data_copy.groupby(["Attrition_Yes"], as_index=False).mean(numeric_only=True).transpose()
Att.head()

#### make pandas frame to plot using seaborn library

In [None]:
# Reshape Att (features as rows, one column per Attrition_Yes group) into a
# long frame with columns mean / feature / kind for plotting.
# One single-column frame per group; columns of Att are addressed by position label.
Att_sep = [Att[[x]] for x in range(len(Att.columns))]
Att_plot = pd.DataFrame([], columns=["mean", "feature","kind"])
for (i,data) in enumerate(Att_sep):
    # Keep the feature name as a regular column.
    data["feature"] = data.index
    # Label the group using the value stored in the "Attrition_Yes" row.
    data["kind"] = "Attrition_Yes_" + data.loc["Attrition_Yes"].astype(str).values[0]
    data.rename(columns={i:"mean"}, inplace=True)
    # The "Attrition_Yes" row only carried the group label; it is not a feature.
    data.drop(["Attrition_Yes"], axis=0, inplace=True)
    Att_plot = pd.concat([Att_plot, data], axis=0)

#### Group the features into clusters so they can be compared easily

In [None]:
# Feature clusters: personal information, job information, evaluation/pay,
# and working history.
features_1 = [
    "Age", "Gender_Male", "MaritalStatus_Married",
    "Education", "DistanceFromHome", "NumCompaniesWorked",
]
features_2 = [
    "JobInvolvement", "JobLevel", "JobSatisfaction", "EnvironmentSatisfaction",
    "RelationshipSatisfaction", "WorkLifeBalance", "Business_Travel", "OverTime_Yes",
]
features_3 = [
    "HourlyRate", "DailyRate", "MonthlyRate", "MonthlyIncome",
    "PercentSalaryHike", "StockOptionLevel", "PerformanceRating",
]
features_4 = [
    "TotalWorkingYears", "YearsAtCompany", "YearsInCurrentRole",
    "YearsWithCurrManager", "YearsSinceLastPromotion", "TrainingTimesLastYear",
]
features = [features_1, features_2, features_3, features_4]
# Reorder the rows of the plotting frame to follow the cluster order.
ordered_features = [name for group in features for name in group]
Att_plot = Att_plot.loc[ordered_features]

#### plot function

In [None]:
def feature_plot(input_data, title, palette="hls", size=4, aspect=3, rotation=0, ylim=None):
    """Draw a point plot of per-feature means, one line per ``kind``.

    Parameters
    ----------
    input_data : DataFrame with ``feature``, ``mean`` and ``kind`` columns.
    title : text placed above the plot.
    palette : seaborn palette name or list of colours for the ``kind`` hue.
    size : height of the facet (passed as ``height`` to newer seaborn).
    aspect : width / height ratio of the facet.
    rotation : rotation applied to the x tick labels, in degrees.
    ylim : optional ``(low, high)`` y-axis limits; ``None`` leaves them as is.
    """
    plot_kwargs = dict(x="feature", y="mean", hue="kind", data=input_data,
                       palette=palette, aspect=aspect)
    if hasattr(sns, "catplot"):
        # factorplot() was renamed catplot() and `size` became `height`;
        # catplot defaults to a strip plot, so request the point plot
        # explicitly to keep the original chart type.
        ax = sns.catplot(kind="point", height=size, **plot_kwargs)
    else:
        # Older seaborn: keep the original call.
        ax = sns.factorplot(size=size, **plot_kwargs)
    ax.set(xlabel="", ylim=ylim)
    ax.set_xticklabels(rotation=rotation)
    plt.title(title, fontdict={"fontsize":17})
    sns.despine(left=True, bottom=True)

#### plot graph

In [None]:
# Plot every feature; the two-colour palette is reversed with [::-1].
sns.set(style="whitegrid", font_scale=1.2)
feature_plot(input_data=Att_plot, palette=sns.color_palette("hls",2)[::-1], title="mean distribution / features", size=6, aspect=2, rotation=90, ylim=(-1,1))

### extract high correlation feature with Attrition_Yes

In [None]:
# Features that differ most between the two Attrition_Yes groups.
# The manager-tenure column is "YearsWithCurrManager" (as in features_4);
# the previous "YearsInCurrManager" label does not exist, so .loc raised KeyError.
important_cols = ["Age", "JobInvolvement", "JobLevel", "MonthlyIncome", "StockOptionLevel", "TotalWorkingYears", "YearsAtCompany", "YearsWithCurrManager", "Business_Travel", "OverTime_Yes"]
feature_plot(input_data=Att_plot.loc[important_cols], palette=sns.color_palette("hls",2)[::-1], title="mean distribution / important features", size=4, aspect=3, rotation=90, ylim=(-1,1))

#### The most different feature is "OverTime_Yes"
* High value of "Business_Travel" and "OverTime_Yes" cause Attrition.

#### Employees who get attrition are young and not long-time employees.
* "JobLevel", "JobInvolvment" and "JobSatisfaction" are low.
* They are not matured to job. 

#### Monthly income of employee who get attrition are low.
* "MonthlyIncome" and "StockOptionLevel" are low.

### Next we observe distribution by separating categorical variables.

In [None]:
# Remaining categorical (string) columns to analyse one by one.
print(cat_cols)

# Education Field

In [None]:
# Number of employees per education field.
Data_copy["EducationField"].value_counts()

In [None]:
# Shorten the long labels so they fit on plot axes and legends.
# Assign the result back instead of calling replace(inplace=True) on the
# selected column: with pandas copy-on-write the in-place call acts on a
# temporary object and leaves Data_copy unchanged.
Data_copy["EducationField"] = Data_copy["EducationField"].replace({"Life Sciences":"LifeSc", "Technical Degree":"Technical", "Human Resources":"HR"})

#### First, check the proportion of Attrition in each EducationField

In [None]:
# Bar height is the mean of the 0/1 target, i.e. the attrition proportion.
ax = sns.barplot(x="EducationField", y="Attrition_Yes", data=Data_copy, palette="hls")
ax.set_ylabel("Propotion")
# seaborn no longer re-exports pyplot as `sns.plt`; use the `plt` alias
# imported at the top of the notebook instead.
plt.title("Propotion of Attrition_Yes / EducationField")

#### The employee who study HR, Technical Degree and Marketing tend to get Attrition.
* HR is high standard deviation as there are only 27 employees.

#### The employees of Life Sciences and Medical are large in the whole. But they don't tend to get Attrition.

### Check the mean distribution of each EducationField.

In [None]:
# Mean of each standardized feature per (EducationField, Attrition_Yes) pair.
# numeric_only=True skips the remaining string column (Department_JobRole);
# recent pandas raises on it instead of dropping it.
Education_Att = Data_copy.groupby(["EducationField", "Attrition_Yes"], as_index=False).mean(numeric_only=True)
# Keep only the employees who left, then put features on the rows.
Education_Att = Education_Att[Education_Att["Attrition_Yes"] == 1].transpose()
Education_Att.head()

In [None]:
# Reshape Education_Att (features as rows, one column per education field)
# into a long frame with columns mean / feature / kind for plotting.
Education_sep = [Education_Att[[x]] for x in Education_Att.columns]
Education_plot = pd.DataFrame([], columns=["mean", "feature", "kind"])
for (col,data) in zip(Education_Att.columns, Education_sep):
    # Keep the feature name as a regular column.
    data["feature"] = data.index
    # Label = "<EducationField>_<Attrition_Yes value>", read from the two label rows.
    data["kind"] = data.loc["EducationField"].values[0] + "_" + data.loc["Attrition_Yes"].astype(str).values[0]
    data.rename(columns={col:"mean"}, inplace=True)
    # The two label rows are not features; remove them before stacking.
    data.drop(["EducationField", "Attrition_Yes"], axis=0, inplace=True)
    Education_plot = pd.concat([Education_plot, data], axis=0)
# One sub-frame per feature cluster defined earlier in `features`.
Education_plots = [Education_plot.loc[x] for x in features]

In [None]:
# One plot per feature cluster; titles are matched to clusters by position.
graph_titles = ["Personal Information / EducationField", "Job Information / EducationField", "Evaluation / EducationField", "Working History / EducationField"]
for (i,title) in enumerate(graph_titles):
    feature_plot(input_data=Education_plots[i], title=title, rotation=15, ylim=(-1,1.2))

In [None]:
# The manager-tenure column is "YearsWithCurrManager" (as in features_4);
# the previous "YearsInCurrManager" label does not exist, so .loc raised KeyError.
important_cols = ["Age", "JobInvolvement", "JobLevel", "MonthlyIncome", "StockOptionLevel", "TotalWorkingYears", "YearsAtCompany", "YearsWithCurrManager", "Business_Travel", "OverTime_Yes"]
feature_plot(input_data=Education_plot.loc[important_cols], title="Important features / EducationField", size=4, aspect=3, rotation=15, ylim=(-1,1.2))

#### HR
* Working years variable is lower than other attributes.
* As there is High correlation between Working years and Income, their MonthlyIncome is low too.
* In addition, their Business_Travel is high. So they tend to get Attrition.

#### Technical Degree
* There are no notable attributes. But the mean value of important attributes are low on the whole.

#### Marketing
* JobLevel and MonthlyIncome have high means, but OverTime_Yes is higher than in the other EducationFields.

#### LifeScience, Medical
* Their mean value of important attributes are a little high on the whole.

# Department_JobRole

In [None]:
# Number of employees per department / job-role combination.
Data_copy["Department_JobRole"].value_counts()

In [None]:
# Shorten the long labels so they fit on plot axes and legends.
# Assign the result back instead of calling replace(inplace=True) on the
# selected column: with pandas copy-on-write the in-place call acts on a
# temporary object and leaves Data_copy unchanged.
Data_copy["Department_JobRole"] = Data_copy["Department_JobRole"].replace({
    "Sales : Sales Executive":"Sales : Executive",
    "Sales : Sales Representative":"Sales : Representative",
    "Sales : Manager":"Sales : Manager",
    "Research & Development : Research Scientist":"R&D : RS",
    "Research & Development : Laboratory Technician":"R&D : Lab",
    "Research & Development : Manufacturing Director":"R&D : MD",
    "Research & Development : Healthcare Representative":"R&D : Health",
    "Research & Development : Research Director":"R&D : RD",
    "Research & Development : Manager":"R&D : Manager",
    "Human Resources : Human Resources":"HR : HR",
    "Human Resources : Manager":"HR : Manager",
})

#### First, check the proportion of Attrition in each JobRole

In [None]:
# Bar height is the mean of the 0/1 target, i.e. the attrition proportion.
sns.set(style="whitegrid", font_scale=1.1)
bar_kwargs = dict(x="Department_JobRole", y="Attrition_Yes", kind="bar", data=Data_copy, aspect=3, palette="hls")
if hasattr(sns, "catplot"):
    # factorplot() was renamed catplot() and `size` became `height`.
    ax = sns.catplot(height=4, **bar_kwargs)
else:
    # Older seaborn: keep the original call.
    ax = sns.factorplot(size=4, **bar_kwargs)
ax.set(xlabel="", ylabel="Propotion")
ax.set_xticklabels(rotation=15)
plt.title("Propotion of Attrition_Yes / Job Role", fontdict={"fontsize":16})

#### Sales
* The proportion for representatives is very high. The proportion increases in the order manager, executive, representative.

#### R&D
* Lab is highest, and RS follow. Other jobroles are less likely to cause Attrition.

#### HR
* All 11 Manager don't get attrition. Propotion of other 52 HR is high.

#### Management posts are unlikely to show Attrition. Posts typically held by young employees are more likely to show it.

In [None]:
# Mean of each standardized feature per (Department_JobRole, Attrition_Yes) pair.
# numeric_only=True skips the remaining string column (EducationField);
# recent pandas raises on it instead of dropping it.
JobRole_Att = Data_copy.groupby(["Department_JobRole", "Attrition_Yes"], as_index=False).mean(numeric_only=True)
# Keep only the employees who left, then put features on the rows.
JobRole_Att = JobRole_Att[JobRole_Att["Attrition_Yes"] == 1].transpose()
JobRole_Att.head()

In [None]:
# Reshape JobRole_Att (features as rows, one column per job role) into a
# long frame with columns mean / feature / kind for plotting.
JobRole_sep = [JobRole_Att[[x]] for x in JobRole_Att.columns]
JobRole_plot = pd.DataFrame([], columns=["mean", "feature", "kind"])
for (col,data) in zip(JobRole_Att.columns, JobRole_sep):
    # Keep the feature name as a regular column.
    data["feature"] = data.index
    # Label = "<Department_JobRole>_<Attrition_Yes value>", read from the two label rows.
    data["kind"] = data.loc["Department_JobRole"].values[0] + "_" + data.loc["Attrition_Yes"].astype(str).values[0]
    data.rename(columns={col:"mean"}, inplace=True)
    # The two label rows are not features; remove them before stacking.
    data.drop(["Department_JobRole", "Attrition_Yes"], axis=0, inplace=True)
    JobRole_plot = pd.concat([JobRole_plot, data], axis=0)
# One sub-frame per feature cluster defined earlier in `features`.
JobRole_plots = [JobRole_plot.loc[x] for x in features]

In [None]:
# Build a 10-colour palette: 1 red, 6 hand-picked blues, 3 greens.
# NOTE(review): presumably red = HR, blues = R&D, greens = Sales, in the order
# the job roles appear as hue levels -- confirm against the plot legend.
HR_color = sns.color_palette("Reds",1)
RD_color = sns.color_palette("Blues", 10)
# Pick six shades from the ten-step blue ramp, deliberately out of order.
RD_color_sorted = [RD_color[1], RD_color[8], RD_color[2], RD_color[5], RD_color[4], RD_color[9]]
Sales_color = sns.color_palette("Greens", 3)
job_color = HR_color + RD_color_sorted + Sales_color
# Preview the palette as a strip of swatches.
sns.palplot(job_color)

In [None]:
# One plot per feature cluster; titles are matched to clusters by position.
graph_titles = ["Personal Information / JobRole", "Job Information / JobRole", "Evaluation / JobRole", "Working History / JobRole"]
for (i,title) in enumerate(graph_titles):
    feature_plot(input_data=JobRole_plots[i], palette=job_color, title=title, size=4.5, aspect=2.5,rotation=15)

In [None]:
# The manager-tenure column is "YearsWithCurrManager" (as in features_4);
# the previous "YearsInCurrManager" label does not exist, so .loc raised KeyError.
important_cols = ["Age", "JobInvolvement", "JobLevel", "MonthlyIncome", "StockOptionLevel", "TotalWorkingYears", "YearsAtCompany", "YearsWithCurrManager", "Business_Travel", "OverTime_Yes"]
# This plot is broken down by job role, so the title says JobRole (it was
# mislabelled as EducationField).
feature_plot(input_data=JobRole_plot.loc[important_cols], palette=job_color, title="important features / JobRole", size=4.5, aspect=2.5, rotation=15)

#### Sales
* They are young in order of representation, executive, manager.
* Representative's JobLevel and MonthlyIncome are low.
* Manager's JobLevel is very high. But JobInvovlement is very low.
* Managers work at this company very long.

#### R&D
* As Research Director and Manager are managing post, their age and their skill to job are high.
* Laboratory's and Research Scientist's JobLevel and MonthlyIncome are low. Also their working years are low too.

#### HR
* Their working years and JobLevel and MonthlyIncome are low.

### The difference of JobRole is bigger than EducationField.
* Looking at JobLevel and MonthlyIncome, Sales Representatives and Research Scientists, who are likely to leave, have values about 1.0 point below the mean, whereas Sales Managers, Research Directors and Research Managers have values about 1.0 point above the mean.
* There are also job roles in which many young employees work.
* On the other hand, PerformanceRating does not differ between job roles, so young employees may be carrying a heavy burden.

# Summary
* The employees who are likely to leave are young.
* Department_JobRole is more strongly related to Attrition than EducationField.
* Management posts are unlikely to show Attrition; lower-level positions are likely to show it.
* OverTime and BusinessTravel are important attributes associated with Attrition.
* A likely explanation is that young employees have not yet matured in their jobs.

# 2. Predict Attrition using machine learning methods.

#### First, we convert "EducationField" and "Department_JobRole" into dummy variables.

In [None]:
from sklearn.cross_validation import cross_val_predict, KFold
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

In [None]:
Data.head()

In [None]:
Data = pd.concat([Data, pd.get_dummies(Data[["EducationField", "Department_JobRole"]])], axis=1)
Data.drop(["EducationField", "Department_JobRole"], axis=1, inplace=True)
Data.head()

In [None]:
cols = Data.columns.drop("Attrition_Yes")
features = Data[cols]
target = Data[["Attrition_Yes"]]

In [None]:
kf = KFold(features.shape[0], random_state=1, n_folds=10)

In [None]:
# Flatten the single-column target frame to a 1-D array.
target = target.values.ravel()
# Hold out 10% of the rows; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.1, random_state=1)

# We predict by LogisticRegression, RandomForest and SVM.

In [None]:
# define score function
def print_clf_score(input_predictions):
    """Report how well ``input_predictions`` match ``Data["Attrition_Yes"]``.

    ``input_predictions`` must hold one predicted 0/1 label per row of the
    module-level ``Data`` frame, in the same row order. The confusion matrix
    is displayed as a labelled table, followed by the classification report,
    accuracy, F1 score and ROC AUC. Nothing is returned.
    """
    y_true = Data["Attrition_Yes"]
    matrix = pd.DataFrame(confusion_matrix(y_true, input_predictions), index=["true_0", "true_1"], columns=["pred_0","pred_1"])
    # A bare expression inside a function is discarded, so the table was never
    # shown; render it explicitly.
    display(matrix)
    print(classification_report(y_true, input_predictions))
    print("accuracy: ", accuracy_score(y_true, input_predictions))
    print("f1_score: ", f1_score(y_true, input_predictions))
    print("roc_auc: ", roc_auc_score(y_true, input_predictions))

### LogisticRegression

In [None]:
# Out-of-fold predictions from a default LogisticRegression over the kf folds,
# scored against the full target.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
predictions = cross_val_predict(lr, features, target, cv=kf)
print_clf_score(pd.Series(predictions))

In [None]:
# Fit on the training split so the coefficients can be inspected below.
lr.fit(X_train, y_train)

In [None]:
# One row per input column holding its fitted coefficient.
lr_coef = pd.DataFrame(lr.coef_, columns=X_train.columns, index=["feature"]).transpose()
# Rank by absolute coefficient size. The ordered index is passed to reindex
# directly; wrapping it in a list hands pandas a nested sequence rather than
# a flat list of labels.
coef_order = lr_coef["feature"].abs().sort_values(ascending=False).index
lr_coef.reindex(coef_order).head(10)

### RandomForest

In [None]:
# Out-of-fold predictions from a 1000-tree random forest over the kf folds,
# scored against the full target.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000, random_state=1, n_jobs=5)
predictions1 = cross_val_predict(rf, features, target, cv=kf)
print_clf_score(pd.Series(predictions1))

In [None]:
# Fit on the training split so the feature importances can be inspected below.
rf.fit(X_train, y_train)

In [None]:
# One row per input column holding its importance in the fitted forest.
rf_imp = pd.DataFrame(rf.feature_importances_, index=X_train.columns, columns=["feature"])
# Show the ten largest importances. Sorting the frame directly replaces
# reindex([index]), which wrapped the ordered index in a list and so passed
# pandas a nested sequence rather than a flat list of labels.
rf_imp.sort_values("feature", ascending=False).head(10)

### SVM RBF

#### First, standardize the data.

In [None]:
# Copy the inputs so scaling does not alter `features`.
features_copy = features.copy()

In [None]:
# z-score the first 24 columns only.
# NOTE(review): presumably these are the non-dummy columns -- confirm the
# column order before relying on the hard-coded 24.
ordinal_cols = features_copy.columns[:24]
ordinal_part = features_copy[ordinal_cols]
features_copy[ordinal_cols] = (ordinal_part - ordinal_part.mean()) / ordinal_part.std()

In [None]:
# Same 90/10 split and seed as before, applied to the standardized inputs.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(features_copy, target, test_size=0.1, random_state=1)

#### gridsearch

In [None]:
# Search grid for the RBF SVM: powers of two, 2^-5..2^15 for C and
# 2^-15..2^3 for gamma.
tuned_parameters = {
    "C": [2 ** exponent for exponent in range(-5, 16)],
    "gamma": [2 ** exponent for exponent in range(-15, 4)],
    "kernel": ["rbf"],
}

In [None]:
from sklearn import svm
# sklearn.grid_search was removed from scikit-learn; GridSearchCV lives in
# sklearn.model_selection, which this notebook already imports from.
from sklearn.model_selection import GridSearchCV
svc = svm.SVC()
# Exhaustive search over tuned_parameters with 10-fold CV, selecting by F1.
model = GridSearchCV(svc, tuned_parameters, cv=10, scoring="f1", n_jobs=8)
model.fit(X_train_c, y_train_c)
print("model.best_score_", model.best_score_)
print("model.best_params_", model.best_params_)

#### 10-fold cross validation using best parameters.

In [None]:
# Out-of-fold predictions from an RBF SVM on the standardized inputs.
# NOTE(review): C and gamma are hard-coded -- presumably the best_params_
# printed by the grid search above; confirm they still match after a re-run.
svc = svm.SVC(kernel="rbf", C=256, gamma=pow(2,-11))
predictions2 = cross_val_predict(svc, features_copy, target, cv=kf)
print_clf_score(pd.Series(predictions2))

# Conclusion

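We can predict which employees will leave with about 89% accuracy. Note that because only about one in seven employees leaves (roughly 200 of 1,400, as counted above), always predicting "no attrition" would already score around 86%, so the F1 score and ROC AUC are more informative than accuracy alone.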