# IBM Employee Attrition Analysis by Category

----------
## Set Up Dataset

In [None]:
from pandas import read_csv
# Load the HR attrition CSV into a DataFrame.
# NOTE(review): the relative "../input" path presumably matches the Kaggle
# kernel layout — adjust when running elsewhere.
data = read_csv("../input/WA_Fn-UseC_-HR-Employee-Attrition.csv")

In [None]:
# Name of the outcome column; every feature below is compared against it.
target = "Attrition"

In [None]:
# Group every non-target column name under the string form of its dtype,
# e.g. {"object": [...], "int64": [...]}.
feature_by_dtype = {}
for c in data.columns:
    if c != target:
        data_type = str(data[c].dtype)
        # setdefault creates the list the first time a dtype is seen.
        feature_by_dtype.setdefault(data_type, []).append(c)

In [None]:
# Names of the columns whose dtype string is "object".
objects = feature_by_dtype["object"]

In [None]:
# Columns to leave out of the analysis (the list is extended further below).
remove = ["Over18"]

In [None]:
# Object-typed columns that are not on the removal list.
categorical_features = [f for f in objects if f not in remove]

In [None]:
# Names of the columns whose dtype string is "int64".
int64s = feature_by_dtype["int64"]

In [None]:
# Add two more columns to the exclusion list before selecting count features.
remove.extend(["StandardHours", "EmployeeCount"])

In [None]:
# Integer columns with fewer than 10 distinct values are treated as discrete
# "count" features, unless they are on the removal list. Built directly as a
# comprehension so no loop variable leaks into the notebook namespace.
count_features = [
    column
    for column in int64s
    if len(data[column].unique()) < 10 and column not in remove
]

In [None]:
#count_features = count_features + ["TotalWorkingYears", "YearsAtCompany", "HourlyRate"]

In [None]:
# Preview the first rows of the selected count features.
data[count_features].head()

In [None]:
# Preview the first rows of the selected categorical features.
data[categorical_features].head()

----------


# Chi-square 2 Way 

In [None]:
from scipy.stats import chi2_contingency
from pandas import crosstab, DataFrame

# Run a chi-square test of independence between each categorical/count
# feature and the target, collecting one p-value per feature.
tested_features = categorical_features + count_features
p_value_table = DataFrame(index=[target], columns=tested_features)

for c in tested_features:

    crosstable = crosstab(data[c], data[target])
    chi2, p, dof, expected = chi2_contingency(crosstable)
    # Write through a single .loc call: the chained form `table[c][target] = p`
    # assigns into a temporary under pandas copy-on-write and leaves the
    # table unfilled.
    p_value_table.loc[target, c] = p

# After transposing, features are the rows and the single column is named
# after the target.
p_value_table = p_value_table.T
# The table was created without a dtype, so cast the p-values to float for the
# comparison; the stored p-value column itself is left as it was.
p_value_table["p < 0.05"] = p_value_table[target].astype(float) < 0.05

In [None]:
# Show the table with the largest p-values first. The p-value column is named
# after `target`, so refer to it through that variable rather than repeating
# the literal column name.
p_value_table.sort_values(target, ascending=False)

Attrition's Non-Significant Categorical Relationships

In [None]:
# Features whose p-value did not fall below 0.05.
not_significant_mask = p_value_table["p < 0.05"].eq(False)
ns = p_value_table.loc[not_significant_mask].index.tolist()
print(ns)

Attrition's Statistically Significant Categorical Relationships

In [None]:
# Features whose p-value fell below 0.05.
significant_mask = p_value_table["p < 0.05"].eq(True)
significant = p_value_table.loc[significant_mask].index.tolist()
print(significant)

----------
# Count Plots

In [None]:
def percentages(data, category, filter_):
    """Return each subclass's share of `category` among the filtered rows.

    Parameters
    ----------
    data : DataFrame containing the `category` column.
    category : label of the column whose values are tallied.
    filter_ : row selector passed to ``data[...]`` (e.g. a boolean mask).

    Returns
    -------
    dict mapping each non-missing value of `category` to its percentage
    (0-100) of the non-missing filtered rows, keyed in order of first
    appearance. Empty when no rows are selected.
    """
    # Filter and count once instead of on every loop iteration.
    selected = data[filter_][category]
    counts = selected.value_counts()
    total_count = counts.sum()
    # value_counts() drops missing values, so drop them here as well;
    # otherwise a NaN from unique() would raise KeyError on lookup.
    return {
        subclass: (counts.loc[subclass] / total_count) * 100
        for subclass in selected.dropna().unique()
    }

from IPython.display import display
from pandas import DataFrame, options

def display_percentages(data, category, filter_):
    """Print a header and display the percentage table for `category`.

    Parameters
    ----------
    data : DataFrame containing the `category` column.
    category : label of the column to summarise.
    filter_ : row selector passed to ``data[...]`` (e.g. a boolean mask).

    Displays a table with one row per subclass, sorted by descending
    "Percent", plus a running "Cumulative Percent" column. Returns None.
    """
    # Local import: the file's top-level pandas import does not bring in
    # option_context.
    from pandas import option_context

    perc = percentages(data, category, filter_)
    df = DataFrame(perc, index=["Percent"]).T.sort_values("Percent", ascending=False)
    # Running total in one pass; avoids integer slicing on a label index.
    df["Cumulative Percent"] = df["Percent"].cumsum()
    print("Yes Only")
    print("Total Count: %s" % len(data[filter_]))
    # Scope the percent formatting to this table only, so other float output
    # in the session keeps its normal format.
    with option_context("display.float_format", '{:,.1f}%'.format):
        display(df)
    
#====

from seaborn import countplot, despine, axes_style, set_style
from matplotlib.pyplot import show,figure,subplot,xticks,suptitle,title, ylabel, xlabel, margins
from numpy import mean

def display_categorical_x_categorical_analysis(data,category):
    """Draw two count plots for `category` and show its percentage table.

    The left panel counts only the rows where the module-level `target`
    column equals "Yes"; the right panel counts all rows, split by `target`.
    Bars in both panels follow the descending frequency order of the "Yes"
    rows. Afterwards the percentage breakdown of the "Yes" rows is displayed
    via `display_percentages`.

    Parameters
    ----------
    data : DataFrame containing `category` and the `target` column.
    category : label of the column to plot.
    """
    set_style("whitegrid")

    with axes_style({'grid.color': "0.95", "lines.color" : "0.95"}):

        yes_mask = data[target] == "Yes"
        yes_rows = data[yes_mask]

        # Most frequent subclass among the "Yes" rows comes first.
        bar_order = yes_rows[category].value_counts().sort_values(ascending=False).index

        figure(figsize=(12,6))
        suptitle(category, fontsize=16)
        margins(0.8)

        subplot(121)
        title("Yes Only")
        countplot(x=category, data=yes_rows, order=bar_order, color="#121831", linewidth=0)
        despine(left=True, top=True)

        # Rotate tick labels when the average label is long: 45 degrees for
        # 7-14 characters, 90 degrees beyond that, untouched otherwise.
        typical_label_length = int(mean([len(str(value)) for value in data[category].unique()]))
        if 7 <= typical_label_length <= 14:
            tick_rotation = 45
        elif typical_label_length > 14:
            tick_rotation = 90
        else:
            tick_rotation = None

        if tick_rotation is not None:
            xticks(rotation=tick_rotation)

        subplot(122)
        title("Yes vs No")
        countplot(x=category, hue=target, data=data, order=bar_order, palette=["#121831", "#d4e2ed"], linewidth=0)
        despine(left=True, top=True)
        if tick_rotation is not None:
            xticks(rotation=tick_rotation)
        xlabel(category)
        show()

        display_percentages(data, category, yes_mask)

In [None]:
# One shared iterator over the significant features. Each plotting cell below
# consumes one item with next(i), so the cells must be run once each and in
# order; re-run this cell to start again from the first feature.
i = iter(significant)

# Majority of employees lost in attrition rarely travel.

In [None]:
# Plot and tabulate the next feature pulled from the shared `significant` iterator.
display_categorical_x_categorical_analysis(data,next(i))

# The Research & Development and Sales departments contribute ~95% of the employees lost in attrition.

In [None]:
# Plot and tabulate the next feature pulled from the shared `significant` iterator.
display_categorical_x_categorical_analysis(data,next(i))

# Employees educated in life sciences or medical together make up ~64% of the attrition sample.

In [None]:
# Plot and tabulate the next feature pulled from the shared `significant` iterator.
display_categorical_x_categorical_analysis(data,next(i))

# 70% of the attrition sample is made up of laboratory technicians, sales executives, and research scientists.

In [None]:
# Plot and tabulate the next feature pulled from the shared `significant` iterator.
display_categorical_x_categorical_analysis(data,next(i))

# Employees whose marital status is single make up over 50% of the attrition sample.

In [None]:
# Plot and tabulate the next feature pulled from the shared `significant` iterator.
display_categorical_x_categorical_analysis(data,next(i))

# The ratio of employees working overtime is drastically different across Yes vs No samples. 

In [None]:
# Plot and tabulate the next feature pulled from the shared `significant` iterator.
display_categorical_x_categorical_analysis(data,next(i))

# The distribution of environment satisfaction ratings is different for employees lost in attrition vs. those who stayed.

In [None]:
# Plot and tabulate the next feature pulled from the shared `significant` iterator.
display_categorical_x_categorical_analysis(data,next(i))

# ~80% of employees lost in attrition rated their level of job involvement as moderate to moderately high.

In [None]:
# Plot and tabulate the next feature pulled from the shared `significant` iterator.
display_categorical_x_categorical_analysis(data,next(i))

# Entry level employees make up 60% of the attrition sample

In [None]:
# Plot and tabulate the next feature pulled from the shared `significant` iterator.
display_categorical_x_categorical_analysis(data,next(i))

# Employees who stayed gave job satisfaction ratings that skew more positive than those of employees lost in attrition.

In [None]:
# Plot and tabulate the next feature pulled from the shared `significant` iterator.
display_categorical_x_categorical_analysis(data,next(i))

# 65% of the employees lost in attrition had a stock option level of 0.

In [None]:
# Plot and tabulate the next feature pulled from the shared `significant` iterator.
display_categorical_x_categorical_analysis(data,next(i))

# ~70% of the employees lost in attrition were trained 2-3 times a year.

In [None]:
# Plot and tabulate the next feature pulled from the shared `significant` iterator.
display_categorical_x_categorical_analysis(data,next(i))

# ~78% of employees lost in attrition rated their work-life balance as moderate to moderately high.

In [None]:
# Plot and tabulate the next feature pulled from the shared `significant` iterator.
display_categorical_x_categorical_analysis(data,next(i))

----------

### Read more:
- [IBM Employee Attrition Analysis by Numerics][1]
  [1]: https://www.kaggle.com/slamnz/d/pavansubhasht/ibm-hr-analytics-attrition-dataset/ibm-employee-attrition-analysis-by-numerics/