In [15]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [16]:
# Load the HR attrition CSV, then discard rows containing any missing value
# and rows that are exact duplicates of an earlier row.
df = (
    pd.read_csv("HR-Employee-Attrition.csv")
    .dropna()
    .drop_duplicates()
)

# Text preview of the first five rows of the cleaned frame.
print(df.head(n=5))

   Age Attrition     BusinessTravel  DailyRate              Department  \
0   41       Yes      Travel_Rarely       1102                   Sales   
1   49        No  Travel_Frequently        279  Research & Development   
2   37       Yes      Travel_Rarely       1373  Research & Development   
3   33        No  Travel_Frequently       1392  Research & Development   
4   27        No      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1        Medical              1               7   

           ...           RelationshipSatisfaction StandardHours  \
0      

# Feature 1

In [17]:
# Feature: binary flag derived from the categorical "BusinessTravel" column,
# encoded as the digits 0 and 1 so it can be fed to a statistical model later.
#   "Travel_Frequently" -> 1
#   every other value   -> 0
#
# A direct comparison is used rather than a two-key .map(): with .map(), any
# category absent from the dict (for example a "Non-Travel" label, if the data
# contains one) becomes NaN, and a NaN-bearing column cannot be cast to int32.
# The comparison always yields a clean int32 column with no missing values.
df['Travelreq'] = df["BusinessTravel"].eq('Travel_Frequently').astype('int32')




# Feature 2

In [18]:
# Feature: percentage of each employee's total working years that were spent
# with this company, i.e. YearsAtCompany / TotalWorkingYears * 100, rounded to
# the nearest whole number.
#
# Rows where TotalWorkingYears is 0 have no defined percentage. The zero
# denominator is masked to NaN before dividing so those rows come out as NaN
# (never +/-inf); handle the NaNs explicitly before modelling.
df['percentofyearswithcompany'] = (
    df['YearsAtCompany']
    .div(df['TotalWorkingYears'].where(df['TotalWorkingYears'] != 0))
    .mul(100)
    .round()
)


# Feature 3

In [19]:
# Feature: 1 for employees strictly older than the mean age of all employees,
# 0 for everyone else.
age_mean = df['Age'].mean()
df['AgeAboveaverage'] = df['Age'].gt(age_mean).astype(int)




# Feature 4

In [20]:
# Frequency table of job roles, most common first.
print(df['JobRole'].value_counts(sort=True, ascending=False))

Sales Executive              326
Research Scientist           292
Laboratory Technician        259
Manufacturing Director       145
Healthcare Representative    131
Manager                      102
Sales Representative          83
Research Director             80
Human Resources               52
Name: JobRole, dtype: int64


In [21]:
# Feature: numeric rank for the categorical "JobRole" column. Each job position
# is placed on a 1-3 scale, with 1 the top of the ranking. A column like this
# could later support a model that compares salaries across positions.
# Roles not listed in the mapping come out as NaN.
df['JobsCategory'] = df["JobRole"].map({
    # Rank 1
    "Research Scientist": 1,
    "Manufacturing Director": 1,
    "Research Director": 1,
    # Rank 2
    'Sales Executive': 2,
    "Manager": 2,
    "Human Resources": 2,
    # Rank 3
    "Laboratory Technician": 3,
    "Healthcare Representative": 3,
    "Sales Representative": 3,
})



# Feature 5

In [22]:
# Feature: numeric gender indicator built from the "Gender" column:
# "Male" -> 1, "Female" -> 0. Any other value is left as NaN by .map().
df['Gendermale'] = df["Gender"].map(
    {"Female": 0, 'Male': 1}
)

In [23]:
# Text preview of the first five rows, now including the engineered columns.
print(df.head(n=5))

   Age Attrition     BusinessTravel  DailyRate              Department  \
0   41       Yes      Travel_Rarely       1102                   Sales   
1   49        No  Travel_Frequently        279  Research & Development   
2   37       Yes      Travel_Rarely       1373  Research & Development   
3   33        No  Travel_Frequently       1392  Research & Development   
4   27        No      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1        Medical              1               7   

      ...      WorkLifeBalance YearsAtCompany  YearsInCurrentRole  \
0    