# Data Exploration

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

Reading The Data....

In [2]:
data = pd.read_csv('C://Users//SANDEEP YADAV//Desktop//Major2Poject//employee_data.csv', delimiter=',')

In [3]:
print(data.shape)

(14999, 12)


Finding Missing Values:

In [4]:
data.apply(lambda x: sum(x.isnull()))

name                      1
satisfaction_level        0
last_evaluation           0
number_projects           0
average_monthly_hours     0
time_spent_company        0
work_accident             0
left                      0
promotion_last_5_years    0
department                0
salary                    0
salary_level              0
dtype: int64

In [5]:
data.head()

Unnamed: 0,name,satisfaction_level,last_evaluation,number_projects,average_monthly_hours,time_spent_company,work_accident,left,promotion_last_5_years,department,salary,salary_level
0,SMITH,0.38,0.53,2,157,3,0,1,0,sales,low,1
1,JOHNSON,0.8,0.86,5,262,6,0,1,0,sales,medium,2
2,WILLIAMS,0.11,0.88,7,272,4,0,1,0,sales,medium,2
3,BROWN,0.72,0.87,5,223,5,0,1,0,sales,low,1
4,JONES,0.37,0.52,2,159,3,0,1,0,sales,low,1


One thing can be infered from above that salary and salary_level represents the same thing beacuse :

* low=1
* medium=2
* high=3

In [6]:
data.describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_projects,average_monthly_hours,time_spent_company,work_accident,left,promotion_last_5_years,salary_level
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,0.612834,0.716102,3.803054,201.050337,3.498233,0.14461,0.238083,0.021268,1.594706
std,0.248631,0.171169,1.232592,49.943099,1.460136,0.351719,0.425924,0.144281,0.637183
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0,1.0
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0,1.0
50%,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0,2.0
75%,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0,2.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0,3.0


From this the following things can be analysed :

1.The avg satisfaction_level is > 0.6 - this clearly states that it is a not a major factor and along with it there are other factors too which are affecting the employees i.e last_evaluation whose avg > 0.7.

2.The employee is having of nearly 2 years of experience in the company and gets avg salary.

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 12 columns):
name                      14998 non-null object
satisfaction_level        14999 non-null float64
last_evaluation           14999 non-null float64
number_projects           14999 non-null int64
average_monthly_hours     14999 non-null int64
time_spent_company        14999 non-null int64
work_accident             14999 non-null int64
left                      14999 non-null int64
promotion_last_5_years    14999 non-null int64
department                14999 non-null object
salary                    14999 non-null object
salary_level              14999 non-null int64
dtypes: float64(2), int64(7), object(3)
memory usage: 1.4+ MB


In [8]:
print(data[data['name']==""].info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 12 columns):
name                      0 non-null object
satisfaction_level        0 non-null float64
last_evaluation           0 non-null float64
number_projects           0 non-null int64
average_monthly_hours     0 non-null int64
time_spent_company        0 non-null int64
work_accident             0 non-null int64
left                      0 non-null int64
promotion_last_5_years    0 non-null int64
department                0 non-null object
salary                    0 non-null object
salary_level              0 non-null int64
dtypes: float64(2), int64(7), object(3)
memory usage: 0.0+ bytes
None


Finding and filtering categorical variables:

In [9]:
categorical = data.dtypes[data.dtypes=="object"].index

print(categorical)

Index(['name', 'department', 'salary'], dtype='object')


Looking at the number of unique values in each of them:

In [10]:
data[categorical].apply(lambda x: len(x.unique()))

name          14999
department       10
salary            3
dtype: int64

* This tells us that allthe data is unique.
* There are 10 departments.
* And the salary types is = 3.

In [11]:
data[categorical].describe()

Unnamed: 0,name,department,salary
count,14998,14999,14999
unique,14998,10,3
top,FARMER,sales,low
freq,1,4140,7316


In [12]:
#Excluding Name:

categorical_columns = [x for x in categorical if x not in ['name']]

#Printing Frequency of categories:

for col in categorical_columns:
    print('\nFrequency of Categories for variable :',col)
    print(data[col].value_counts())


Frequency of Categories for variable : department
sales          4140
technical      2720
support        2229
IT             1227
product_mng     902
marketing       858
RandD           787
accounting      767
hr              739
management      630
Name: department, dtype: int64

Frequency of Categories for variable : salary
low       7316
medium    6446
high      1237
Name: salary, dtype: int64


The output gives us following observations:

1.department: We can go for two broad categories of department i.e Technical And Non-Technical.

2.salary: Mojority employees lies in the category of low and med salary type.

But we should check if that’s a good idea before doing it.

# Data Cleaning

Imputing Missing Values : Since there is no missing values so no such requirement is there....

In [13]:
data.apply(lambda x: sum(x.isnull()))

name                      1
satisfaction_level        0
last_evaluation           0
number_projects           0
average_monthly_hours     0
time_spent_company        0
work_accident             0
left                      0
promotion_last_5_years    0
department                0
salary                    0
salary_level              0
dtype: int64

Applying One Hot Encoding on salary and salary_level.

In [14]:
#One Hot Coding:
data = pd.get_dummies(data, columns=['salary','department'])

In [15]:
data.dtypes

name                       object
satisfaction_level        float64
last_evaluation           float64
number_projects             int64
average_monthly_hours       int64
time_spent_company          int64
work_accident               int64
left                        int64
promotion_last_5_years      int64
salary_level                int64
salary_high                 uint8
salary_low                  uint8
salary_medium               uint8
department_IT               uint8
department_RandD            uint8
department_accounting       uint8
department_hr               uint8
department_management       uint8
department_marketing        uint8
department_product_mng      uint8
department_sales            uint8
department_support          uint8
department_technical        uint8
dtype: object

In [16]:
data.head()

Unnamed: 0,name,satisfaction_level,last_evaluation,number_projects,average_monthly_hours,time_spent_company,work_accident,left,promotion_last_5_years,salary_level,...,department_IT,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical
0,SMITH,0.38,0.53,2,157,3,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
1,JOHNSON,0.8,0.86,5,262,6,0,1,0,2,...,0,0,0,0,0,0,0,1,0,0
2,WILLIAMS,0.11,0.88,7,272,4,0,1,0,2,...,0,0,0,0,0,0,0,1,0,0
3,BROWN,0.72,0.87,5,223,5,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
4,JONES,0.37,0.52,2,159,3,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0


In [17]:
data[['department_IT','department_RandD','department_accounting','department_hr','department_management','department_marketing','department_product_mng','department_sales','department_support','department_technical']].head(10)

Unnamed: 0,department_IT,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical
0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,1,0,0
5,0,0,0,0,0,0,0,1,0,0
6,0,0,0,0,0,0,0,1,0,0
7,0,0,0,0,0,0,0,1,0,0
8,0,0,0,0,0,0,0,1,0,0
9,0,0,0,0,0,0,0,1,0,0


In [18]:
data[['salary_low','salary_medium','salary_high']].head(10)

Unnamed: 0,salary_low,salary_medium,salary_high
0,1,0,0
1,0,1,0
2,0,1,0
3,1,0,0
4,1,0,0
5,1,0,0
6,1,0,0
7,1,0,0
8,1,0,0
9,1,0,0


Exporting Data...

In [19]:
#Drop the columns which have been converted to different types:
data.head()
#data.drop(['salary_level','name','department'],axis=1,inplace=True)

Unnamed: 0,name,satisfaction_level,last_evaluation,number_projects,average_monthly_hours,time_spent_company,work_accident,left,promotion_last_5_years,salary_level,...,department_IT,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical
0,SMITH,0.38,0.53,2,157,3,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
1,JOHNSON,0.8,0.86,5,262,6,0,1,0,2,...,0,0,0,0,0,0,0,1,0,0
2,WILLIAMS,0.11,0.88,7,272,4,0,1,0,2,...,0,0,0,0,0,0,0,1,0,0
3,BROWN,0.72,0.87,5,223,5,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
4,JONES,0.37,0.52,2,159,3,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0


In [20]:
data.drop('name',axis=1,inplace=True)

In [46]:
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_projects,average_monthly_hours,time_spent_company,work_accident,left,promotion_last_5_years,salary_level,salary_high,...,department_IT,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical
0,0.38,0.53,2,157,3,0,1,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,0.8,0.86,5,262,6,0,1,0,2,0,...,0,0,0,0,0,0,0,1,0,0
2,0.11,0.88,7,272,4,0,1,0,2,0,...,0,0,0,0,0,0,0,1,0,0
3,0.72,0.87,5,223,5,0,1,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,0.37,0.52,2,159,3,0,1,0,1,0,...,0,0,0,0,0,0,0,1,0,0


So let’s define who are the best and most experienced employees..
                (above average)

* Last Evaluation >= 0.74
* time_spend_company >= 4
* number_project > 4
* About 1944 people 

In [56]:
good_employee_data = data.loc[data['last_evaluation'] >= 0.74].loc[data['time_spent_company'] >= 4].loc[data['number_projects'] > 4]
good_employee_data.shape

(1944, 22)

Make A File For Good People.....

In [60]:
good_employee_data.to_csv('C://Users//SANDEEP YADAV//Desktop//Major2Poject//good_employee_data.csv')
print('Done.....!!!')

Done.....!!!


In [None]:
df = pd.DataFrame(data)
df.head()

In [None]:
X = df.loc[:, df.columns != 'left']
Y = df.loc[:, df.columns == 'left']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=5)

In [None]:
clf = RandomForestClassifier()

In [None]:
clf.fit(X_train, Y_train)

In [None]:
pred = clf.predict(X_test)
acc = accuracy_score(Y_test, pred)

print("Accuracy of the algorithm: ", acc)
print("Total number of employees which are likely to leave: ", sum(pred))