In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file below the Kaggle input directory,
# then print the paths one per line in walk order.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Purpose

The purpose of this notebook is to determine whether we can predict which members of staff will leave the company.


To do this, I will prepare the data and fit a logistic regression to see which factors, if any, are significantly associated with employees terminating their contracts.

In [None]:
data = pd.read_csv('/kaggle/input/human-resources-data-set/HRDataset_v13.csv')

In [None]:
data.describe()

In [None]:
data.head()

In [None]:
data.tail()

# Weirdly, there seem to be a lot of null rows.

In [None]:
data.columns

In [None]:
# Boolean mask of the rows that have no employee name; display those rows for inspection.
filt = data['Employee_Name'].isna()
data.loc[filt]

In [None]:
# Remove the rows selected by `filt` (rows whose Employee_Name is null).
# inplace=True mutates `data` itself rather than returning a new frame.
data.drop(data[filt].index, inplace=True)

In [None]:
data.tail()

This is better. I have removed the completely null rows and am ready to begin.

In [None]:
# Binary target column: 1 where the termination reason says the employee is
# still employed, 0 for every other row.
still_employed = data['TermReason'] == 'N/A - still employed'
data['active'] = still_employed.astype('int64')

# Here I am adding a column that flags whether each employee is still active (1) or not (0).
# This is the target variable for the regression later.

In [None]:
pd.set_option('display.max_columns', None)
data.head()

# I will just take a look at the column names

The Date of Hire variable is a problem as a raw date. I will convert it into the year of hire, as a stand-in for years at the company.

In [None]:
data['DateofHire'] = pd.to_datetime(data['DateofHire'])

In [None]:
# Pull the calendar year out of the already-parsed hire dates, then show the new column.
data['YearofHire'] = data['DateofHire'].dt.year
data['YearofHire']

I will do a similar thing for date of birth.

In [None]:
# Parse the DOB column into datetimes and store the result. pd.to_datetime returns
# a new Series and does not modify the column, so it has to be assigned back.
# The `infer_datetime_format` argument is omitted because it is deprecated in pandas 2.x.
data['DOB'] = pd.to_datetime(data['DOB'])

# Year component of each date of birth.
# NOTE(review): if DOB is stored with two-digit years, the parser may place some
# of them in the wrong century — inspect the output for implausible years.
data['YearofBirth'] = pd.DatetimeIndex(data['DOB']).year
data['YearofBirth']

# I can see that something has gone wrong with the year: for some rows the parser has inferred the wrong century.
# I can fix this by subtracting 100 from the affected years.

In [None]:
#data['YearofBirth'] = pd.to_numeric(data['YearofBirth'])

In [None]:
filt = data['YearofBirth']>2010
data[filt]

# We can see that this affects many rows.

In [None]:
def fix_time(f, cutoff=2010, shift=100):
    """Shift a year that was parsed into the wrong century back by ``shift`` years.

    Parameters
    ----------
    f : int or float
        Year value to check.
    cutoff : int, optional
        Largest year accepted as-is; any value strictly greater than this is
        treated as mis-parsed. Defaults to 2010.
    shift : int, optional
        Number of years subtracted from a mis-parsed value. Defaults to 100.

    Returns
    -------
    int or float
        ``f - shift`` when ``f > cutoff``, otherwise ``f`` unchanged. A NaN
        input compares as not greater than the cutoff and is returned as-is.
    """
    return f - shift if f > cutoff else f

In [None]:
data['YearofBirth'] = data['YearofBirth'].apply(fix_time)

In [None]:
filt = data['YearofBirth']>2010
data[filt]

# We can see that this problem is now fixed.

In [None]:
# Most of the categorical columns already have an ID counterpart, which makes this easier.
# Drop the identifier columns, the raw date columns whose year has already been
# extracted, and TermReason (the `active` column was derived from it, so keeping
# it would hand the answer to the model).
# NOTE(review): EmploymentStatus is kept here and is one-hot encoded later; it
# presumably carries the same information as `active` (target leakage) — confirm
# and consider excluding it from the model features.
columns_to_drop = [
    'Employee_Name', 'EmpID', 'PositionID', 'GenderID', 'TermReason',
    'DeptID', 'ManagerID', 'PerfScoreID', 'MaritalStatusID', 'EmpStatusID',
    'LastPerformanceReview_Date', 'DateofTermination', 'DateofHire', 'DOB', 'Zip',
]
data = data.drop(columns_to_drop, axis=1)

In [None]:
# Histogram of the Sex column, labelled so the figure stands on its own.
x = data['Sex']
plt.hist(x)
plt.xlabel('Sex')
plt.ylabel('Count')
plt.title('Distribution of Sex');  # trailing ';' hides the raw return value of the last call

In [None]:
# Histogram of days late in the last 30 days.
z = data['DaysLateLast30']
# This column still contains missing values at this point (they are only filled
# in a later cell), so exclude them explicitly instead of passing NaN to the binning.
plt.hist(z.dropna())
plt.xlabel('DaysLateLast30')
plt.ylabel('Count')
plt.title('Distribution of DaysLateLast30');  # trailing ';' hides the raw return value of the last call

In [None]:
data['Department'].value_counts()

In [None]:
# Histogram of pay rates, labelled so the figure stands on its own.
pay = data['PayRate']
plt.hist(pay)
plt.xlabel('PayRate')
plt.ylabel('Count')
plt.title('Distribution of PayRate');  # trailing ';' hides the raw return value of the last call

# We can see that a large share of employees are on the lower pay rates, a few earn more, and one or two are far above the rest.

In [None]:
data['EmploymentStatus'].value_counts()

# It will be interesting to see what the causes of termination were.

In [None]:
# Show every column when a wide frame is displayed.
pd.set_option('display.max_columns', None)
# One-hot encode the remaining object/category-typed columns. drop_first=True
# omits the first level of each encoded column, so the dummy columns for one
# variable are not perfectly collinear.
data = pd.get_dummies(data, drop_first=True)
# Inspect the last rows of the encoded frame.
data.tail()

In [None]:
y = data['active']
x = data['EmpSatisfaction']

In [None]:
x1 = x.values.reshape(-1,1)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
plt.scatter(x,y)
plt.xlabel('emp-satisfaction')
plt.ylabel('active')

In [None]:
LogReg = LogisticRegression()
LogReg.fit(x1,y)

In [None]:
LogReg.score(x1,y)

In [None]:
LogReg.coef_

In [None]:
pd.set_option('display.max_rows', None)
data.isnull().sum()

In [None]:
data['DaysLateLast30'].value_counts()

In [None]:
# so I have only two problem columns. I can assume that in the column for days late, NaN means 0.

data['DaysLateLast30'] = data['DaysLateLast30'].fillna(0)
data['DaysLateLast30'].value_counts()

In [None]:
data = data.dropna()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out a test set. `active` is the target; every other column is a feature.
# A fixed random_state makes the split reproducible, so the scores, coefficients
# and p-values computed from it do not change from one run to the next.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop('active', axis=1),
    data['active'],
    random_state=42,
)

In [None]:
from sklearn import preprocessing

In [None]:
# Rescale the features: fit on the training split, then apply the same
# transformer to the test split.
# NOTE(review): Normalizer rescales each row (sample) to unit norm; it does not
# standardise each column. If per-feature scaling was intended,
# preprocessing.StandardScaler is the usual choice — confirm before changing,
# since it will alter the fitted coefficients.
normalizer = preprocessing.Normalizer()
x_train_normed = normalizer.fit_transform(x_train)
x_test_normed = normalizer.transform(x_test)

# The target is a 0/1 class label and is intentionally left unscaled: the model
# is fit and scored on y_train / y_test directly. Passing the 1-D target Series
# to Normalizer is not valid (the transformer requires 2-D input and raises
# ValueError), so no normalised copy of the target is created.

In [None]:
LogReg = LogisticRegression()
LogReg.fit(x_train_normed, y_train)

In [None]:
LogReg.score(x_train_normed, y_train)

In [None]:
LogReg.score(x_test_normed, y_test)

So there it is: the model's accuracy on the training split and on the held-out test split.

In [None]:
LogReg_summary = pd.DataFrame(x_train.columns.values, columns=['Features'])

In [None]:
LogReg_summary

In [None]:
coefs = LogReg.coef_
coefs.shape

In [None]:
# Turn the (1, n_features) coefficient array into a single column with one row
# per feature. -1 lets NumPy infer the row count, so this keeps working when
# the number of encoded features changes.
coefs = coefs.reshape(-1, 1)

In [None]:
LogReg_summary['coefs'] = coefs

In [None]:
LogReg_summary.sort_values('coefs')

TODO: go back through the notebook and be smarter about how the categorical variables are handled at the beginning.

In [None]:
from sklearn.feature_selection import f_regression

In [None]:
f_regression(x_train_normed, y_train)

In [None]:
# f_regression returns a pair (F-statistics, p-values), one entry per feature;
# keep the p-values. These come from univariate tests of each feature against
# the target, not from the fitted logistic-regression coefficients.
f_statistics, p_values = f_regression(x_train_normed, y_train)

In [None]:
LogReg_summary['p_value'] = p_values.round(4)

In [None]:
LogReg_summary.sort_values('p_value')

# Results

So there are some significant factors and many insignificant ones.

people with lower performance scores seem to be 