# Rotacion de empleados en una empresa
## Visualizacion de los datos que componen el dataset

In [1]:
import sklearn

In [2]:
import pandas as pd
hr = pd.read_csv('HR_comma_sep.csv')
col_names = hr.columns.tolist()
print("Nombre de las columnas:")
print(col_names)
print("\nDatos de ejemplo:")
hr.head()

Nombre de las columnas:
['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'Work_accident', 'left', 'promotion_last_5years', 'sales', 'salary']

Datos de ejemplo:


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [3]:
# Renombramiento de sales a departamento para que el nombre sea más representativo
hr=hr.rename(columns = {'sales':'departamento'})

### Tipos de los datos

In [4]:
hr.dtypes

satisfaction_level       float64
last_evaluation          float64
number_project             int64
average_montly_hours       int64
time_spend_company         int64
Work_accident              int64
left                       int64
promotion_last_5years      int64
departamento              object
salary                    object
dtype: object

### Se verifica si hay datos nulos en el dataset

In [5]:
hr.isnull().any()

satisfaction_level       False
last_evaluation          False
number_project           False
average_montly_hours     False
time_spend_company       False
Work_accident            False
left                     False
promotion_last_5years    False
departamento             False
salary                   False
dtype: bool

### Cantidad de registros en el dataset

In [6]:
hr.shape

(14999, 10)

Todos los departamentos de la empresa en el dataset

In [7]:
hr['departamento'].unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

Para reducir el número de departamentos, se combinan el área de soporte, IT y técnica en una sola área tecnica

In [8]:
import numpy as np
hr['departamento']=np.where(hr['departamento'] =='support', 'technical', hr['departamento'])
hr['departamento']=np.where(hr['departamento'] =='IT', 'technical', hr['departamento'])

In [9]:
hr['left'].value_counts()


0    11428
1     3571
Name: left, dtype: int64

In [10]:
hr.groupby('left').mean()

Unnamed: 0_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years
left,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.66681,0.715473,3.786664,199.060203,3.380032,0.175009,0.026251
1,0.440098,0.718113,3.855503,207.41921,3.876505,0.047326,0.005321


In [11]:
hr.groupby('departamento').mean()

Unnamed: 0_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years
departamento,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
RandD,0.619822,0.712122,3.853875,200.800508,3.367217,0.170267,0.153748,0.034307
accounting,0.582151,0.717718,3.825293,201.162973,3.522816,0.125163,0.265971,0.018253
hr,0.598809,0.70885,3.654939,198.684709,3.355886,0.120433,0.290934,0.020298
management,0.621349,0.724,3.860317,201.249206,4.303175,0.163492,0.144444,0.109524
marketing,0.618601,0.715886,3.687646,199.385781,3.56993,0.160839,0.236597,0.050117
product_mng,0.619634,0.714756,3.807095,199.965632,3.47561,0.146341,0.219512,0.0
sales,0.614447,0.709717,3.776329,200.911353,3.534058,0.141787,0.244928,0.024155
technical,0.613687,0.720976,3.839054,201.813795,3.416127,0.144106,0.246924,0.008258


In [12]:
hr.groupby('salary').mean()

Unnamed: 0_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years
salary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high,0.63747,0.704325,3.767179,199.867421,3.692805,0.155214,0.066289,0.058205
low,0.600753,0.717017,3.799891,200.996583,3.438218,0.142154,0.296884,0.009021
medium,0.621817,0.717322,3.813528,201.338349,3.52901,0.145361,0.204313,0.028079


## Definiendo Categorias
Se separa departamento en columnas para poder utilizarlas en el entrenamiento de los modelos

In [13]:
cat_vars=['departamento','salary']
for var in cat_vars:
    cat_list='var'+'_'+var
    cat_list = pd.get_dummies(hr[var], prefix=var)
    hr1=hr.join(cat_list)
    hr=hr1

In [14]:
hr.drop(hr.columns[[8, 9]], axis=1, inplace=True)
hr.columns.values

array(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'left', 'promotion_last_5years', 'departamento_RandD',
       'departamento_accounting', 'departamento_hr',
       'departamento_management', 'departamento_marketing',
       'departamento_product_mng', 'departamento_sales',
       'departamento_technical', 'salary_high', 'salary_low',
       'salary_medium'], dtype=object)

La variable a predecir es ```left``` y las demás variables se utilizarán como entradas para el modelo

In [15]:
hr_vars=hr.columns.values.tolist()
y=['left']
X=[i for i in hr_vars if i not in y]

## Seleccion de parametros
La eliminación de características recursivas (RFE) funciona mediante la eliminación recursiva de variables y la creación de un modelo en las variables que permanecen. Utiliza la precisión del modelo para identificar qué variables (y combinación de variables) contribuyen más a la predicción de la variable objetivo.

In [16]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# model = RandomForestClassifier()
model = LogisticRegression()
rfe = RFE(model, 10)
rfe = rfe.fit(hr[X], hr[y])
print(rfe.support_)
print(rfe.ranking_)

  y = column_or_1d(y, warn=True)


[ True  True False False  True  True  True  True False  True  True False
 False False False  True  True False]
[1 1 3 9 1 1 1 1 5 1 1 6 8 7 4 1 1 2]


Se obtuvieron 10 variables como las más relevantes para el modelo
El primer array marca en ```True``` cuales son y el segundo muestra un ranking de las mismas

In [17]:
cols=['satisfaction_level', 'last_evaluation', 'time_spend_company', 'Work_accident', 'promotion_last_5years', 
      'departamento_RandD', 'departamento_hr', 'departamento_management', 'salary_high', 'salary_low'] 
X=hr[cols]
y=hr['left']

### Regresion Logisitica

In [18]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [19]:
from sklearn.metrics import accuracy_score
print('Logistic regression accuracy: {:.3f}'.format(accuracy_score(y_test, logreg.predict(X_test))))

Logistic regression accuracy: 0.771


### Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [21]:
print('Random Forest Accuracy: {:.3f}'.format(accuracy_score(y_test, rf.predict(X_test))))

Random Forest Accuracy: 0.977


### Support Vector Machine

In [22]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [23]:
print('Support vector machine accuracy: {:.3f}'.format(accuracy_score(y_test, svc.predict(X_test))))

Support vector machine accuracy: 0.909


## Comparación de los modelos
Se puede observar que el modelo con mayor precisión a la hora de predecir la salida de los empleados de una empresa es el modelo de Random Forest

## Cross Validation
Se aplica Cross Validation para medir la precisión promedia de Random Forest

In [24]:
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
kfold = model_selection.KFold(n_splits=10, random_state=7)
modelCV = RandomForestClassifier()
scoring = 'accuracy'
results = model_selection.cross_val_score(modelCV, X_train, y_train, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy: %.3f" % (results.mean()))

10-fold cross validation average accuracy: 0.978


## Matrices de Precisión y Sensibilidad (recall) de los modelos
Una vez se entrenaron los tres modelos, podemos analizar la precisión y sensibilidad de cada uno

### Random Forest

In [25]:
from sklearn.metrics import classification_report
print(classification_report(y_test, rf.predict(X_test)))

             precision    recall  f1-score   support

          0       0.99      0.98      0.99      3462
          1       0.95      0.95      0.95      1038

avg / total       0.98      0.98      0.98      4500



### Logistic Regression

In [26]:
print(classification_report(y_test, logreg.predict(X_test)))

             precision    recall  f1-score   support

          0       0.81      0.92      0.86      3462
          1       0.51      0.26      0.35      1038

avg / total       0.74      0.77      0.74      4500



### Support Vector Machine

In [27]:
print(classification_report(y_test, svc.predict(X_test)))

             precision    recall  f1-score   support

          0       0.96      0.92      0.94      3462
          1       0.77      0.86      0.81      1038

avg / total       0.91      0.91      0.91      4500



## Importancia de los features para Random Forest
Eligiendo el modelo de RandomForest, que presentó la mayor precisión de entre los tres, podemos determinar cuales variables fueron las más determinantes a la hora de decidir si un empleado permanecía o salía de la empresa

In [28]:
feature_labels = np.array(['satisfaction_level', 'last_evaluation', 'time_spend_company', 'Work_accident', 'promotion_last_5years', 
      'department_RandD', 'department_hr', 'department_management', 'salary_high', 'salary_low'])
importance = rf.feature_importances_
feature_indexes_by_importance = importance.argsort()
for index in feature_indexes_by_importance:
    print('{}-{:.2f}%'.format(feature_labels[index], (importance[index] *100.0)))

promotion_last_5years-0.25%
department_management-0.27%
department_RandD-0.28%
department_hr-0.31%
salary_high-0.87%
salary_low-1.11%
Work_accident-1.51%
last_evaluation-19.81%
time_spend_company-28.30%
satisfaction_level-47.29%
