In [None]:
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input/human-resources-data-set"))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the HR dataset CSV (version 14 by its file name) from the ../input directory into `df`.
df = pd.read_csv('../input/human-resources-data-set/HRDataset_v14.csv')

In [None]:
# Print column names, dtypes and non-null counts to spot missing values.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the columns pandas describes by default.
df.describe()

In [None]:
def mapper(field, new_field, frame=None):
    """Label-encode a column and store the integer codes in a new column.

    Every distinct value of ``field`` receives an integer code in order of
    first appearance (0, 1, 2, ...). The value-to-code dictionary is printed
    so the encoding can be read back from the cell output.

    Parameters
    ----------
    field : pandas.Series
        Column whose values are encoded.
    new_field : str
        Name of the column that receives the integer codes.
    frame : pandas.DataFrame, optional
        Frame the new column is written to. When omitted, the notebook-level
        ``df`` is used, which is what the function always did before.

    Returns
    -------
    None
        The target frame is modified in place.
    """
    # Fall back to the notebook global only when no frame is supplied, so the
    # helper can also be used (and tested) on any other DataFrame.
    target = df if frame is None else frame
    keys = field.unique()
    codes = {key: code for code, key in enumerate(keys)}
    print(codes)
    target[new_field] = field.map(codes).astype(int)

# Fold the lower-case spellings into the capitalised ones so that each answer
# ends up with a single integer code.
df['HispanicLatino'] = df['HispanicLatino'].replace({'no': 'No', 'yes': 'Yes'})

# Integer-encode each categorical column into a matching "Mapped<name>" column.
for source_col in ('State', 'HispanicLatino', 'RaceDesc', 'TermReason', 'RecruitmentSource'):
    mapper(df[source_col], 'Mapped' + source_col)


In [None]:
# Show 15 randomly chosen rows of the frame.
df.sample(15)

In [None]:
# Derive an Age column: the last two characters of DOB are read as a
# two-digit year in the 1900s, and the age is measured against 2020.
dob_two_digit_year = df['DOB'].str[-2:].astype('int64')
df['Age'] = 2020 - (1900 + dob_two_digit_year)

# Eyeball a random sample of the computed ages.
df['Age'].sample(150)

In [None]:
# Is there any visible relationship between age and performance score?
# x axis: PerfScoreID, y axis: Age.
plt.scatter(df['PerfScoreID'], df['Age'])

In [None]:
# Age distribution for each performance score
plt.figure(figsize=(20, 9))
# x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally,
# and the keyword form also works on older releases.
sns.boxplot(x="PerfScoreID", y="Age", data=df)
plt.xticks(rotation=45)
plt.title('Age distribution by PerfScoreID');

In [None]:
# Compare the performance-score density of men and women.
# NOTE(review): the male filter literal is 'M ' with a trailing space —
# presumably that is how the CSV stores it; confirm against the data.
plt.figure(figsize=(15, 9))
for sex_code, legend_label in (('M ', 'men'), ('F', 'women')):
    scores = df.PerfScoreID[df.Sex == sex_code]
    sns.kdeplot(scores, label=legend_label, shade=True, bw=0.4)
plt.xlabel('PerfScoreID');

In [None]:
# Mean performance score per sex.
# NOTE(review): the male filter literal is 'M ' with a trailing space —
# presumably that is how the CSV stores it; confirm against the data.
average_male = df.PerfScoreID[df.Sex == 'M '].mean()
average_female = df.PerfScoreID[df.Sex == 'F'].mean()
# The labels now name the quantity that is actually averaged (PerfScoreID);
# they previously claimed to report absent hours.
print('Mean PerfScoreID for male: ', average_male)
print('Mean PerfScoreID for female: ', average_female)

In [None]:
# Pairwise correlation of the numeric features, drawn as an annotated heatmap.
colormap = plt.cm.RdBu
plt.figure(figsize=(28,28))
plt.title('Correlation of Features', y=1.0, size=10)
# Restrict to numeric/bool columns explicitly: pandas >= 2.0 raises when
# corr() meets string columns instead of silently skipping them.
numeric_df = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_df.corr(), linewidths=0.2, vmax=1.0,
            square=True, cmap=colormap, linecolor='white', annot=True)

In [None]:
# Distribution of employee ages.
# NOTE(review): distplot is deprecated since seaborn 0.11; histplot/displot are
# the replacements but use a different default y-scale, so the call is left as-is.
sns.distplot(df.Age)

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns removed before modelling: the target (PerfScoreID) and its text twin
# (PerformanceScore), identifiers, dates, and the raw categorical columns.
NON_FEATURE_COLUMNS = [
    'EmpID', 'Employee_Name', 'Zip', 'ManagerID', 'PerfScoreID', 'Position',
    'Sex', 'State', 'DOB', 'MaritalDesc', 'CitizenDesc', 'HispanicLatino',
    'RaceDesc', 'DateofHire', 'DateofTermination', 'TermReason', 'EmploymentStatus',
    'Department', 'ManagerName', 'RecruitmentSource', 'PerformanceScore',
    'LastPerformanceReview_Date',
]

X = df.drop(NON_FEATURE_COLUMNS, axis=1)
y = df['PerfScoreID'].values

# Fit the scaler once and transform with that same object, so `scaler.mean_`
# and `scaler.scale_` are exactly the statistics that were applied to X.
# After this line X is a NumPy array; column order follows the remaining
# DataFrame columns.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The split is seeded so that the
# scores and plots below are reproducible across kernel restarts.
SPLIT_RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_RANDOM_STATE)


In [None]:
# Quick death match between a bunch of regressors
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression, ElasticNet, Lars
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error


regressors = [DecisionTreeRegressor(), ExtraTreeRegressor(), #LogisticRegression(),
AdaBoostRegressor(), GradientBoostingRegressor(), ExtraTreesRegressor(), RandomForestRegressor(),
Ridge(alpha=1.5), Lasso(alpha=1.5), LinearRegression(), ElasticNet(), Lars()]

# The "MSE" column holds the *root* mean squared error on the test split; the
# column name is kept because the plotting cell reads it.
log_cols = ["regressors", "MSE"]
log_rows = []

for rgr in regressors:
    rgr.fit(X_train, y_train)
    name = rgr.__class__.__name__

    # RMSE on both splits: a large train/test gap points at over-fitting.
    rmse_train = np.sqrt(mean_squared_error(y_train, rgr.predict(X_train)))
    rmse_test = np.sqrt(mean_squared_error(y_test, rgr.predict(X_test)))

    print("="*30)
    print(name)

    print('****Results****')
    print("RMSE_train: {}".format(rmse_train))
    print("RMSE_test: {}".format(rmse_test))

    log_rows.append([name, rmse_test])

print("="*30)

# Build the results frame once instead of appending row by row
# (DataFrame.append no longer exists in pandas 2.x).
log = pd.DataFrame(log_rows, columns=log_cols)

In [None]:
# Horizontal bar chart of the test RMSE per regressor (stored in the "MSE" column of `log`).
sns.set_color_codes("muted")
rmse_ax = sns.barplot(data=log, y='regressors', x='MSE', color="g")

rmse_ax.set_xlabel('RMSE')
rmse_ax.set_title('regressor RMSE')
plt.show()

In [None]:
from sklearn.datasets import fetch_openml
import forestci as fci

# Random forest is the winner, lets look further into that
cls = RandomForestRegressor().fit(X_train, y_train)
y_hat = cls.predict(X_test)


def _finish_score_plot():
    """Add the y = x reference line and the axis labels, then render the figure."""
    plt.plot([0, 5], [0, 5], 'k--')
    plt.xlabel('Reported Score')
    plt.ylabel('Predicted Score')
    plt.show()


# Predicted vs. reported score, without error bars
plt.figure(figsize=(15, 15))
plt.scatter(y_test, y_hat)
_finish_score_plot()

# Variance estimate for every test prediction, as returned by forestci.
# NOTE(review): confirm the expected second argument against the installed
# forestci version.
V_IJ_unbiased = fci.random_forest_error(cls, X_train, X_test)

# Same plot with error bars of one standard deviation (square root of the variance)
plt.figure(figsize=(15, 15))
plt.errorbar(y_test, y_hat, yerr=np.sqrt(V_IJ_unbiased), fmt='o')
_finish_score_plot()

In [None]:
# Display the feature vector at row index 1 of X (values are already standardised).
X[1,:]

In [None]:
# Static preview: predicted score as a function of age for one reference row.
t = np.arange(18.0, 65.0, 1)  # ages 18..64 in years
a0 = 3
f0 = 3


def _to_model_units(raw, col):
    """Standardise a raw value with the mean/scale the scaler learned for column ``col``."""
    return (raw - scaler.mean_[col]) / scaler.scale_[col]


# Build the probe matrix in this cell so it runs on a fresh kernel; it used to
# rely on an X_sld left over from a cell further down. One copy of training
# row 1 per age value.
X_sld = np.repeat(np.expand_dims(X_train[1, :], axis=0), t.size, axis=0)

# The model was trained on standardised features, so the raw values are
# converted before being written into the matrix.
# NOTE(review): columns -1 / -4 / -5 are taken to be age and the two survey
# scores; confirm against the column order of X.
X_sld[:, -1] = _to_model_units(t, -1)
X_sld[:, -4] = _to_model_units(a0, -4)
X_sld[:, -5] = _to_model_units(f0, -5)
y_sld = cls.predict(X_sld)
l, = plt.plot(t, y_sld, lw=2)

In [None]:
from matplotlib.widgets import Slider, Button, RadioButtons
%matplotlib qt

fig, ax = plt.subplots()
plt.subplots_adjust(left=0.25, bottom=0.25)
t = np.arange(18.0, 65.0, 1)
t1 = (t-scaler.mean_[-1])/scaler.scale_[-1]
a0 = 3
f0 = 3
X_sld = np.repeat(np.expand_dims(X_train[1,:], axis=0), t.size, axis=0)
X_sld[:,-1] = t
X_sld[:,-4] = a0
X_sld[:,-5] = f0
y_sld = cls.predict(X_sld)
l, = plt.plot(t, y_sld, lw=2)
ax.margins(x=0)

axcolor = 'lightgoldenrodyellow'
axfreq = plt.axes([0.25, 0.1, 0.65, 0.03], facecolor=axcolor)
axamp = plt.axes([0.25, 0.15, 0.65, 0.03], facecolor=axcolor)

SEmpSatisfaction = Slider(axfreq, 'Employee Satisfaction', 1, 5, valinit=f0, valstep=1)
sEngagementSurvey = Slider(axamp, 'Engagement Survey Score', 0.1, 5.0, valinit=a0, valstep=0.1)


def update(val):
    EmpSat = SEmpSatisfaction.val
    EngSurv = sEngagementSurvey.val
    X_sld[:,-4] = EmpSat
    X_sld[:,-5] = EmpSurv
    l.set_ydata(cls.predict(X_sld))
    fig.canvas.draw_idle()


SEmpSatisfaction.on_changed(update)
sEngagementSurvey.on_changed(update)

resetax = plt.axes([0.8, 0.025, 0.1, 0.04])
button = Button(resetax, 'Reset', color=axcolor, hovercolor='0.975')


def reset(event):
    SEmpSatisfaction.reset()
    sEngagementSurvey.reset()
button.on_clicked(reset)

rax = plt.axes([0.025, 0.5, 0.15, 0.15], facecolor=axcolor)
radio = RadioButtons(rax, ('red', 'blue', 'green'), active=0)


def colorfunc(label):
    l.set_color(label)
    fig.canvas.draw_idle()
radio.on_clicked(colorfunc)

plt.show()