# Task for Today  

***

## Employee Termination Prediction  

Given *employee data from HR*, let's try to predict which employees are most likely to be **terminated**.

We will use various classification models to make our predictions. 

# Getting Started

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
# Load the HR dataset CSV into a DataFrame (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer='../input/human-resources-data-set/HRDataset_v14.csv')

In [None]:
# Show the loaded DataFrame as this cell's output.
data

In [None]:
# Print the DataFrame summary produced by `DataFrame.info()`.
data.info()

# Preprocessing

In [None]:
def encode_dates(df, columns_with_prefixes):
    """Replace date columns with numeric year / month / day columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first, so the caller's frame is not modified.
    columns_with_prefixes : iterable of (str, str)
        ``(column, prefix)`` pairs. Each ``column`` is parsed with
        ``pd.to_datetime`` and replaced by ``<prefix>_year``,
        ``<prefix>_month`` and ``<prefix>_day``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which every listed column has been dropped and its
        three component columns appended on the right.
    """
    df = df.copy()

    for column, prefix in columns_with_prefixes:
        # Parse once, then read the components through the vectorised `.dt`
        # accessor rather than calling a Python lambda for every row.
        parsed = pd.to_datetime(df[column])

        df[prefix + '_year'] = parsed.dt.year
        df[prefix + '_month'] = parsed.dt.month
        df[prefix + '_day'] = parsed.dt.day

        # The raw date column is no longer needed once it has been expanded.
        df = df.drop(column, axis=1)

    return df

def ordinal_encode(df, columns_with_orderings):
    """Swap ordered category labels for their integer position.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; a copy is encoded and returned.
    columns_with_orderings : iterable of (str, sequence)
        ``(column, ordering)`` pairs. Each value in ``column`` is replaced by
        its position in ``ordering``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns replaced by positions.

    Raises
    ------
    ValueError
        If a value in a listed column does not appear in its ordering.
    """
    encoded = df.copy()

    for target_col, ranked_labels in columns_with_orderings:
        # `ranked_labels.index` gives the position of the first match and
        # raises ValueError for a label that is missing from the ordering.
        encoded[target_col] = encoded[target_col].apply(ranked_labels.index)

    return encoded

def onehot_encode(df, columns_with_prefixes):
    """Expand nominal columns into indicator (dummy) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; a copy is encoded and returned.
    columns_with_prefixes : iterable of (str, str)
        ``(column, prefix)`` pairs. Each ``column`` is passed to
        ``pd.get_dummies`` with the given ``prefix``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every listed column has been dropped and its
        indicator columns appended on the right.
    """
    expanded = df.copy()

    for source_col, dummy_prefix in columns_with_prefixes:
        indicators = pd.get_dummies(expanded[source_col], prefix=dummy_prefix)
        # Append the indicators first, then discard the original column.
        expanded = pd.concat([expanded, indicators], axis=1).drop(source_col, axis=1)

    return expanded

In [None]:
def preprocess_inputs(df, scaler, train_size=0.7, random_state=1):
    """Turn the raw HR frame into scaled train/test features and `Termd` targets.

    Steps, in order: drop the columns listed below, expand the three date
    columns into year/month/day, ordinal-encode ``PerformanceScore``, one-hot
    encode the nominal columns, separate the ``Termd`` target, split into
    train and test sets, fit ``scaler`` on the training features only, and
    scale both feature sets with it.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the columns referenced in this function. It is
        copied first, so the caller's frame is not modified.
    scaler : object with ``fit`` and ``transform`` methods
        Fitted in place on the training features.
    train_size : float or int, default 0.7
        Forwarded to ``train_test_split``.
    random_state : int, default 1
        Forwarded to ``train_test_split``.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Scaled features, indexed by the same row labels as the matching target.
    y_train, y_test : pandas.Series
        ``Termd`` values for the corresponding rows.
    """
    df = df.copy()

    # Drop unneeded columns.
    # NOTE(review): EmpStatusID / EmploymentStatus / DateofTermination /
    # TermReason presumably describe the termination outcome itself and would
    # leak the target - confirm against the dataset's data dictionary.
    df = df.drop(
        [
            'Employee_Name', 'EmpID', 'MaritalStatusID', 'Sex', 'PositionID',
            'DeptID', 'PerfScoreID', 'EmpStatusID', 'EmploymentStatus',
            'DateofTermination', 'TermReason', 'ManagerID'
        ],
        axis=1
    )

    # Date encoding
    date_columns = [
        ('DOB', "DOB"),
        ('DateofHire', "DOH"),
        ('LastPerformanceReview_Date', "PRD")
    ]
    df = encode_dates(df, columns_with_prefixes=date_columns)

    # Ordinal encoding
    ordinal_columns = [
        ('PerformanceScore', ['PIP', 'Needs Improvement', 'Fully Meets', 'Exceeds'])
    ]
    df = ordinal_encode(df, columns_with_orderings=ordinal_columns)

    # One-hot encoding
    nominal_columns = [
        ('Position', "PS"),
        ('State', "ST"),
        ('Zip', "ZP"),
        ('MaritalDesc', "MD"),
        ('CitizenDesc', "CD"),
        ('HispanicLatino', "HL"),
        ('RaceDesc', "RD"),
        ('Department', "DE"),
        ('ManagerName', "MN"),
        ('RecruitmentSource', "RS")
    ]
    df = onehot_encode(df, columns_with_prefixes=nominal_columns)

    # Split df into X and y
    y = df['Termd'].copy()
    X = df.drop('Termd', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=random_state
    )

    # Scale X: fit on the training rows only, then apply to both splits.
    scaler.fit(X_train)

    # Keep each split's row index so the scaled features stay aligned by label
    # with y_train / y_test, which retain the index from the split.
    X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns, index=X_train.index)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

    return X_train, X_test, y_train, y_test

In [None]:
# MinMaxScaler is the scaler used for this run; preprocess_inputs fits it on the training split.
scaler = MinMaxScaler()

X_train, X_test, y_train, y_test = preprocess_inputs(df=data, scaler=scaler)

In [None]:
# Show the scaled training features as this cell's output.
X_train

In [None]:
# Show the training targets (the `Termd` column) as this cell's output.
y_train

# Training

In [None]:
# Candidate classifiers, keyed by a left-padded display name so the printed
# lines align. The decision tree and the neural network are stochastic, so
# they get a fixed random_state (the same seed value as the train/test split)
# to make a fresh Restart & Run All give the same results.
models = {
    "   K-Nearest Neighbors": KNeighborsClassifier(),
    "   Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC(),
    "         Decision Tree": DecisionTreeClassifier(random_state=1),
    "        Neural Network": MLPClassifier(random_state=1)
}

# Fit every model on the same scaled training split.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

# Results

In [None]:
# Report the score of each fitted model on the held-out test split, as a percentage.
for name, model in models.items():
    print("{} Accuracy: {:.2f}%".format(name, model.score(X_test, y_test) * 100))

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/v0XdNicMXuY