Kaggle: https://www.kaggle.com/jiangzuo/hr-comma-sep

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# Location of the Kaggle HR data set. Despite the `_DIR` suffix this is the
# path of the CSV file itself, not of a directory.
TRAIN_DIR = '../input/HR_comma_sep.csv'

# Exploratory Data Analysis

In [None]:
def plot_barchart_for_y_by_categorical_column(dataset, y_col, x_col):
    """Plot a stacked bar chart of ``y_col`` value counts, split by ``x_col``.

    Each distinct value of ``y_col`` becomes one bar; each bar is stacked from
    one segment per distinct value of ``x_col``.

    Args:
        dataset: DataFrame containing both columns.
        y_col: Name of the column whose values are counted (one bar per value).
        x_col: Name of a categorical column with at most 4 distinct values.

    Raises:
        AssertionError: If ``x_col`` has more than 4 distinct values.
    """
    categories = dataset[x_col].unique()
    assert len(categories) <= 4, \
        "Input x_col should be a categorical column with fewer than 5 unique values."

    counts_by_category = {
        category: dataset.loc[dataset[x_col] == category, y_col].value_counts()
        for category in categories
    }

    # A y value that never occurs within some category would otherwise show up
    # as NaN in the aligned table; it is really a count of zero.
    counts = pd.DataFrame(counts_by_category).fillna(0)

    ax = counts.plot(kind='bar', stacked=True)
    ax.set_title(f"{y_col} by {x_col}")
    ax.set_xlabel(f"{y_col}")  # the bars are indexed by the y_col values
    ax.set_ylabel("Count")
    plt.show()

In [None]:
def plot_hist_for_numeric_column_by_y(dataset, y_col, x_col):
    """Plot three histograms of ``x_col``: all rows, rows with y=0, rows with y=1.

    Args:
        dataset: DataFrame containing both columns.
        y_col: Name of a binary column whose distinct values are exactly 0 and 1.
        x_col: Name of the numeric column to histogram.

    Raises:
        AssertionError: If the distinct values of ``y_col`` are not exactly 0 and 1.
    """
    assert sorted(dataset[y_col].unique()) == [0, 1], "Input y_col should be 0 or 1."

    views = [
        (dataset[x_col], f'{x_col} Distribution'),
        (dataset.loc[dataset[y_col] == 0, x_col], f'{x_col} Distribution, {y_col}=0'),
        (dataset.loc[dataset[y_col] == 1, x_col], f'{x_col} Distribution, {y_col}=1'),
    ]
    for values, title in views:
        fig, ax = plt.subplots()
        values.hist(ax=ax)
        # The vertical axis of a histogram is the number of rows per bin, not
        # the value of y_col.
        ax.set_ylabel('Count')
        ax.set_xlabel(x_col)
        ax.set_title(title)
        plt.show()

1.Take a Glance at the Data

In [None]:
# Load the whole CSV into a DataFrame; every later cell reads from `train`.
train = pd.read_csv(TRAIN_DIR)

In [None]:
# Preview the first rows to see the column names and typical values.
train.head()

In [None]:
# Column dtypes: shows which columns pandas parsed as numeric and which as text.
train.dtypes

In [None]:
# (number of rows, number of columns)
train.shape

2.Look at Categorical Columns

In [None]:
# Stacked bar chart: one bar per value of `left`, stacked by salary level.
plot_barchart_for_y_by_categorical_column(train, y_col='left', x_col='salary')

In [None]:
# Cross-tabulate salary (rows) against `left` (columns), then divide every row
# by its own total so each row shows the share of each `left` value within
# that salary level.
ct = pd.crosstab(train['salary'], train['left'])
ct.div(ct.sum(axis=1), axis=0)

In [None]:
# NOTE(review): this cell repeats the salary-by-`left` cross tab of the cell
# directly above it, statement for statement; one of the two can be dropped.
# Row-normalized cross tab: share of each `left` value within a salary level.
ct = pd.crosstab(train['salary'], train['left'])
ct.div(ct.sum(axis=1), axis=0)

Observation: Salary is a useful column. High salary means unlikely to resign.

3.Look at Numeric Columns

In [None]:
# Histograms of satisfaction_level: all rows, then left=0 and left=1 separately.
plot_hist_for_numeric_column_by_y(train, y_col='left', x_col='satisfaction_level')

Observation: Satisfaction level is a useful column. High satisfaction means unlikely to resign.

# Prepare Data

In [None]:
from patsy import dmatrices
from sklearn.model_selection import train_test_split

In [None]:
def normalize_0_1(X):
    """Min-max scale every column of ``X`` except the first into [0, 1], in place.

    The first column is skipped because it is expected to be the intercept
    column. A column whose values are all equal has no range to scale by; it
    is set to 0 instead of being turned into NaN by a 0/0 division.

    Args:
        X: 2-D float array (``np.ndarray`` or ``np.matrix``), modified in place.

    Returns:
        None. The result is written back into ``X``.
    """
    _, num_col = X.shape
    for i in range(1, num_col):  # Don't normalize the first column (intercept).
        column = X[:, i]
        col_min, col_max = column.min(), column.max()
        col_range = col_max - col_min
        if col_range == 0:
            # Constant column: (x - min) / 0 would fill the column with NaN.
            X[:, i] = 0
        else:
            X[:, i] = (column - col_min) / col_range

1.Use DMATRICES to Create Training Set Easily (Use Dummy Variables for Categorical Variables)

In [None]:
# Build the target and design matrix with patsy. C(...) dummy-encodes a
# categorical column; patsy also adds the intercept column.
y, X = dmatrices(
    'left ~ satisfaction_level + last_evaluation + number_project'
    ' + average_montly_hours + time_spend_company + Work_accident'
    ' + promotion_last_5years + C(sales) + C(salary)',
    train, return_type='dataframe')

# Replace patsy's generated dummy-column names with readable labels.
# `RandD` is the research-and-development department ("R and D").
X = X.rename(columns = {
    'C(sales)[T.RandD]': 'Department: R&D',
    'C(sales)[T.accounting]': 'Department: Accounting',
    'C(sales)[T.hr]': 'Department: HR',
    'C(sales)[T.management]': 'Department: Management',
    'C(sales)[T.marketing]': 'Department: Marketing',
    'C(sales)[T.product_mng]': 'Department: Product_Management',
    'C(sales)[T.sales]': 'Department: Sales',
    'C(sales)[T.support]': 'Department: Support',
    'C(sales)[T.technical]': 'Department: Technical',
    'C(salary)[T.low]': 'Salary: Low',
    'C(salary)[T.medium]': 'Salary: Medium'})

In [None]:
# Check the renamed design-matrix columns.
X.head()

In [None]:
# Use a plain float ndarray rather than np.matrix: recent scikit-learn releases
# reject np.matrix input. The explicit copy keeps the in-place normalization
# below from touching the DataFrame `X`.
X_np = X.to_numpy(dtype=float, copy=True)
# Flatten the single-column target frame into a 1-D label vector.
y_np = np.ravel(y)

2.Normalize into 0~1

In [None]:
# Scales X_np in place (the function returns nothing); the first column, the
# intercept, is left untouched.
normalize_0_1(X_np)

3.Training Set, Cross Validation Set

In [None]:
# Hold out 20% of the rows for validation; random_state=0 makes the split
# reproducible across runs.
# NOTE(review): X_np was min-max scaled on all rows before this split, so the
# scaling statistics include the validation rows.
X_train, X_val, y_train, y_val = train_test_split(X_np, y_np, test_size=0.2, random_state=0)
print(f'Training Set - X train shape: {X_train.shape}, y train shape: {y_train.shape}')
print(f'Validation Set - X val shape: {X_val.shape}, y val shape: {y_val.shape}')

# Model 1: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
def plot_confusion_matrix(conf_mtrx, classes, cmap=plt.cm.Blues):
    """Draw a square confusion matrix as a colour-coded grid with cell counts.

    Args:
        conf_mtrx: 2-D array of counts; rows are true labels, columns are
            predicted labels. Its first dimension gives the number of classes.
        classes: Tick labels, one per class, used on both axes.
        cmap: Matplotlib colormap for the grid.
    """
    n_classes = conf_mtrx.shape[0]
    tick_positions = np.arange(n_classes)

    fig, ax = plt.subplots()
    heatmap = ax.imshow(conf_mtrx, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(heatmap, ax=ax)
    ax.set(xticks=tick_positions, yticks=tick_positions,
           xticklabels=classes, yticklabels=classes,
           ylabel='True label', xlabel='Predicted label')

    # Cells darker than half the maximum get white text so the count stays legible.
    half_of_max = conf_mtrx.max() / 2.
    for row, col in np.ndindex(n_classes, n_classes):
        count = conf_mtrx[row, col]
        text_color = "white" if count > half_of_max else "black"
        ax.text(col, row, f"{count:.0f}", ha="center", va="center", color=text_color)

    fig.tight_layout()
    plt.show()

1.Fit

In [None]:
# Fit scikit-learn's logistic regression on the training split; max_iter is set
# high so the solver is not cut off by its iteration limit.
log_reg = LogisticRegression(max_iter=10000)
log_reg.fit(X_train, y_train)
# Mean accuracy on the training rows (displayed as the cell output).
log_reg.score(X_train, y_train)

In [None]:
# Pair each design-matrix column name with its fitted coefficient.
pd.DataFrame(list(zip(X.columns, np.transpose(log_reg.coef_))))

2.10-fold Cross Validation to Verify

In [None]:
# Accuracy of a fresh model on each of 10 cross-validation folds over all rows.
# NOTE: `LogisticRegression` must still be the scikit-learn estimator here; a
# class of the same name is defined further down in this notebook, so running
# this cell after that one would pick up that class instead.
print(cross_val_score(LogisticRegression(max_iter=10000), X_np, y_np, scoring='accuracy', cv=10))

3.Confusion Matrix of Prediction

In [None]:
# Evaluate the fitted model on the held-out validation split.
y_pred = log_reg.predict(X_val)
print('Accuracy: ', accuracy_score(y_val, y_pred))
print('Confusion Matrix')
# Rows are true labels, columns are predicted labels.
print(confusion_matrix(y_val, y_pred))
print('Classification Report')
print(classification_report(y_val, y_pred))

In [None]:
# Same confusion matrix as printed above, drawn as a heat map; classes are 0 and 1.
plot_confusion_matrix(confusion_matrix(y_val, y_pred),classes=range(2))

# Model 2: Logistic Regression (No Sklearn)

In [None]:
class LogisticRegression:
    """Binary logistic regression fitted with Gradient Descent or Newton's Method.

    NOTE: this class reuses the name of ``sklearn.linear_model.LogisticRegression``,
    which is imported earlier in this notebook. Once this cell has run, the name
    refers to this class, so earlier cells that build the scikit-learn estimator
    must not be re-run without importing it again.

    Example usage:
        > log_reg = LogisticRegression()
        > log_reg.fit(x_train, y_train)
        > log_reg.predict(x_eval)
    """
    def __init__(self, learning_rate=0.1, max_iter=100, solver='GD', theta_0=None, verbose=True):
        """
        Args:
            learning_rate: Step size multiplied into every update (both solvers).
            max_iter: Number of update steps performed by ``fit``.
            solver: 'GD' - Gradient Descent | 'Newton' - Newton's Method
            theta_0: Initial guess for theta. If None, use the zero vector.
            verbose: Print loss and accuracy values during training.

        Raises:
            AssertionError: If ``solver`` is neither 'GD' nor 'Newton'.
        """
        assert solver == 'GD' or solver == 'Newton', 'Unknown solver'

        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.solver = solver
        # Keep a private float copy: theta is updated in place during training,
        # which must neither modify the caller's array nor fail on an integer one.
        self.theta = None if theta_0 is None else np.array(theta_0, dtype=float).ravel()
        self.verbose = verbose

    def fit(self, x_train, y_train, x_val=None, y_val=None):
        """Minimize loss(theta) for logistic regression.

        Performs exactly ``max_iter`` update steps. When ``verbose`` is set,
        progress is printed after the first step and after every tenth step.

        Args:
            x_train: Training example inputs. Shape (n_examples, dim).
            y_train: Training example labels (0 or 1). Shape (n_examples,).
            x_val: Optional validation inputs. Shape (n_val, dim). Only used to
                report validation accuracy in the progress output.
            y_val: Optional validation labels. Shape (n_val,).
        """
        x_train = np.asarray(x_train, dtype=float)
        y_train = np.ravel(y_train)
        has_validation = x_val is not None and y_val is not None

        _, num_features = x_train.shape
        if self.theta is None:
            self.theta = np.zeros(num_features)

        if self.solver == 'GD':
            take_step = self.__update_theta_via_gradient_descent
        else:
            take_step = self.__update_theta_via_newton_method

        for i in range(1, self.max_iter + 1):
            take_step(x_train, y_train)

            # Metrics are only computed when they are actually printed.
            if self.verbose and (i == 1 or i % 10 == 0):
                loss_train = self.__calculate_loss(x_train, y_train)
                accuracy_train = self.__calculate_accuracy(x_train, y_train)
                message = (f'Iteration {i} : Loss {loss_train:.4f}  | '
                           f'Train Accuracy {accuracy_train:.4f}')
                if has_validation:
                    accuracy_val = self.__calculate_accuracy(x_val, y_val)
                    message += f' | Validation Accuracy {accuracy_val:.4f}'
                print(message)

    def predict(self, x):
        """Return predicted probabilities given new inputs x.
        Args:
            x: Inputs of shape (n_examples, dim).
        Returns:
            Probabilities of class 1 (not class labels). Shape (n_examples,).
        """
        # np.asarray turns an np.matrix into a plain ndarray so that `dot`
        # yields a 1-D result.
        logits = np.asarray(x, dtype=float).dot(self.theta)
        pred = 1. / (1 + np.exp(-logits))
        return self.__ravel_np_matrix(pred)

    def __update_theta_via_gradient_descent(self, x, y):
        """Update theta via gradient descent (only one step).
        Args:
            x: Inputs Shape (n_examples, dim).
            y: Inputs Shape (n_examples,).
        """
        x = np.asarray(x, dtype=float)
        y = np.ravel(y)
        num_examples, _ = x.shape
        y_pred = self.predict(x)
        gradient = - (1/num_examples) * x.T.dot(y - y_pred)
        self.theta -= self.learning_rate * gradient

    def __update_theta_via_newton_method(self, x, y):
        """Update theta via Newton's method (only one step).
        Args:
            x: Inputs Shape (n_examples, dim).
            y: Inputs Shape (n_examples,).
        Raises:
            numpy.linalg.LinAlgError: If the Hessian is singular.
        """
        x = np.asarray(x, dtype=float)
        y = np.ravel(y)
        num_examples, _ = x.shape
        y_pred = self.predict(x)
        gradient = - (1/num_examples) * x.T.dot(y - y_pred)
        # Hessian = (1/n) * X^T diag(w) X. Scaling the columns of X^T by w gives
        # the same product without materializing the n-by-n diagonal matrix.
        weights = y_pred * (1 - y_pred)
        hessian = (1/num_examples) * (x.T * weights).dot(x)
        # Solve H * step = gradient instead of forming the explicit inverse.
        self.theta -= self.learning_rate * np.linalg.solve(hessian, gradient)

    def __calculate_loss(self, x, y):
        """Calculate the mean negative log-likelihood on dataset (x, y).
        Args:
            x: Inputs Shape (n_examples, dim).
            y: Inputs Shape (n_examples,).
        Returns:
            Outputs Shape scalar.
        """
        y = np.ravel(y)
        # Keep probabilities strictly inside (0, 1) so the logarithms stay finite.
        eps = np.finfo(float).eps
        y_pred = np.clip(self.predict(x), eps, 1 - eps)
        log_likelihood = np.where(y == 1, np.log(y_pred), np.log(1 - y_pred))
        return -log_likelihood.mean()

    def __calculate_accuracy(self, x, y):
        """Calculate accuracy based on dataset (x, y), using a 0.5 threshold.
        Args:
            x: Inputs Shape (n_examples, dim).
            y: Inputs Shape (n_examples,).
        Returns:
            Outputs Shape scalar.
        """
        y = np.ravel(y)
        y_pred = self.predict(x)
        correct = ((y_pred > 0.5) & (y == 1)) | ((y_pred <= 0.5) & (y == 0))
        return np.count_nonzero(correct) / len(y_pred)

    def __ravel_np_matrix(self, np_matrix):
        """Ravel a np.matrix (1, n) into (n,).
        Args:
            np_matrix: Inputs Shape (1, n).
        Returns:
            Output Shape (n,).
        """
        return np.array(np_matrix).ravel()

1.Fit

In [None]:
# Train the from-scratch model with gradient descent. The validation split is
# passed so that fit can print validation accuracy next to the training metrics.
# `log_reg` is rebound here; it no longer refers to the scikit-learn model.
log_reg = LogisticRegression(learning_rate=0.5, solver='GD', max_iter=200)
log_reg.fit(X_train, y_train, X_val, y_val)

Observation: With the gradient-descent solver, about 90 iterations are enough; training for more iterations leads to overfitting.

In [None]:
# Train a second from-scratch model with Newton's method. This replaces the
# gradient-descent model in `log_reg`, so the evaluation cells below report on
# the Newton-trained model only.
log_reg = LogisticRegression(learning_rate=0.5, solver='Newton', max_iter=30)
log_reg.fit(X_train, y_train, X_val, y_val)

Observation: Newton's method converges in fewer iterations, but each iteration takes longer (due to the computation of the Hessian matrix).

2.Confusion Matrix of Prediction

In [None]:
# The from-scratch model's predict returns probabilities, not labels, so they
# are thresholded at 0.5 before being handed to the scikit-learn metrics.
y_pred = log_reg.predict(X_val)
y_pred = [1 if y > 0.5 else 0 for y in y_pred]
print('Accuracy: ', accuracy_score(y_val, y_pred))
print('Confusion Matrix')
# Rows are true labels, columns are predicted labels.
print(confusion_matrix(y_val, y_pred))
print('Classification Report')
print(classification_report(y_val, y_pred))

In [None]:
# Heat-map view of the confusion matrix printed above; classes are 0 and 1.
plot_confusion_matrix(confusion_matrix(y_val, y_pred),classes=range(2))