# Classification models to predict if an employee will leave or not.

In [None]:
# Import the SparkSession entry point for the DataFrame API.
from pyspark.sql import SparkSession
# Configure a session builder with the application name 'logreg', then obtain
# the session from it via getOrCreate().
session_builder = SparkSession.builder.appName('logreg')
spark = session_builder.getOrCreate()

In [None]:
# Load the HR CSV file into the dataframe 'data'. The first row is used as the
# header and column types are inferred from the values.
data = spark.read.csv(
    'HR_comma_sep.csv',
    header=True,
    inferSchema=True,
)
# Show the resulting column names and inferred types.
data.printSchema()

In [None]:
# Build the statistical summary of the dataframe and print it.
summary_stats = data.describe()
summary_stats.show()

In [None]:
# Lists the names of all columns in the dataframe (not only the model features).
data.columns

In [None]:
# Importing Vectors, VectorAssembler, Pipeline and StringIndexer libraries
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

In [None]:
# StringIndexer maps the string column 'salary' to a numeric index column
# 'salary_in'. Indices lie in [0, numLabels) and are ordered by label
# frequency, so the most frequent label receives index 0.
indexer = StringIndexer(inputCol='salary', outputCol='salary_in')
# First learn the label-to-index mapping from the data ...
indexer_model = indexer.fit(data)
# ... then apply it, which appends the 'salary_in' column.
indexed = indexer_model.transform(data)
indexed.columns

In [None]:
# Input columns that are merged into the single 'features' vector column.
feature_cols = [
    'satisfaction_level',
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
    'Work_accident',
    'promotion_last_5years',
    'salary_in',
]
# VectorAssembler - a feature transformer that combines several columns into
# one vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

In [None]:
# Apply the assembler to the indexed dataframe; this appends the 'features'
# vector column.
output = assembler.transform(indexed)
# Keep only the feature vector and the 'left' label column for modelling.
final_data = output.select('features', 'left')
# randomSplit - randomly splits this DataFrame with the given weights and
# returns the resulting DataFrames in a list (here 70% train / 30% test).
# A fixed seed is passed so that the split, and the metrics computed from it,
# can be repeated from run to run, as is done for the Random Forest split.
train_churn, test_churn = final_data.randomSplit([0.7, 0.3], seed=5043)

## Logistic Regression Classifier

In [None]:
# Importing Logistic Regression libraries
from pyspark.ml.classification import LogisticRegression

In [None]:
# Logistic regression estimator (supports binomial and multinomial/softmax
# logistic regression). 'left' is the label column; regularisation is
# switched off (regParam=0.0) and training runs for at most 30 iterations.
lr_churn = LogisticRegression(
    labelCol='left',
    maxIter=30,
    regParam=0.0,
    elasticNetParam=0.0,
)

In [None]:
# fit() - trains the estimator on the training split and returns the fitted model.
fitted_churn_model = lr_churn.fit(train_churn)
# The model's training summary exposes the predictions made on the training data.
training_summary = fitted_churn_model.summary
training_predictions = training_summary.predictions
# Print descriptive statistics of the training predictions dataframe.
training_predictions.describe().show()

## Logistic Regression Classifier - Binary Classification Evaluator

In [None]:
# Importing BinaryClassificationEvaluator Libraries for evaluating the Logistic Regression model 
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# evaluate() - scores the fitted model on the held-out test split and returns
# a summary object; its 'predictions' dataframe is displayed below.
pred_and_labels = fitted_churn_model.evaluate(test_churn)
test_predictions = pred_and_labels.predictions
test_predictions.show()

In [None]:
# BinaryClassificationEvaluator - evaluator for binary classification, which
# expects two input columns: rawPrediction and label.
# The raw-prediction column must be the model's score column ('rawPrediction'),
# not the thresholded 0/1 'prediction' column: with hard labels the ROC curve
# collapses to a single operating point and the reported AUC is misleading.
churn_eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='left',
    metricName='areaUnderROC',
)
# Area under the ROC curve on the test-set predictions.
auc = churn_eval.evaluate(pred_and_labels.predictions)
auc

## Random Forest Classifier

In [None]:
# Importing RandomForestClassifier libraries
from pyspark.ml.classification import RandomForestClassifier

In [None]:
# Splitting the data randomly into training (70%) and testing (30%) sets,
# using a fixed seed for the random split.
splitSeed = 5043
split_weights = [0.7, 0.3]
train_data, test_data = output.randomSplit(split_weights, seed=splitSeed)

In [None]:
# Random forest estimator with the tunable parameters spelled out:
# entropy impurity, 20 trees, depth up to 30, fixed seed, label column 'left'.
classifier = RandomForestClassifier(
    labelCol="left",
    numTrees=20,
    maxDepth=30,
    impurity="entropy",
    seed=5043,
)
# Train the forest on the training split.
model = classifier.fit(train_data)

## Random Forest - Multiclass Classification Evaluator

In [None]:
# Importing Multiclass Classification Evaluator libraries
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Score the test split with the fitted model; this appends the prediction columns.
predictions = model.transform(test_data)
# Show one feature next to the true label and the predicted label.
prediction_preview = predictions.select("satisfaction_level", "left", "prediction")
prediction_preview.show()

In [None]:
# Multiclass Classification Evaluator comparing the 'prediction' column with
# the 'left' label column.
# The metric is set explicitly to accuracy: the evaluator's default metric is
# F1, whereas this notebook reports and compares the models by accuracy.
data_eval = MulticlassClassificationEvaluator(labelCol='left', metricName='accuracy')
# Accuracy of the predictions, on a scale of 0 to 1.
# NOTE: the variable keeps its original name 'auc1', but it holds accuracy, not AUC.
auc1 = data_eval.evaluate(predictions)
auc1

## Naive Bayes Classifier

In [None]:
# Importing Naive Bayes libraries
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel

In [None]:
# Splitting the data randomly into training (70%) and testing (30%) sets.
# The seed defined for the Random Forest split (splitSeed) is reused so the
# split can be repeated from run to run instead of changing on every execution.
train_data2, test_data2 = output.randomSplit([0.7, 0.3], seed=splitSeed)

In [None]:
# Multinomial Naive Bayes estimator with smoothing 0.2 and label column 'left'.
# Note: this rebinds 'classifier' and 'model', replacing the Random Forest objects.
classifier = NaiveBayes(
    labelCol="left",
    modelType="multinomial",
    smoothing=0.2,
)
# Train the Naive Bayes model on its training split.
model = classifier.fit(train_data2)

## Naive Bayes - Multiclass Classification Evaluator

In [None]:
# Importing Multiclass Classification Evaluator libraries
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Score the Naive Bayes test split with the fitted model; this appends the
# prediction columns.
predictions = model.transform(test_data2)
# Show one feature next to the true label and the predicted label.
prediction_preview = predictions.select("satisfaction_level", "left", "prediction")
prediction_preview.show()

In [None]:
# Multiclass Classification Evaluator comparing the 'prediction' column with
# the 'left' label column.
# The metric is set explicitly to accuracy: the evaluator's default metric is
# F1, whereas this notebook reports and compares the models by accuracy.
data_eval = MulticlassClassificationEvaluator(labelCol='left', metricName='accuracy')
# Accuracy of the predictions, on a scale of 0 to 1.
# NOTE: the variable keeps its original name 'auc1', but it holds accuracy, not AUC.
auc1 = data_eval.evaluate(predictions)
auc1

##### Among the three classification techniques, the Random Forest classifier produced the most accurate model, with approximately 0.98 accuracy.