#Machine Learning - Notebook

1. Load the data.
2. Train the data on the classification models.
3. Test the model using testing data.
4. Display the confusion matrix.
5. Calculate the metrics.
6. Save the model.
7. Create a report and save it in Azure ADLS

## Data Fetching

In [0]:
%run "./Secrets and variables"

In [0]:
if not any(mnt.mountPoint == mount_point for mnt in dbutils.fs.mounts()):
    dbutils.fs.mount(
        source = f"wasbs://{container_name}@{storage_acct}.blob.core.windows.net",
        mount_point = mount_point,
        extra_configs = {
            f"fs.azure.account.key.{storage_acct}.blob.core.windows.net": access_key
        }
    )

In [0]:
train_data = spark.read.format("parquet").load(f"{mount_point}/transformed_data/balanced_train_data.parquet/part*.parquet")
test_data = spark.read.format("parquet").load(f"{mount_point}/transformed_data/test_data.parquet/part*.parquet")

## Machine Learning

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier,RandomForestClassifier, GBTClassifier,MultilayerPerceptronClassifier, LinearSVC, LogisticRegression

In [0]:
num_of_features = train_data.select("features").first()[0].size


Model Declaration

In [0]:
lr = LogisticRegression(labelCol='Attrition_Cat')
decisionTree = DecisionTreeClassifier(featuresCol='features', labelCol='Attrition_Cat')
randomForest = RandomForestClassifier(featuresCol='features', labelCol='Attrition_Cat', numTrees=15)
gbt = GBTClassifier(featuresCol='features', labelCol='Attrition_Cat', maxIter=15)
mlp = MultilayerPerceptronClassifier(featuresCol='features', labelCol='Attrition_Cat', maxIter=100, layers=[num_of_features, 7, 4, 2])
svc = LinearSVC(featuresCol='features', labelCol='Attrition_Cat')

##Model Training

In [0]:
model = lr.fit(train_data)

In [0]:
decisionTreeModel = decisionTree.fit(train_data)

In [0]:
randomForestModel = randomForest.fit(train_data)

In [0]:
gbtModel = gbt.fit(train_data)

In [0]:
mlpModel = mlp.fit(train_data)

In [0]:
svcModel = svc.fit(train_data)

##Model Prediction

In [0]:
res = model.transform(test_data)

In [0]:
resDecisionTreeModel = decisionTreeModel.transform(test_data)

In [0]:
resRandomForestModel = randomForestModel.transform(test_data)

In [0]:
resGBTModel = gbtModel.transform(test_data)

In [0]:
resMLPModel = mlpModel.transform(test_data)

In [0]:
resSVCModel = svcModel.transform(test_data)

## Evaluation

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
AUC_evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Attrition_Cat',metricName='areaUnderROC')

In [0]:
AUC = AUC_evaluator.evaluate(res)
AUC_DecisionTree = AUC_evaluator.evaluate(resDecisionTreeModel)
AUC_RandomForest = AUC_evaluator.evaluate(resRandomForestModel)
AUC_GBT = AUC_evaluator.evaluate(resGBTModel)
AUC_MLP = AUC_evaluator.evaluate(resMLPModel)
AUC_SVC = AUC_evaluator.evaluate(resSVCModel)   

In [0]:
print(f"The AUC for Logistic Regression Model is: {AUC}")
print(f"The AUC for Decision Tree Model is: {AUC_DecisionTree}")
print(f"The AUC for Random Forest Model is: {AUC_RandomForest}")
print(f"The AUC for GBT Model is: {AUC_GBT}")
print(f"The AUC for MLP Model is: {AUC_MLP}")
print(f"The AUC for SVC Model is: {AUC_SVC}")

The AUC for Logistic Regression Model is: 0.8496200607902736
The AUC for Decision Tree Model is: 0.8985562310030396
The AUC for Random Forest Model is: 0.8391717325227963
The AUC for GBT Model is: 0.9153115501519756
The AUC for MLP Model is: 0.8842325227963527
The AUC for SVC Model is: 0.8504179331306991


In [0]:
PR_evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Attrition_Cat',metricName='areaUnderPR')

In [0]:
PR = PR_evaluator.evaluate(res)
PR_DecisionTree = PR_evaluator.evaluate(resDecisionTreeModel)
PR_RandomForest = PR_evaluator.evaluate(resRandomForestModel)
PR_GBT = PR_evaluator.evaluate(resGBTModel)
PR_MLP = PR_evaluator.evaluate(resMLPModel)
PR_SVC = PR_evaluator.evaluate(resSVCModel)

In [0]:
print(f"The PR for Logistic Regression Model is: {PR}")
print(f"The PR for Decision Tree Model is: {PR_DecisionTree}")
print(f"The PR for Random Forest Model is: {PR_RandomForest}")
print(f"The PR for GBT Model is: {PR_GBT}")
print(f"The PR for MLP Model is: {PR_MLP}")
print(f"The PR for SVC Model is: {PR_SVC}")

The PR for Logistic Regression Model is: 0.5020241633165629
The PR for Decision Tree Model is: 0.6409701720452
The PR for Random Forest Model is: 0.6065219474793944
The PR for GBT Model is: 0.703751882405743
The PR for MLP Model is: 0.5677754884646652
The PR for SVC Model is: 0.49933611379755


In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
ACC_evaluator = MulticlassClassificationEvaluator(labelCol="Attrition_Cat", predictionCol="prediction", metricName="accuracy")

In [0]:
accuracy = ACC_evaluator.evaluate(res)
Acc_DecisionTree = ACC_evaluator.evaluate(resDecisionTreeModel)
Acc_RandomForest = ACC_evaluator.evaluate(resRandomForestModel)
Acc_GBT = ACC_evaluator.evaluate(resGBTModel)
Acc_MLP = ACC_evaluator.evaluate(resMLPModel)
Acc_SVC = ACC_evaluator.evaluate(resSVCModel)

In [0]:
print(f"The Accuracy for the model is: {accuracy}")
print(f"The Accuracy for Decision Tree Model is: {Acc_DecisionTree}")
print(f"The Accuracy for Random Forest Model is: {Acc_RandomForest}")
print(f"The Accuracy for GBT Model is: {Acc_GBT}")
print(f"The Accuracy for MLP Model is: {Acc_MLP}")
print(f"The Accuracy for SVC Model is: {Acc_SVC}")

The Accuracy for the model is: 0.8649258542875564
The Accuracy for Decision Tree Model is: 0.9168278529980658
The Accuracy for Random Forest Model is: 0.9065119277885235
The Accuracy for GBT Model is: 0.9348807221147647
The Accuracy for MLP Model is: 0.8910380399742102
The Accuracy for SVC Model is: 0.8633139909735654


## Confusion Matrices

In [0]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [0]:
y_true = res.select(['Attrition_Cat'])
y_pred = res.select(['prediction'])

actualFloat = y_true.rdd.map(lambda x: float(x[0]))
predFloat = y_pred.rdd.map(lambda x: float(x[0]))

pairs = actualFloat.zip(predFloat)

metrics = MulticlassMetrics(pairs)
metrics.confusionMatrix().toArray()



array([[2294.,   81.],
       [ 338.,  389.]])

In [0]:
logisticRegressionPrecision = metrics.recall(1.0)
logisticRegressionRecall = metrics.precision(1.0)
logisticRegressionF1Score = metrics.fMeasure(1.0)

In [0]:
y_true = resDecisionTreeModel.select(['Attrition_Cat'])
y_pred = resDecisionTreeModel.select(['prediction'])

actualFloat = y_true.rdd.map(lambda x: float(x[0]))
predFloat = y_pred.rdd.map(lambda x: float(x[0]))

pairs = actualFloat.zip(predFloat)

metrics = MulticlassMetrics(pairs)
metrics.confusionMatrix().toArray()




array([[2434.,   60.],
       [ 198.,  410.]])

In [0]:
decisionTreePrecision = metrics.recall(1.0)
decisionTreeRecall = metrics.precision(1.0)
decisionTreeF1Score = metrics.fMeasure(1.0)

In [0]:
y_true = resRandomForestModel.select(['Attrition_Cat'])
y_pred = resRandomForestModel.select(['prediction'])

actualFloat = y_true.rdd.map(lambda x: float(x[0]))
predFloat = y_pred.rdd.map(lambda x: float(x[0]))

pairs = actualFloat.zip(predFloat)

metrics = MulticlassMetrics(pairs)
metrics.confusionMatrix().toArray()




array([[2463.,  121.],
       [ 169.,  349.]])

In [0]:
randomForestPrecision = metrics.recall(1.0)
randomForestRecall = metrics.precision(1.0)
randomForestF1Score = metrics.fMeasure(1.0)

In [0]:
y_true = resGBTModel.select(['Attrition_Cat'])
y_pred = resGBTModel.select(['prediction'])

actualFloat = y_true.rdd.map(lambda x: float(x[0]))
predFloat = y_pred.rdd.map(lambda x: float(x[0]))

pairs = actualFloat.zip(predFloat)

metrics = MulticlassMetrics(pairs)
metrics.confusionMatrix().toArray()




array([[2483.,   53.],
       [ 149.,  417.]])

In [0]:
gbtPrecision = metrics.recall(1.0)
gbtRecall = metrics.precision(1.0)
gbtF1Score = metrics.fMeasure(1.0)

In [0]:
y_true = resMLPModel.select(['Attrition_Cat'])
y_pred = resMLPModel.select(['prediction'])

actualFloat = y_true.rdd.map(lambda x: float(x[0]))
predFloat = y_pred.rdd.map(lambda x: float(x[0]))

pairs = actualFloat.zip(predFloat)

metrics = MulticlassMetrics(pairs)
metrics.confusionMatrix().toArray()




array([[2353.,   59.],
       [ 279.,  411.]])

In [0]:
mlpPrecision = metrics.recall(1.0)
mlpRecall = metrics.precision(1.0)
mlpF1Score = metrics.fMeasure(1.0)

In [0]:
y_true = resSVCModel.select(['Attrition_Cat'])
y_pred = resSVCModel.select(['prediction'])

actualFloat = y_true.rdd.map(lambda x: float(x[0]))
predFloat = y_pred.rdd.map(lambda x: float(x[0]))

pairs = actualFloat.zip(predFloat)

metrics = MulticlassMetrics(pairs)
metrics.confusionMatrix().toArray()




array([[2287.,   79.],
       [ 345.,  391.]])

In [0]:
svcPrecision = metrics.recall(1.0)
svcRecall = metrics.precision(1.0)
svcF1Score = metrics.fMeasure(1.0)

## Saving the models

In [0]:
model.write().overwrite().save(f"{mount_point}/models/LogisticRegressionModel")
decisionTreeModel.write().overwrite().save(f"{mount_point}/models/DecisionTreeModel")
randomForestModel.write().overwrite().save(f"{mount_point}/models/RandomForestModel")
gbtModel.write().overwrite().save(f"{mount_point}/models/GBTModel")
mlpModel.write().overwrite().save(f"{mount_point}/models/MLPModel")
svcModel.write().overwrite().save(f"{mount_point}/models/SVCModel")

## Report Generation

In [0]:
modelList = {"LogisticRegression": {
    "AUC": AUC,
    "PR": PR,
    "Accuracy": accuracy,
    "Precision": logisticRegressionPrecision,
    "Recall": logisticRegressionRecall,
    "F1": logisticRegressionF1Score
},
             "DecisionTree": {
                "AUC":AUC_DecisionTree,
                "PR": PR_DecisionTree,
                "Accuracy": Acc_DecisionTree,
                "Precision": decisionTreePrecision,
                "Recall": decisionTreeRecall,
                "F1": decisionTreeF1Score
             },
             "RandomForest": {
                 "AUC":AUC_RandomForest,
                 "PR": PR_RandomForest,
                 "Accuracy": Acc_RandomForest,
                 "Precision": randomForestPrecision,
                 "Recall": randomForestRecall,
                 "F1": randomForestF1Score
             },
             "GBT": {
                 "AUC":AUC_GBT,
                 "PR": PR_GBT,
                 "Accuracy": Acc_GBT,
                 "Precision": gbtPrecision,
                 "Recall": gbtRecall,
                 "F1": gbtF1Score
             },
             "MLP": {
                 "AUC":AUC_MLP,
                 "PR": PR_MLP,
                 "Accuracy": Acc_MLP,
                 "Precision": mlpPrecision,
                 "Recall": mlpRecall,
                 "F1": mlpF1Score
                 },
             "SVC": {
                 "AUC":AUC_SVC,
                 "PR": PR_SVC,
                 "Accuracy": Acc_SVC,
                 "Precision": svcPrecision,
                 "Recall": svcRecall,
                 "F1": svcF1Score
             }
}

In [0]:
reportList = []
for modelName, modelMetrics in modelList.items():
    reportList.append((modelName, modelMetrics["AUC"], modelMetrics["PR"], modelMetrics["Accuracy"],modelMetrics["Precision"], modelMetrics["Recall"], modelMetrics["F1"]))

In [0]:
reportDf = spark.createDataFrame(reportList, ["Model", "AUC", "PR", "Accuracy","Precision","Recall","F1 Score"])

In [0]:
reportDf.coalesce(1).write. \
    mode("overwrite"). \
        format("csv"). \
            option("header", "true"). \
                save(f"{mount_point}/reports/model_report.csv")