In [1]:
!pip install pyspark
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('Logistic Regression').getOrCreate()

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=bd5b92c438da0eb3636154577bfcf6459cb785a41f14ee741733bb6ef95821cf
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


#Binary classification tasks (2 categories only)
Only using a subset of the data to maintain class balance for the classification tasks (to avoid an anomalous detection category task) and to limit the amount of signals in the data as some of the features in the dataset strongly affect the output.

To predict if the user will subscribe to another product or service (term deposit), based on the other attributes, such as age, job, loan, etc. To find the top users who can be targeted by the business for cross-selling or upselling.

In [2]:
df = spark.read.csv('sample_data/bank-test3.csv', inferSchema=True, header=True)
df.count()

10468

In [3]:
df.columns

['age',
 'job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'target_class']

In [4]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- target_class: string (nullable = true)



In [5]:
# Convert yeses and noes into 1s and 0s, as well as rename the target_class column to "label"
df.groupBy('target_class').count().show()

+------------+-----+
|target_class|count|
+------------+-----+
|          no| 5179|
|         yes| 5289|
+------------+-----+



In [6]:
# Feature Engineering
from pyspark.sql import functions as F
from pyspark.sql import *
df = df.withColumn("label", F.when(df.target_class == 'no', F.lit(0)).otherwise(F.lit(1)))
df.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|    1| 5289|
|    0| 5179|
+-----+-----+



In [7]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

In [8]:
# To create features for the model due to categorical columns, such as job and education. Make use of StringIndexer and OneHotEncoder

def cat_to_num(df):
  for col in df.columns:
    stringIndexer = StringIndexer(inputCol = col, outputCol = col+"_index")

    model = stringIndexer.fit(df)

    indexed = model.transform(df)

    encoder = OneHotEncoder(inputCol = col+"_index", outputCol = col+"_vec")

    ohe = encoder.fit(indexed)

    df = ohe.transform(indexed)

  df_assembler = VectorAssembler(inputCols=['age', 'marital_vec', 'education_vec', 'default_vec', 'housing_vec', 'loan_vec'], outputCol="features")
  df = df_assembler.transform(df)
  return df.select(['features', 'label'])

In [9]:
df_new = cat_to_num(df)
df_new.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[58.0,1.0,0.0,0.0...|    0|
|[44.0,0.0,1.0,1.0...|    0|
|[33.0,1.0,0.0,1.0...|    0|
|[47.0,1.0,0.0,0.0...|    0|
|(9,[0,2,6,8],[33....|    0|
|[35.0,1.0,0.0,0.0...|    0|
|[59.0,1.0,0.0,1.0...|    1|
|[56.0,1.0,0.0,1.0...|    1|
|[41.0,1.0,0.0,1.0...|    1|
|[55.0,1.0,0.0,1.0...|    1|
|[54.0,1.0,0.0,0.0...|    1|
|[42.0,0.0,1.0,0.0...|    1|
|[56.0,1.0,0.0,0.0...|    1|
|[60.0,0.0,0.0,1.0...|    1|
|[39.0,0.0,1.0,0.0...|    1|
|[37.0,1.0,0.0,1.0...|    1|
|[34.0,1.0,0.0,1.0...|    1|
|[55.0,0.0,0.0,1.0...|    1|
|[28.0,0.0,1.0,1.0...|    1|
|[30.0,1.0,0.0,1.0...|    1|
+--------------------+-----+
only showing top 20 rows



In [10]:
# All the input features are now merged into a single dense vector ('features'), along with output column labels, which we can use to train the ML models.
# The new dataframe created using only 2 columns (features, label) is now called df_new. We can now split this new dataframe into train and test datasets.
# We can split the data into a 75%/25% ratio, using the randomSplit function.

# Step 3: Split the Data into Train and Test Datasets
train, test = df_new.randomSplit([0.75, 0.25])
print(f"Size of train Dataset : {train.count()}")

Size of train Dataset : 7901


In [11]:
print(f"Size of test Dataset : {test.count()}")

Size of test Dataset : 2567


In [12]:
# Step 4: Build and Train the Logistic Regression Model
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression()
lr_model = lr.fit(train)
print(lr_model.coefficients)

# Once the model is built, use the internal function summary, which offers details on ROC curve, precision, recall, AUC (area under the curve), etc.

[0.015744996668735654,-0.17573780180308202,0.19691855655264878,0.18864438020278904,0.6596088424547651,-0.25399285901439117,-0.30615723168760234,-1.3474636520260221,0.3610460400930601]


In [13]:
# Step 5: Evaluate Performance on Training Data

lr_summary=lr_model.summary
lr_summary.accuracy

0.6758638147069991

In [14]:
lr_summary.areaUnderROC

0.72024332048166

In [15]:
lr_summary.weightedRecall

0.6758638147069991

In [16]:
lr_summary.weightedPrecision

0.6766578664287813

In [17]:
# Using the summary function, we can view the model's performance on train data, such as its accuracy, AUC, weighted recall, and precision.
# How precision varies for various threshold values, the relation between precision and recall, and how recall varies with different threshold values

lr_summary.precisionByThreshold.show()

+------------------+------------------+
|         threshold|         precision|
+------------------+------------------+
|0.8687250425167268|               1.0|
|0.8678368087907256|               1.0|
| 0.866020437205675|               1.0|
|0.8566200353754436|               1.0|
|0.8527087664776976|0.8571428571428571|
| 0.851703859873189|             0.875|
|0.8513824008857376|0.8888888888888888|
|0.8507202476363105|               0.9|
|0.8497041654463837|0.9090909090909091|
|0.8493791383329399|0.9166666666666666|
|0.8476823285599637|0.8461538461538461|
|0.8446217297054671|               0.8|
|0.8435717703051641|            0.8125|
|0.8427279013125774|0.8235294117647058|
|0.8425441915289432|0.8333333333333334|
|0.8404441224901129|0.7894736842105263|
|0.8393713018775975|0.8095238095238095|
|0.8390281623615576|0.8181818181818182|
|0.8383214186248691|0.7916666666666666|
|0.8368902767434918|0.7857142857142857|
+------------------+------------------+
only showing top 20 rows



In [18]:
lr_summary.roc.show()

+--------------------+--------------------+
|                 FPR|                 TPR|
+--------------------+--------------------+
|                 0.0|                 0.0|
|                 0.0|2.501250625312656E-4|
|                 0.0|5.002501250625312E-4|
|                 0.0|0.001000500250125...|
|                 0.0|0.001250625312656...|
|2.562131693569049...|0.001500750375187...|
|2.562131693569049...|0.001750875437718...|
|2.562131693569049...|0.002001000500250125|
|2.562131693569049...|0.002251125562781...|
|2.562131693569049...|0.002501250625312...|
|2.562131693569049...|0.002751375687843922|
|5.124263387138099E-4|0.002751375687843922|
|7.686395080707148E-4|0.003001500750375...|
|7.686395080707148E-4|0.003251625812906...|
|7.686395080707148E-4|0.003501750875437719|
|7.686395080707148E-4|0.003751875937968...|
|0.001024852677427...|0.003751875937968...|
|0.001024852677427...|0.004252126063031516|
|0.001024852677427...|0.004502251125562781|
|0.001281065846784...|0.00475237

In [19]:
lr_summary.recallByThreshold.show()

+------------------+--------------------+
|         threshold|              recall|
+------------------+--------------------+
|0.8687250425167268|2.501250625312656E-4|
|0.8678368087907256|5.002501250625312E-4|
| 0.866020437205675|0.001000500250125...|
|0.8566200353754436|0.001250625312656...|
|0.8527087664776976|0.001500750375187...|
| 0.851703859873189|0.001750875437718...|
|0.8513824008857376|0.002001000500250125|
|0.8507202476363105|0.002251125562781...|
|0.8497041654463837|0.002501250625312...|
|0.8493791383329399|0.002751375687843922|
|0.8476823285599637|0.002751375687843922|
|0.8446217297054671|0.003001500750375...|
|0.8435717703051641|0.003251625812906...|
|0.8427279013125774|0.003501750875437719|
|0.8425441915289432|0.003751875937968...|
|0.8404441224901129|0.003751875937968...|
|0.8393713018775975|0.004252126063031516|
|0.8390281623615576|0.004502251125562781|
|0.8383214186248691|0.004752376188094047|
|0.8368902767434918|0.005502751375687844|
+------------------+--------------

In [20]:
lr_summary.pr.show()

+--------------------+------------------+
|              recall|         precision|
+--------------------+------------------+
|                 0.0|               1.0|
|2.501250625312656E-4|               1.0|
|5.002501250625312E-4|               1.0|
|0.001000500250125...|               1.0|
|0.001250625312656...|               1.0|
|0.001500750375187...|0.8571428571428571|
|0.001750875437718...|             0.875|
|0.002001000500250125|0.8888888888888888|
|0.002251125562781...|               0.9|
|0.002501250625312...|0.9090909090909091|
|0.002751375687843922|0.9166666666666666|
|0.002751375687843922|0.8461538461538461|
|0.003001500750375...|               0.8|
|0.003251625812906...|            0.8125|
|0.003501750875437719|0.8235294117647058|
|0.003751875937968...|0.8333333333333334|
|0.003751875937968...|0.7894736842105263|
|0.004252126063031516|0.8095238095238095|
|0.004502251125562781|0.8181818181818182|
|0.004752376188094047|0.7916666666666666|
+--------------------+------------

In [21]:
# Step 6: Evaluate Performance on Test Data
model_predictions = lr_model.transform(test)
model_predictions.columns

['features', 'label', 'rawPrediction', 'probability', 'prediction']

In [22]:
model_predictions.select(['label', 'probability', 'prediction']).show(10, False)

+-----+----------------------------------------+----------+
|label|probability                             |prediction|
+-----+----------------------------------------+----------+
|1    |[0.5038700430854873,0.49612995691451267]|0.0       |
|0    |[0.4881273558769128,0.5118726441230872] |1.0       |
|1    |[0.4881273558769128,0.5118726441230872] |1.0       |
|1    |[0.48419414262850874,0.5158058573714912]|1.0       |
|1    |[0.48026288651646115,0.5197371134835389]|1.0       |
|1    |[0.4763340729553839,0.5236659270446161] |1.0       |
|1    |[0.4763340729553839,0.5236659270446161] |1.0       |
|0    |[0.46848570886635427,0.5315142911336457]|1.0       |
|1    |[0.4645671221769144,0.5354328778230856] |1.0       |
|1    |[0.4645671221769144,0.5354328778230856] |1.0       |
+-----+----------------------------------------+----------+
only showing top 10 rows



In [23]:
# The prediction column shows the model prediction for each of the records in the test data. The probability column shows the values for both classes (0 & 1).
# The probability at 0th index is of 0; the other is for a prediction of 1. The evaluation of the logistic regression model on test data can be done using BinaryClassEvaluator.
# We can get the area under ROC and that under the PR curve

from pyspark.ml.evaluation import BinaryClassificationEvaluator
lr_evaluator = BinaryClassificationEvaluator(metricName = 'areaUnderROC')
lr_auroc = lr_evaluator.evaluate(model_predictions)
print(f'The auroc value of Logistic Regression Model is {lr_auroc}')

The auroc value of Logistic Regression Model is 0.7279289462373956


In [24]:
lr_evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
lr_aupr = lr_evaluator.evaluate(model_predictions)
print(f'The aupr value of Logistic Regression Model is {lr_aupr}')

The aupr value of Logistic Regression Model is 0.7122810224807343


In [25]:
true_pos = model_predictions.filter(model_predictions['label']==1).filter(model_predictions['prediction']==1).count()

In [26]:
actual_pos = model_predictions.filter(model_predictions['label']==1).count()
pred_pos = model_predictions.filter(model_predictions['prediction']==1).count()
# print(pred_pos)
#Recall
float(true_pos)/(actual_pos)

0.6622773044151821

In [27]:
# Precision
float(true_pos)/(pred_pos)

0.6956875508543532

# Decision Tree Classifier
DT can be used for classification as well as regression.
We are building a DT with default hyperparameters and use it to predict whether the use will opt for the new term deposit plan.



In [28]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt_model = dt.fit(train)
model_predictions = dt_model.transform(test)

In [29]:
model_predictions.select(['label','probability', 'prediction']).show(10,False)

+-----+----------------------------------------+----------+
|label|probability                             |prediction|
+-----+----------------------------------------+----------+
|1    |[0.1912568306010929,0.8087431693989071] |1.0       |
|0    |[0.1912568306010929,0.8087431693989071] |1.0       |
|1    |[0.1912568306010929,0.8087431693989071] |1.0       |
|1    |[0.40208667736757625,0.5979133226324238]|1.0       |
|1    |[0.40208667736757625,0.5979133226324238]|1.0       |
|1    |[0.40208667736757625,0.5979133226324238]|1.0       |
|1    |[0.40208667736757625,0.5979133226324238]|1.0       |
|0    |[0.40208667736757625,0.5979133226324238]|1.0       |
|1    |[0.40208667736757625,0.5979133226324238]|1.0       |
|1    |[0.40208667736757625,0.5979133226324238]|1.0       |
+-----+----------------------------------------+----------+
only showing top 10 rows



In [30]:
# Evaluate Performance on Test Data
dt_evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
dt_auroc = dt_evaluator.evaluate(model_predictions)
print(f'The auc value of Decision Tree Classifier Model is {dt_auroc}')

The auc value of Decision Tree Classifier Model is 0.5596008901752911


In [31]:
dt_evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
dt_aupr = dt_evaluator.evaluate(model_predictions)
print(f'The aupr value of Decision Tree Model is {dt_aupr}')

The aupr value of Decision Tree Model is 0.6119662227113352


In [32]:
true_pos=model_predictions.filter(model_predictions['label']==1).filter(model_predictions['prediction']==1).count()
actual_pos=model_predictions.filter(model_predictions['label']==1).count()
pred_pos=model_predictions.filter(model_predictions['prediction']==1).count()
float(true_pos)/(actual_pos)

0.627420604182804

In [33]:
float(true_pos)/(pred_pos)

0.7212822796081924

# Support Vector Machines Classifiers

In [34]:
# SVMs are used for classification tasks, as they find the hyperplane that maximizes the margin (perpendicular distance) between 2 classes.
# All the instances and target classes are represented as vectors in high-dimensional space, and the SVM finds the closest 2 points from the 2 classes that support the best separating line or hyperplane
# For nonlinearly separable data, there are different kernel tricks to separate the classes. In our example, we will build a linearly separable support vector classifier with default hyperparameters.

# Step 1: Build and train SVM Model
from pyspark.ml.classification import LinearSVC
lsvc = LinearSVC()
lsvc_model = lsvc.fit(train)
model_predictions = lsvc_model.transform(test)
model_predictions.columns

['features', 'label', 'rawPrediction', 'prediction']

In [35]:
model_predictions.select(['label','prediction']).show(10,False)

+-----+----------+
|label|prediction|
+-----+----------+
|1    |1.0       |
|0    |1.0       |
|1    |1.0       |
|1    |1.0       |
|1    |1.0       |
|1    |1.0       |
|1    |1.0       |
|0    |1.0       |
|1    |1.0       |
|1    |1.0       |
+-----+----------+
only showing top 10 rows



In [36]:
# Step 2: Evaluate performance on Test Data
svc_evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
svc_auroc = svc_evaluator.evaluate(model_predictions)
print(f'The auc value of SupportVectorClassifier is {svc_auroc}')

The auc value of SupportVectorClassifier is 0.6911995027062203


In [37]:
svc_evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
svc_aupr =svc_evaluator.evaluate(model_predictions)
print(f'The aupr value of SupportVectorClassifier Model is {svc_aupr}')

The aupr value of SupportVectorClassifier Model is 0.6747712879457272


In [38]:
true_pos=model_predictions.filter(model_predictions['label']==1).filter(model_predictions['prediction']==1).count()
actual_pos=model_predictions.filter(model_predictions['label']==1).count()
pred_pos=model_predictions.filter(model_predictions['prediction']==1).count()
float(true_pos)/(actual_pos)

0.6491092176607282

In [39]:
float(true_pos)/(pred_pos)

0.7053872053872053

# Naive Bayes (NB)
classifiers work on the principle of conditional probability and assume absolute independence between predictors.
An NB classifier doesn't have many hyperparameters and can outperform some of the most sophisticated algorithms out there.
We will build an NB classifier and evaluate its performance on the test data.

In [40]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes()
nb_model = nb.fit(train)
model_predictions = nb_model.transform(test)
model_predictions.select(['label','probability', 'prediction']).show(10,False)

+-----+----------------------------------------+----------+
|label|probability                             |prediction|
+-----+----------------------------------------+----------+
|1    |[0.5274075345725382,0.4725924654274618] |0.0       |
|0    |[0.5154264060557162,0.48457359394428373]|0.0       |
|1    |[0.5154264060557162,0.48457359394428373]|0.0       |
|1    |[0.5124278106404231,0.4875721893595768] |0.0       |
|1    |[0.5094283204408921,0.49057167955910785]|0.0       |
|1    |[0.5064281512379302,0.49357184876207]   |0.0       |
|1    |[0.5064281512379302,0.49357184876207]   |0.0       |
|0    |[0.5004266398605586,0.49957336013944137]|0.0       |
|1    |[0.497425729976803,0.502574270023197]   |1.0       |
|1    |[0.497425729976803,0.502574270023197]   |1.0       |
+-----+----------------------------------------+----------+
only showing top 10 rows



In [41]:
# Evaluate Performance on Test Data
nb_evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
nb_auroc = nb_evaluator.evaluate(model_predictions)
print(f'The auc value of NB Classifier is {nb_auroc}')

The auc value of NB Classifier is 0.6518500397009442


In [42]:
nb_evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
nb_aupr =nb_evaluator.evaluate(model_predictions)
print(f'The aupr value of NB Classifier Model is {nb_aupr}')

The aupr value of NB Classifier Model is 0.6088168824007848


In [43]:
true_pos=model_predictions.filter(model_predictions['label']==1).filter(model_predictions['prediction']==1).count()
actual_pos=model_predictions.filter(model_predictions['label']==1).count()
pred_pos=model_predictions.filter(model_predictions['prediction']==1).count()

In [44]:
float(true_pos)/(actual_pos)

0.691711851278079

In [45]:
float(true_pos)/(pred_pos)

0.6532553035844916

GRADIENT BOOSTED TREE CLASSIFIER

In [46]:
# So far, we have used single algorithms for classification. Now we move on to use ensemble methods, such as GBT and random forests, for classification.
# Bagging and boosting for classification works according to similar principles as regression.
# Build and Train the GBT Model
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier()
gbt_model = gbt.fit(train)
model_predictions = gbt_model.transform(test)
model_predictions.select(['label','probability','prediction']).show(10,False)

+-----+----------------------------------------+----------+
|label|probability                             |prediction|
+-----+----------------------------------------+----------+
|1    |[0.18604586314784055,0.8139541368521594]|1.0       |
|0    |[0.418478264770028,0.581521735229972]   |1.0       |
|1    |[0.418478264770028,0.581521735229972]   |1.0       |
|1    |[0.4611895047763267,0.5388104952236733] |1.0       |
|1    |[0.4611895047763267,0.5388104952236733] |1.0       |
|1    |[0.4585978286203409,0.5414021713796591] |1.0       |
|1    |[0.4585978286203409,0.5414021713796591] |1.0       |
|0    |[0.49489768625908065,0.5051023137409194]|1.0       |
|1    |[0.5172297703699451,0.4827702296300549] |0.0       |
|1    |[0.5172297703699451,0.4827702296300549] |0.0       |
+-----+----------------------------------------+----------+
only showing top 10 rows



In [47]:
# Evaluate Performance on Test Data
gbt_evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
gbt_auroc = gbt_evaluator.evaluate(model_predictions)
print(f'The auc value of GradientBoostedTreesClassifier is {gbt_auroc}')

The auc value of GradientBoostedTreesClassifier is 0.736129558627489


In [48]:
gbt_evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
gbt_aupr = gbt_evaluator.evaluate(model_predictions)
print(f'The aupr value of GradientBoostedTreesClassifier Model is {gbt_aupr}')

The aupr value of GradientBoostedTreesClassifier Model is 0.7302461815267897


In [49]:
true_pos=model_predictions.filter(model_predictions['label']==1).filter(model_predictions['prediction']==1).count()
actual_pos=model_predictions.filter(model_predictions['label']==1).count()
pred_pos=model_predictions.filter(model_predictions['prediction']==1).count()

In [50]:
float(true_pos)/(actual_pos)

0.6297443841982959

In [51]:
float(true_pos)/(pred_pos)

0.7252453166815344

RANDOM FOREST CLASSIFIER

In [52]:
# A random forest classifier is a collection of multiple decision tree classifiers. It works on the voting mechanism and predicts the output class that received the maximum votes
# from all individual decision trees. Let's build a random forest classifier with the same data.
# Build and Train the RF Model
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(numTrees=50,maxDepth=30)
rf_model = rf.fit(train)
model_predictions=rf_model.transform(test)
model_predictions.select(['label','probability','prediction']).show(10,False)

+-----+----------------------------------------+----------+
|label|probability                             |prediction|
+-----+----------------------------------------+----------+
|1    |[0.29921126525714165,0.7007887347428584]|1.0       |
|0    |[0.37556289232739,0.62443710767261]     |1.0       |
|1    |[0.37556289232739,0.62443710767261]     |1.0       |
|1    |[0.49054549329879504,0.509454506701205] |1.0       |
|1    |[0.5097428210123173,0.4902571789876826] |0.0       |
|1    |[0.5097428210123173,0.4902571789876826] |0.0       |
|1    |[0.5097428210123173,0.4902571789876826] |0.0       |
|0    |[0.5120652754639928,0.48793472453600706]|0.0       |
|1    |[0.5186683516408506,0.4813316483591494] |0.0       |
|1    |[0.5186683516408506,0.4813316483591494] |0.0       |
+-----+----------------------------------------+----------+
only showing top 10 rows



In [53]:
# Evaluate Performance on Test Data
rf_evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
rf_auroc = rf_evaluator.evaluate(model_predictions)
print(f'The auc value of RandomForestClassifier Model is {rf_auroc}')

The auc value of RandomForestClassifier Model is 0.7359085931296735


In [54]:
rf_evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')

In [55]:
rf_aupr = rf_evaluator.evaluate(model_predictions)

In [56]:
print(f'The aupr value of RandomForestClassifier Model is {rf_aupr}')

The aupr value of RandomForestClassifier Model is 0.7347958358890551


In [57]:
true_pos=model_predictions.filter(model_predictions['label']==1).filter(model_predictions['prediction']==1).count()
actual_pos=model_predictions.filter(model_predictions['label']==1).count()
pred_pos=model_predictions.filter(model_predictions['prediction']==1).count()

In [58]:
float(true_pos)/(actual_pos)

0.6227730441518203

In [59]:
float(true_pos)/(pred_pos)

0.7236723672367237

# We have used default hyperparameters for all the models but will rarely perform to their best. Therefore, we must tune them for the right combination of hyperparameters.

Hyperparameter Tuning and Cross-Validation by considering the RF model that was just built.

In [63]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
rf = RandomForestClassifier()
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [5,10,20,25,30])
             .addGrid(rf.maxBins, [20, 60])
             .addGrid(rf.numTrees, [5, 20,50,100])
             .build())
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=rf_evaluator, numFolds=5)

In [64]:
cv_model = cv.fit(train)
best_rf_model = cv_model.bestModel

In [65]:
model_predictions = best_rf_model.transform(test)
rf_evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
rf_auroc = rf_evaluator.evaluate(model_predictions)
print(rf_auroc)

0.7414239283780406


In [None]:
# By using the best hyperparameters for our random forest model, the AUC score has increased from 0.7359 to 0.7414

Different types of supervised learning have been experimented and ways to solve binary classification with multiple ML algorithms. Hyperparamters and cross-validation techniques to build the best possble model.