<a href="https://colab.research.google.com/github/ralsouza/apache_spark_real_time_analytics/blob/master/notebooks/09_pyspark_mllib_random_forest_with_dimensionality_reduction_and_string_indexer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spark MLLib - Classification - Random Forest
Description:
*   One of the most popular;
*   It's an Ensemble Method algorithm;
*   The Random Forest algorithm builds many models (decision trees), and each model predicts the outcome individually. The final prediction is then chosen by a vote among the models;

Advantages:
*   Usually offers the best performance 
*   Efficient with many predictor variables
*   Works well when parallelized
*   Excellent with missing values

Disadvantages:
* Slower
* Bias can occur frequently

Application:
* Scientific research;
* Medical diagnosis;





# Setup

In [None]:
!apt-get update

In [None]:
# Install the dependencies
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [None]:
# Environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"

In [None]:
# Make pyspark "importable"
import findspark
findspark.init('spark-2.4.4-bin-hadoop2.7')

In [None]:
# Libraries and Context Setup
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [None]:
# Build the Spark configuration: serve the Spark UI on port 4050
conf = SparkConf().set("spark.ui.port", "4050")

# Create the Spark context from that configuration
sc = pyspark.SparkContext(conf=conf)


# Get or create the Spark session (local master, app name 'spark_ml_lib')
spark = SparkSession.builder.master('local').appName('spark_ml_lib').getOrCreate()

# Create the SQL context on top of the Spark context
sqlContext = pyspark.SQLContext(sc)

# Business Problem
### Classify customers according to the possibility of paying the credit or not.

# Libraries

In [None]:
import math
from pyspark.ml.linalg         import Vectors
from pyspark.sql               import Row
from pyspark.ml.feature        import StringIndexer
from pyspark.ml.feature        import PCA
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation     import MulticlassClassificationEvaluator

In [None]:
# SparkSession handle used below to build Spark DataFrames.
# NOTE(review): getOrCreate() presumably returns the session already created
# during setup (bound to `spark`) instead of a new one, in which case the
# 'spark_mllib_app' name has no effect — confirm.
sp_session = SparkSession.builder.master('local').appName('spark_mllib_app').getOrCreate()

In [None]:
rdd_bank = sc.textFile('/content/drive/My Drive/Colab Notebooks/08-apache-spark/data/mllib/bank.csv')

In [None]:
rdd_bank.cache()

/content/drive/My Drive/Colab Notebooks/08-apache-spark/data/mllib/bank.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [None]:
rdd_bank.count()

542

In [None]:
rdd_bank.take(5)

['"age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"',
 '30;"unemployed";"married";"primary";"no";1787;"no";"no";"cellular";19;"oct";79;1;-1;0;"unknown";"no"',
 '33;"services";"married";"secondary";"no";4789;"yes";"yes";"cellular";11;"may";220;1;339;4;"failure";"yes"',
 '35;"management";"single";"tertiary";"no";1350;"yes";"no";"cellular";16;"apr";185;1;330;1;"failure";"yes"',
 '30;"management";"married";"tertiary";"no";1476;"yes";"yes";"unknown";3;"jun";199;4;-1;0;"unknown";"yes"']

# Data Cleansing

In [None]:
# Removing the header by the first row
first_row = rdd_bank.first()

In [None]:
rdd_bank2 = rdd_bank.filter(lambda x: x != first_row)

In [None]:
rdd_bank2.count()

541

In [None]:
# Transform string values to numeric values
def transform_to_numeric(input_str):
  """Convert one raw ';'-separated record into a Row of float features.

  Double quotes are stripped from the line before it is split on ';'.
  AGE and BALANCE are parsed as floats. Marital status and education are
  expanded into one 0.0/1.0 indicator per category. OUTCOME, DEFAULT and
  LOAN are 0.0 when the field is exactly 'no' and 1.0 for any other value.

  Args:
    input_str: a single data line of the bank file (not the header line).

  Returns:
    A Row with the fields OUTCOME, AGE, SINGLE, MARRIED, DIVORCED, PRIMARY,
    SECONDARY, TERTIARY, DEFAULT, BALANCE and LOAN.
  """
  fields = input_str.replace("\"","").split(';')

  def is_value(position, expected):
    # Indicator: 1.0 when the field equals `expected`, otherwise 0.0
    return 1.0 if fields[position] == expected else 0.0

  def unless_no(position):
    # Binary flag: 0.0 only for the literal 'no', 1.0 for anything else
    return 0.0 if fields[position] == 'no' else 1.0

  # Positions follow the file header: 0=age, 2=marital, 3=education,
  # 4=default, 5=balance, 7=loan, 16=y (the target)
  age = float(fields[0])
  outcome = unless_no(16)

  return Row(
      OUTCOME=outcome,
      AGE=age,
      SINGLE=is_value(2, 'single'),
      MARRIED=is_value(2, 'married'),
      DIVORCED=is_value(2, 'divorced'),
      PRIMARY=is_value(3, 'primary'),
      SECONDARY=is_value(3, 'secondary'),
      TERTIARY=is_value(3, 'tertiary'),
      DEFAULT=unless_no(4),
      BALANCE=float(fields[5]),
      LOAN=unless_no(7),
  )

In [None]:
# Apply the function
rdd_bank3 = rdd_bank2.map(transform_to_numeric)

In [None]:
rdd_bank3.collect()[:15]

[Row(AGE=30.0, BALANCE=1787.0, DEFAULT=0.0, DIVORCED=0.0, LOAN=0.0, MARRIED=1.0, OUTCOME=0.0, PRIMARY=1.0, SECONDARY=0.0, SINGLE=0.0, TERTIARY=0.0),
 Row(AGE=33.0, BALANCE=4789.0, DEFAULT=0.0, DIVORCED=0.0, LOAN=1.0, MARRIED=1.0, OUTCOME=1.0, PRIMARY=0.0, SECONDARY=1.0, SINGLE=0.0, TERTIARY=0.0),
 Row(AGE=35.0, BALANCE=1350.0, DEFAULT=0.0, DIVORCED=0.0, LOAN=0.0, MARRIED=0.0, OUTCOME=1.0, PRIMARY=0.0, SECONDARY=0.0, SINGLE=1.0, TERTIARY=1.0),
 Row(AGE=30.0, BALANCE=1476.0, DEFAULT=0.0, DIVORCED=0.0, LOAN=1.0, MARRIED=1.0, OUTCOME=1.0, PRIMARY=0.0, SECONDARY=0.0, SINGLE=0.0, TERTIARY=1.0),
 Row(AGE=59.0, BALANCE=0.0, DEFAULT=0.0, DIVORCED=0.0, LOAN=0.0, MARRIED=1.0, OUTCOME=0.0, PRIMARY=0.0, SECONDARY=1.0, SINGLE=0.0, TERTIARY=0.0),
 Row(AGE=35.0, BALANCE=747.0, DEFAULT=0.0, DIVORCED=0.0, LOAN=0.0, MARRIED=0.0, OUTCOME=1.0, PRIMARY=0.0, SECONDARY=0.0, SINGLE=1.0, TERTIARY=1.0),
 Row(AGE=36.0, BALANCE=307.0, DEFAULT=0.0, DIVORCED=0.0, LOAN=0.0, MARRIED=1.0, OUTCOME=1.0, PRIMARY=0.0, SECO

# Exploratory Data Analysis

In [None]:
# Transform to dataframe
df_bank = sp_session.createDataFrame(rdd_bank3)

In [None]:
# Descriptive analysis
df_bank.describe().show()

+-------+------------------+------------------+--------------------+-------------------+-------------------+------------------+-------------------+------------------+------------------+------------------+------------------+
|summary|               AGE|           BALANCE|             DEFAULT|           DIVORCED|               LOAN|           MARRIED|            OUTCOME|           PRIMARY|         SECONDARY|            SINGLE|          TERTIARY|
+-------+------------------+------------------+--------------------+-------------------+-------------------+------------------+-------------------+------------------+------------------+------------------+------------------+
|  count|               541|               541|                 541|                541|                541|               541|                541|               541|               541|               541|               541|
|   mean| 41.26987060998152|1444.7818853974122|0.022181146025878003|0.10905730129390019|0.16266173752310

In [None]:
# Measure the correlation of every non-string column with OUTCOME.
# Column types are read from the DataFrame schema (df_bank.dtypes yields
# (name, type) pairs in column order), so no Spark job is run per column just
# to inspect its first value, and an empty DataFrame no longer fails on
# indexing the result of take(1).
for column_name, column_type in df_bank.dtypes:
  if column_type != 'string':
    print('Correlation with OUTCOME: ', column_name, df_bank.stat.corr('OUTCOME', column_name))

Correlation with OUTCOME:  AGE -0.1823210432736525
Correlation with OUTCOME:  BALANCE 0.03657486611997681
Correlation with OUTCOME:  DEFAULT -0.04536965206737378
Correlation with OUTCOME:  DIVORCED -0.07812659940926987
Correlation with OUTCOME:  LOAN -0.030420586112717318
Correlation with OUTCOME:  MARRIED -0.3753241299133561
Correlation with OUTCOME:  OUTCOME 1.0
Correlation with OUTCOME:  PRIMARY -0.12561548832677982
Correlation with OUTCOME:  SECONDARY 0.026392774894072973
Correlation with OUTCOME:  SINGLE 0.46323284934360515
Correlation with OUTCOME:  TERTIARY 0.08494840766635618


# Data pre-processing

In [None]:
# Build a (label, features) pair for each row.
# The features are packed into a dense vector; OUTCOME is the label.
def transform_var(row):
  """Turn a row of numeric columns into a (label, dense feature vector) tuple.

  Args:
    row: a row exposing OUTCOME and the ten feature columns by name.

  Returns:
    A tuple (OUTCOME value, dense vector) whose vector holds, in order:
    AGE, BALANCE, DEFAULT, DIVORCED, LOAN, MARRIED, PRIMARY, SECONDARY,
    SINGLE, TERTIARY.
  """
  feature_columns = ('AGE', 'BALANCE', 'DEFAULT', 'DIVORCED', 'LOAN',
                     'MARRIED', 'PRIMARY', 'SECONDARY', 'SINGLE', 'TERTIARY')

  label = row['OUTCOME']
  features = Vectors.dense([row[column] for column in feature_columns])

  return (label, features)

In [None]:
rdd_bank4 = df_bank.rdd.map(transform_var)

In [None]:
rdd_bank4.collect()[:15]

[(0.0, DenseVector([30.0, 1787.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0])),
 (1.0, DenseVector([33.0, 4789.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0])),
 (1.0, DenseVector([35.0, 1350.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0])),
 (1.0, DenseVector([30.0, 1476.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0])),
 (0.0, DenseVector([59.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0])),
 (1.0, DenseVector([35.0, 747.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0])),
 (1.0, DenseVector([36.0, 307.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0])),
 (0.0, DenseVector([39.0, 147.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0])),
 (0.0, DenseVector([41.0, 221.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0])),
 (1.0, DenseVector([43.0, -88.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0])),
 (0.0, DenseVector([39.0, 9374.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0])),
 (0.0, DenseVector([43.0, 264.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0])),
 (0.0, DenseVector([36.0, 1109.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0])),
 (1.0, D

In [None]:
# Convert to dataframe again
df_bank = sp_session.createDataFrame(rdd_bank4,['label','features'])

In [None]:
# show results
df_bank.select('features','label').show(10)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[30.0,1787.0,0.0,...|  0.0|
|[33.0,4789.0,0.0,...|  1.0|
|[35.0,1350.0,0.0,...|  1.0|
|[30.0,1476.0,0.0,...|  1.0|
|[59.0,0.0,0.0,0.0...|  0.0|
|[35.0,747.0,0.0,0...|  1.0|
|[36.0,307.0,0.0,0...|  1.0|
|[39.0,147.0,0.0,0...|  0.0|
|[41.0,221.0,0.0,0...|  0.0|
|[43.0,-88.0,0.0,0...|  1.0|
+--------------------+-----+
only showing top 10 rows



## Applying PCA

In [None]:
# Configure PCA to reduce the 'features' vector to 3 components
pca_bank = PCA(k=3, inputCol='features', outputCol='pca_features')
# Fit the PCA model on the whole DataFrame
pca_model = pca_bank.fit(df_bank)
# Project the features and keep only the label and the reduced vector.
# NOTE(review): the features are not scaled before PCA, so large-magnitude
# columns such as BALANCE presumably dominate the components — confirm whether
# standardizing the features first is wanted.
pca_result = pca_model.transform(df_bank).select('label','pca_features')

In [None]:
pca_result.show(truncate=False)

+-----+------------------------------------------------------------+
|label|pca_features                                                |
+-----+------------------------------------------------------------+
|0.0  |[-1787.018897197381,28.86209683775489,-0.06459982604876296] |
|1.0  |[-4789.020177138492,29.922562636340885,-0.9830243513096447] |
|1.0  |[-1350.022213163262,34.10110809796657,0.8951427168301616]   |
|1.0  |[-1476.0189517184556,29.051333993596376,0.3952723868021922] |
|0.0  |[-0.037889185366455545,58.9897182000177,-0.729079238366194] |
|1.0  |[-747.0223377634923,34.488291981817554,0.9045654956970024]  |
|1.0  |[-307.0230691022592,35.799850539655154,0.5170631523785959]  |
|0.0  |[-147.0250121617634,38.90107856650326,-0.8069627548799431]  |
|0.0  |[-221.0262985348787,40.853633675694894,0.53730363658032]    |
|1.0  |[87.9723868768871,43.06265944115107,-0.06701642871171626]   |
|0.0  |[-9374.023105550941,32.9764588379908,-0.9511484606914545]   |
|0.0  |[-264.02755731528384,42.824

In [None]:
# Indexing is a prerequisite to decision trees:
# map the 'label' column to an 'indexed_label' column
string_indexer = StringIndexer(inputCol='label',outputCol='indexed_label')
# Fit the indexer on the PCA result
si_model = string_indexer.fit(pca_result)
# Add 'indexed_label' next to 'label' and 'pca_features'
obj_final = si_model.transform(pca_result)

In [None]:
# Show results
obj_final.collect()[:10]

[Row(label=0.0, pca_features=DenseVector([-1787.0189, 28.8621, -0.0646]), indexed_label=0.0),
 Row(label=1.0, pca_features=DenseVector([-4789.0202, 29.9226, -0.983]), indexed_label=1.0),
 Row(label=1.0, pca_features=DenseVector([-1350.0222, 34.1011, 0.8951]), indexed_label=1.0),
 Row(label=1.0, pca_features=DenseVector([-1476.019, 29.0513, 0.3953]), indexed_label=1.0),
 Row(label=0.0, pca_features=DenseVector([-0.0379, 58.9897, -0.7291]), indexed_label=0.0),
 Row(label=1.0, pca_features=DenseVector([-747.0223, 34.4883, 0.9046]), indexed_label=1.0),
 Row(label=1.0, pca_features=DenseVector([-307.0231, 35.7999, 0.5171]), indexed_label=1.0),
 Row(label=0.0, pca_features=DenseVector([-147.025, 38.9011, -0.807]), indexed_label=0.0),
 Row(label=0.0, pca_features=DenseVector([-221.0263, 40.8536, 0.5373]), indexed_label=0.0),
 Row(label=1.0, pca_features=DenseVector([87.9724, 43.0627, -0.067]), indexed_label=1.0)]

# Machine Learning

In [None]:
# Data splitting: random split with weights 0.7 (train) and 0.3 (test).
# NOTE(review): no seed is passed to randomSplit, so the split, and every
# count and metric computed from it, can differ between runs.
(train_data,test_data) = obj_final.randomSplit([0.7,0.3])

In [None]:
train_data.count()

381

In [None]:
test_data.count()

160

In [None]:
# Configure a random forest that predicts 'indexed_label' from 'pca_features'
# (all other hyperparameters are left at the library defaults)
rf_classifier = RandomForestClassifier(labelCol='indexed_label',
                                       featuresCol='pca_features')

# Train the model on the training split only
model = rf_classifier.fit(train_data)

In [None]:
# Predictions with test data
predictions = model.transform(test_data)

In [None]:
# Show predicted data
predictions.collect()[:10]

[Row(label=0.0, pca_features=DenseVector([-16873.0325, 45.1565, -1.0041]), indexed_label=0.0, rawPrediction=DenseVector([11.4637, 8.5363]), probability=DenseVector([0.5732, 0.4268]), prediction=0.0),
 Row(label=0.0, pca_features=DenseVector([-11494.0342, 49.61, -0.9162]), indexed_label=0.0, rawPrediction=DenseVector([10.7458, 9.2542]), probability=DenseVector([0.5373, 0.4627]), prediction=0.0),
 Row(label=0.0, pca_features=DenseVector([-8104.0336, 49.7873, -0.8708]), indexed_label=0.0, rawPrediction=DenseVector([13.3873, 6.6127]), probability=DenseVector([0.6694, 0.3306]), prediction=0.0),
 Row(label=0.0, pca_features=DenseVector([-7190.0255, 37.3733, 0.7344]), indexed_label=0.0, rawPrediction=DenseVector([12.0834, 7.9166]), probability=DenseVector([0.6042, 0.3958]), prediction=0.0),
 Row(label=0.0, pca_features=DenseVector([-7082.0351, 52.4544, -0.0453]), indexed_label=0.0, rawPrediction=DenseVector([13.0756, 6.9244]), probability=DenseVector([0.6538, 0.3462]), prediction=0.0),
 Row(l

In [None]:
# Accuracy evaluation: compare the 'prediction' column against 'indexed_label'
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='indexed_label',metricName='accuracy')

In [None]:
evaluator.evaluate(predictions)

0.66875

In [None]:
# Confusion matrix
predictions.groupBy('indexed_label','prediction').count().show()

+-------------+----------+-----+
|indexed_label|prediction|count|
+-------------+----------+-----+
|          1.0|       1.0|   28|
|          0.0|       1.0|   10|
|          1.0|       0.0|   43|
|          0.0|       0.0|   79|
+-------------+----------+-----+

