<a href="https://colab.research.google.com/github/ralsouza/apache_spark_real_time_analytics/blob/master/notebooks/09_pyspark_mllib_random_forest_with_dimensionality_reduction_and_string_indexer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spark MLLib - Classification - Random Forest
Description:
*   One of the most popular classification algorithms;
*   It's an Ensemble Method algorithm;
*   The Random Forest algorithm trains many decision trees, and each tree predicts the outcome individually. The final prediction is chosen by a majority vote across the trees;

Advantages:
*   Usually offers the best performance 
*   Efficient with many predictor variables
*   Works well when parallelized
*   Excellent with missing values

Disadvantages:
* Slower
* Bias can occur frequently

Application:
* Scientific research;
* Medical diagnosis;





# Setup

In [None]:
# Refresh the apt package index before installing packages (shell command run through the notebook's `!` escape)
!apt-get update

In [1]:
# Install the dependencies
# Java 8 JDK (headless); -qq and the /dev/null redirect silence the installer output
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download the Spark 2.4.4 / Hadoop 2.7 binary distribution from the Apache archive
!wget -q https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
# Unpack the archive into the current working directory
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
# findspark is used further down to make the unpacked pyspark importable
# NOTE(review): the findspark version is not pinned — consider pinning it for reproducibility
!pip install -q findspark

In [2]:
# Environment variables
import os

# Register the Java 8 JDK and the unpacked Spark distribution paths in one step
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.4-bin-hadoop2.7",
})

In [3]:
# Make pyspark "importable"
import findspark
# Called without an argument so that findspark reads the SPARK_HOME environment
# variable set in the previous cell. Passing a cwd-relative directory here would
# replace that absolute path with one that only resolves from /content.
findspark.init()

In [4]:
# Libraries and Context Setup
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [5]:
# Spark configuration: serve the Spark UI on port 4050
conf = SparkConf().set("spark.ui.port", "4050")

# Create the context, or reuse the one that is already running so this cell can
# be re-executed (constructing a second SparkContext while one is active raises)
sc = pyspark.SparkContext.getOrCreate(conf=conf)


# Instance Spark Session
spark = SparkSession.builder.master('local').appName('spark_ml_lib').getOrCreate()

# Create the SQL Context
sqlContext = pyspark.SQLContext(sc)

# Business Problem
### Classify customers according to whether or not they are likely to repay their credit.

# Libraries

In [6]:
import math
from pyspark.ml.linalg         import Vectors
from pyspark.sql               import Row
from pyspark.ml.feature        import StringIndexer
from pyspark.ml.feature        import PCA
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation     import MulticlassClassificationEvaluator

In [7]:
# Create SparkSession to work with Dataframes on the Spark
# NOTE(review): a session (`spark`) is already built in the setup cell with the same
# builder chain and getOrCreate(); this call presumably returns that existing session,
# in which case the 'spark_mllib_app' name may not take effect — confirm.
sp_session = SparkSession.builder.master('local').appName('spark_mllib_app').getOrCreate()

In [8]:
# Load the bank CSV as an RDD of raw text lines (one string per line of the file)
# NOTE(review): this absolute path presumably needs Google Drive mounted at /content/drive;
# no mount step is visible in this notebook — confirm before a fresh run.
rdd_bank = sc.textFile('/content/drive/My Drive/Colab Notebooks/08-apache-spark/data/mllib/bank.csv')

In [9]:
# Mark the RDD for caching, since several actions below (count, take, first, filter) read it
rdd_bank.cache()

/content/drive/My Drive/Colab Notebooks/08-apache-spark/data/mllib/bank.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [10]:
# Number of lines in the file, header line included
rdd_bank.count()

542

In [11]:
# Peek at the first five raw lines; the first one is the ';'-separated header
rdd_bank.take(5)

['"age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"',
 '30;"unemployed";"married";"primary";"no";1787;"no";"no";"cellular";19;"oct";79;1;-1;0;"unknown";"no"',
 '33;"services";"married";"secondary";"no";4789;"yes";"yes";"cellular";11;"may";220;1;339;4;"failure";"yes"',
 '35;"management";"single";"tertiary";"no";1350;"yes";"no";"cellular";16;"apr";185;1;330;1;"failure";"yes"',
 '30;"management";"married";"tertiary";"no";1476;"yes";"yes";"unknown";3;"jun";199;4;-1;0;"unknown";"yes"']

# Data Cleansing

In [12]:
# Removing the header by the first row
# first() returns the first line of the file, which holds the column names
first_row = rdd_bank.first()

In [13]:
# Keep every line that differs from the header line captured in first_row
rdd_bank2 = rdd_bank.filter(lambda x: x != first_row)

In [14]:
# Sanity check: expect one line fewer than the raw file now that the header is gone
rdd_bank2.count()

541

In [29]:
# Transform string values to numeric values
def transform_to_numeric(input_str):
  """Convert one raw ';'-separated data line of the bank file into a numeric Row.

  Args:
    input_str: a data line (not the header) with at least 17 fields; string
      fields may be wrapped in double quotes.

  Returns:
    A Row whose fields are all floats: OUTCOME, AGE, SINGLE, MARRIED, DIVORCED,
    SECONDARY, TERTIARY, DEFAULT, BALANCE and LOAN.

  Raises:
    IndexError: if the line has fewer than 17 fields.
    ValueError: if the age or balance field cannot be parsed as a float.
  """
  # Strip the double quotes around string fields, then split on the separator
  att_list = input_str.replace("\"","").split(';')

  age       = float( att_list[0])
  # Binary fields: 'no' becomes 0.0, any other value becomes 1.0
  outcome   = 0.0 if att_list[16] == 'no'        else 1.0
  # One flag per marital status (field 2)
  single    = 1.0 if att_list[2]  == 'single'    else 0.0
  married   = 1.0 if att_list[2]  == 'married'   else 0.0
  divorced  = 1.0 if att_list[2]  == 'divorced'  else 0.0
  # Education (field 3): only secondary and tertiary get a flag, so any other
  # level (e.g. primary) is represented by both flags being 0.0
  secondary = 1.0 if att_list[3]  == 'secondary' else 0.0
  tertiary  = 1.0 if att_list[3]  == 'tertiary'  else 0.0
  default   = 0.0 if att_list[4]  == 'no'        else 1.0
  balance   = float( att_list[5])
  loan      = 0.0 if att_list[7]  == 'no'        else 1.0

  # Create rows with transformed objects
  rows = Row(
             OUTCOME = outcome  ,AGE = age           ,SINGLE = single      ,
             MARRIED = married  ,DIVORCED = divorced ,SECONDARY = secondary,
             TERTIARY = tertiary,DEFAULT = default   ,BALANCE = balance    ,
             LOAN = loan
             )

  return rows

In [30]:
# Apply the function
# Every data line of rdd_bank2 is converted into a Row of numeric features
rdd_bank3 = rdd_bank2.map(transform_to_numeric)

In [41]:
# Preview the first 15 rows; take(15) fetches only those rows instead of
# collecting the whole RDD to the driver and slicing it afterwards
rdd_bank3.take(15)

[Row(AGE=30.0, BALANCE=1787.0, DEFAULT=0.0, DIVORCED=0.0, LOAN=0.0, MARRIED=1.0, OUTCOME=0.0, SECONDARY=0.0, SINGLE=0.0, TERTIARY=0.0),
 Row(AGE=33.0, BALANCE=4789.0, DEFAULT=0.0, DIVORCED=0.0, LOAN=1.0, MARRIED=1.0, OUTCOME=1.0, SECONDARY=1.0, SINGLE=0.0, TERTIARY=0.0),
 Row(AGE=35.0, BALANCE=1350.0, DEFAULT=0.0, DIVORCED=0.0, LOAN=0.0, MARRIED=0.0, OUTCOME=1.0, SECONDARY=0.0, SINGLE=1.0, TERTIARY=1.0),
 Row(AGE=30.0, BALANCE=1476.0, DEFAULT=0.0, DIVORCED=0.0, LOAN=1.0, MARRIED=1.0, OUTCOME=1.0, SECONDARY=0.0, SINGLE=0.0, TERTIARY=1.0),
 Row(AGE=59.0, BALANCE=0.0, DEFAULT=0.0, DIVORCED=0.0, LOAN=0.0, MARRIED=1.0, OUTCOME=0.0, SECONDARY=1.0, SINGLE=0.0, TERTIARY=0.0),
 Row(AGE=35.0, BALANCE=747.0, DEFAULT=0.0, DIVORCED=0.0, LOAN=0.0, MARRIED=0.0, OUTCOME=1.0, SECONDARY=0.0, SINGLE=1.0, TERTIARY=1.0),
 Row(AGE=36.0, BALANCE=307.0, DEFAULT=0.0, DIVORCED=0.0, LOAN=0.0, MARRIED=1.0, OUTCOME=1.0, SECONDARY=0.0, SINGLE=0.0, TERTIARY=1.0),
 Row(AGE=39.0, BALANCE=147.0, DEFAULT=0.0, DIVORCED=0

# Exploratory Data Analysis

In [42]:
# Transform to dataframe
# The column names come from the Row fields produced by transform_to_numeric
df_bank = sp_session.createDataFrame(rdd_bank3)

In [43]:
# Descriptive analysis: summary statistics (count, mean, ...) for every column
df_bank.describe().show()

+-------+------------------+------------------+--------------------+-------------------+-------------------+------------------+-------------------+------------------+------------------+------------------+
|summary|               AGE|           BALANCE|             DEFAULT|           DIVORCED|               LOAN|           MARRIED|            OUTCOME|         SECONDARY|            SINGLE|          TERTIARY|
+-------+------------------+------------------+--------------------+-------------------+-------------------+------------------+-------------------+------------------+------------------+------------------+
|  count|               541|               541|                 541|                541|                541|               541|                541|               541|               541|               541|
|   mean| 41.26987060998152|1444.7818853974122|0.022181146025878003|0.10905730129390019|0.16266173752310537|0.6155268022181146| 0.3974121996303142|0.4953789279112754|0.275415896487

In [47]:
# Measure data correlation
# The schema (df_bank.dtypes) is enough to skip string columns, so no data has
# to be fetched for the type check and an empty DataFrame does not break it.
for col_name, col_type in df_bank.dtypes:
  if col_type != 'string':
    print('Correlation with OUTCOME: ', col_name, df_bank.stat.corr('OUTCOME', col_name))

Correlation with OUTCOME:  AGE -0.1823210432736525
Correlation with OUTCOME:  BALANCE 0.036574866119976804
Correlation with OUTCOME:  DEFAULT -0.04536965206737378
Correlation with OUTCOME:  DIVORCED -0.07812659940926987
Correlation with OUTCOME:  LOAN -0.030420586112717318
Correlation with OUTCOME:  MARRIED -0.3753241299133561
Correlation with OUTCOME:  OUTCOME 1.0
Correlation with OUTCOME:  SECONDARY 0.026392774894072973
Correlation with OUTCOME:  SINGLE 0.46323284934360515
Correlation with OUTCOME:  TERTIARY 0.08494840766635618


# Data pre-processing

In [None]:
# 04:48