# Final Project Task 10 - RandomForest classification
- Read in the parquet file you created as part of Task 3.

- Convert the parquet file to CSV format.

- Load the CSV file into a dataframe

- Create a 80-20 training and test split with seed=1.

- Train a Random Forest model with different hyperparameters listed below and report the best performing hyperparameter combinations.

### Read in the parquet file you created as part of Task 3.
Converts a parquet file to CSV file with header using ApacheSpark

In [1]:
%%bash
export version=`python --version |awk '{print $2}' |awk -F"." '{print $1$2}'`

echo $version

if [ $version == '36' ] || [ $version == '37' ]; then
    echo 'Starting installation...'
    pip3 install pyspark==2.4.8 wget==3.2 pyspark2pmml==0.5.1 > install.log 2> install.log
    if [ $? == 0 ]; then
        echo 'Please <<RESTART YOUR KERNEL>> (Kernel->Restart Kernel and Clear All Outputs)'
    else
        echo 'Installation failed, please check log:'
        cat install.log
    fi
elif [ $version == '38' ] || [ $version == '39' ]; then
    pip3 install pyspark==3.1.2 wget==3.2 pyspark2pmml==0.5.1 > install.log 2> install.log
    if [ $? == 0 ]; then
        echo 'Please <<RESTART YOUR KERNEL>> (Kernel->Restart Kernel and Clear All Outputs)'
    else
        echo 'Installation failed, please check log:'
        cat install.log
    fi
else
    echo 'Currently only python 3.6, 3.7 , 3.8 and 3.9 are supported, in case you need a different version please open an issue at https://github.com/IBM/claimed/issues'
    exit -1
fi

37
Starting installation...
Please <<RESTART YOUR KERNEL>> (Kernel->Restart Kernel and Clear All Outputs)


In [2]:
# @param data_dir temporal data storage for local execution
# @param data_csv csv path and file name (default: data.csv)
# @param data_parquet path and parquet file name (default: data.parquet)
# @param master url of master (default: local mode)

In [3]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os
import shutil
import glob

In [15]:
data_csv = os.environ.get('data_csv', 'data.csv')
data_parquet = os.environ.get('data_parquet', 'data.parquet')
master = os.environ.get('master', "local[*]")
data_dir = os.environ.get('data_dir', '../../data/')

In [7]:
skip = False
if os.path.exists(data_dir + data_csv):
    skip = True

In [8]:
if not skip:
    sc = SparkContext.getOrCreate(SparkConf().setMaster(master))
    spark = SparkSession.builder.getOrCreate()

21/12/18 12:34:49 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/18 12:34:51 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
21/12/18 12:34:51 WARN util.Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
21/12/18 12:34:51 WARN util.Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [9]:
if not skip:
    df = spark.read.parquet(data_dir + data_parquet)

                                                                                

### Convert the parquet file to CSV format.

In [10]:
if not skip:
    if os.path.exists(data_dir + data_csv):
        shutil.rmtree(data_dir + data_csv)
    df.coalesce(1).write.option("header", "true").csv(data_dir + data_csv)
    file = glob.glob(data_dir + data_csv + '/part-*')
    shutil.move(file[0], data_dir + data_csv + '.tmp')
    shutil.rmtree(data_dir + data_csv)
    shutil.move(data_dir + data_csv + '.tmp', data_dir + data_csv)

                                                                                

### Load the CSV file into a dataframe

In [41]:
from pyspark import SparkContext, SparkConf, SQLContext
import os
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark2pmml import PMMLBuilder
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler
import logging
import shutil
#import sitexv
import sys
import wget
import re

In [12]:
if sys.version[0:3] == '3.9':
    url = ('https://github.com/jpmml/jpmml-sparkml/releases/download/1.7.2/'
           'jpmml-sparkml-executable-1.7.2.jar')
    wget.download(url)
    shutil.copy('jpmml-sparkml-executable-1.7.2.jar',
                site.getsitepackages()[0] + '/pyspark/jars/')
elif sys.version[0:3] == '3.8':
    url = ('https://github.com/jpmml/jpmml-sparkml/releases/download/1.7.2/'
           'jpmml-sparkml-executable-1.7.2.jar')
    wget.download(url)
    shutil.copy('jpmml-sparkml-executable-1.7.2.jar',
                site.getsitepackages()[0] + '/pyspark/jars/')
elif sys.version[0:3] == '3.7':
    url = ('https://github.com/jpmml/jpmml-sparkml/releases/download/1.5.12/'
           'jpmml-sparkml-executable-1.5.12.jar')
    wget.download(url)
elif sys.version[0:3] == '3.6':
    url = ('https://github.com/jpmml/jpmml-sparkml/releases/download/1.5.12/'
           'jpmml-sparkml-executable-1.5.12.jar')
    wget.download(url)
else:
    raise Exception('Currently only python 3.6 , 3.7, 3,8 and 3.9 is supported, in case '
                    'you need a different version please open an issue at '
                    'https://github.com/IBM/claimed/issues')

In [14]:
master = os.environ.get('master',
                        "local[*]")  # URL to Spark master
model_target = os.environ.get('model_target',
                              "model.xml")  # model output file name
data_dir = os.environ.get('data_dir',
                          '../../data/')  # temporary directory for data
input_columns = os.environ.get('input_columns',
                               '["x", "y", "z"]')  # input columns to consider

In [17]:
parameters = list(
    map(lambda s: re.sub('$', '"', s),
        map(
            lambda s: s.replace('=', '="'),
            filter(
                lambda s: s.find('=') > -1 and bool(re.match(r'[A-Za-z0-9_]*=[.\/A-Za-z0-9]*', s)),
                sys.argv
            )
    )))

for parameter in parameters:
    logging.warning('Parameter: ' + parameter)
    exec(parameter)

In [18]:
conf = SparkConf().setMaster(master)
#if sys.version[0:3] == '3.6' or sys.version[0:3] == '3.7':
conf.set("spark.jars", 'jpmml-sparkml-executable-1.5.12.jar')

sc = SparkContext.getOrCreate(conf)
sqlContext = SQLContext(sc)
spark = sqlContext.sparkSession

In [30]:
df = spark.read.option("delimiter", ",").option("header", "true").csv(data_dir + data_csv)

In [31]:
df.show()

+---+---+---+--------------------+--------+
|  x|  y|  z|              source|   class|
+---+---+---+--------------------+--------+
| 33| 36| 51|Accelerometer-201...|Eat_meat|
| 33| 36| 51|Accelerometer-201...|Eat_meat|
| 33| 35| 53|Accelerometer-201...|Eat_meat|
| 31| 37| 52|Accelerometer-201...|Eat_meat|
| 32| 36| 52|Accelerometer-201...|Eat_meat|
| 32| 36| 51|Accelerometer-201...|Eat_meat|
| 32| 36| 51|Accelerometer-201...|Eat_meat|
| 33| 36| 53|Accelerometer-201...|Eat_meat|
| 33| 35| 52|Accelerometer-201...|Eat_meat|
| 33| 36| 52|Accelerometer-201...|Eat_meat|
| 32| 35| 53|Accelerometer-201...|Eat_meat|
| 33| 36| 52|Accelerometer-201...|Eat_meat|
| 32| 38| 53|Accelerometer-201...|Eat_meat|
| 32| 37| 52|Accelerometer-201...|Eat_meat|
| 33| 35| 52|Accelerometer-201...|Eat_meat|
| 32| 36| 53|Accelerometer-201...|Eat_meat|
| 32| 36| 53|Accelerometer-201...|Eat_meat|
| 32| 36| 52|Accelerometer-201...|Eat_meat|
| 34| 36| 52|Accelerometer-201...|Eat_meat|
| 33| 36| 52|Accelerometer-201..

### Create a 80-20 training and test split with seed=1

In [32]:
# register a corresponding query table
df.createOrReplaceTempView('df')

In [33]:
from pyspark.sql.types import DoubleType
df = df.withColumn("x", df.x.cast(DoubleType()))
df = df.withColumn("y", df.y.cast(DoubleType()))
df = df.withColumn("z", df.z.cast(DoubleType()))

In [34]:
splits = df.randomSplit([0.8, 0.2])
df_train = splits[0]
df_test = splits[1]

In [35]:
indexer = StringIndexer(inputCol="class", outputCol="label")

vectorAssembler = VectorAssembler(inputCols=eval(input_columns),
                                  outputCol="features")

normalizer = MinMaxScaler(inputCol="features", outputCol="features_norm")

In [36]:
df.select("class").groupby("class").count().show()



+--------------+-----+
|         class|count|
+--------------+-----+
| Use_telephone|15225|
| Standup_chair|25417|
|      Eat_meat|31236|
|     Getup_bed|45801|
|   Drink_glass|42792|
|    Pour_water|41673|
|     Comb_hair|23504|
|          Walk|92254|
|  Climb_stairs|40258|
| Sitdown_chair|25036|
|   Liedown_bed|11446|
|Descend_stairs|15375|
|   Brush_teeth|29829|
|      Eat_soup| 6683|
+--------------+-----+



                                                                                

In [37]:
df_train.select("class").groupby("class").count().show()

                                                                                

+--------------+-----+
|         class|count|
+--------------+-----+
| Use_telephone|12209|
| Standup_chair|20430|
|      Eat_meat|25049|
|     Getup_bed|36603|
|   Drink_glass|34298|
|    Pour_water|33468|
|     Comb_hair|18739|
|          Walk|73652|
|  Climb_stairs|32253|
| Sitdown_chair|20008|
|   Liedown_bed| 9099|
|Descend_stairs|12369|
|   Brush_teeth|23883|
|      Eat_soup| 5322|
+--------------+-----+



                                                                                

In [38]:
df_test.select("class").groupby("class").count().show()

                                                                                

+--------------+-----+
|         class|count|
+--------------+-----+
| Use_telephone| 3016|
| Standup_chair| 4987|
|      Eat_meat| 6187|
|     Getup_bed| 9198|
|   Drink_glass| 8494|
|    Pour_water| 8205|
|     Comb_hair| 4765|
|          Walk|18602|
|  Climb_stairs| 8005|
| Sitdown_chair| 5028|
|   Liedown_bed| 2347|
|Descend_stairs| 3006|
|   Brush_teeth| 5946|
|      Eat_soup| 1361|
+--------------+-----+



### Train a Random Forest model with different hyperparameters listed below and report the best performing hyperparameter combinations.
### Use the accuracy metric when evaluating the model with different hyperparameters 

In [55]:
# Defining hyperparameters values
numTrees = [10, 20]
maxDepth = [5, 7]

In [56]:
# empty dictionary where hyperparameter and sccuracy combinations (27 in total) will be stored on every iteration
dict_hyper = {"numTrees" : [], "maxDepth" : [], "accuracy" : []}

In [57]:
iteration = 1

for i in numTrees:
    for j in maxDepth:
        
        # Printing current hyperparameters values
        print("Combination " + str(iteration))
        print("numTrees: " + str(i))
        print("maxDepth: " + str(j))
        
        # Storing current hyperparameters values
        dict_hyper["numTrees"].append(i)
        dict_hyper["maxDepth"].append(j)
        
        # Defining model, pipeline and getting accuracy
        rf = RandomForestClassifier(numTrees=i, maxDepth=j, labelCol="label", seed=1)
        pipeline = Pipeline(stages=[indexer, vectorAssembler, normalizer, rf])
        model = pipeline.fit(df_train)
        prediction = model.transform(df_train)
        binEval = MulticlassClassificationEvaluator(). \
            setMetricName("accuracy"). \
            setPredictionCol("prediction"). \
            setLabelCol("label")
        acc_temp = binEval.evaluate(prediction)
        
        # Appending accuracy result
        dict_hyper["accuracy"].append(acc_temp)
        print("accuracy: " + str(acc_temp))
        iteration += 1
        print()
        print()

Combination 1
numTrees: 10
maxDepth: 5


                                                                                

accuracy: 0.43895327688579727


Combination 2
numTrees: 10
maxDepth: 7


                                                                                

accuracy: 0.4615341567286545


Combination 3
numTrees: 20
maxDepth: 5


                                                                                

accuracy: 0.44216552596381464


Combination 4
numTrees: 20
maxDepth: 7




accuracy: 0.46758650407687014




                                                                                

In [58]:
import pandas as pd

In [59]:
# Creating DataFrame using pandas
df2 = pd.DataFrame.from_dict(dict_hyper)

In [60]:
# Converting pandas DataFrame to spark DataFrame
df2 = spark.createDataFrame(df2) 

In [61]:
df2.show()

[Stage 252:=====>                                                 (1 + 10) / 11]

+--------+--------+-------------------+
|numTrees|maxDepth|           accuracy|
+--------+--------+-------------------+
|      10|       5|0.43895327688579727|
|      10|       7| 0.4615341567286545|
|      20|       5|0.44216552596381464|
|      20|       7|0.46758650407687014|
+--------+--------+-------------------+



                                                                                

In [62]:
from pyspark.sql.functions import desc

In [63]:
df2.sort(desc('accuracy')).show(1)

[Stage 253:>                                                      (0 + 16) / 16]

+--------+--------+-------------------+
|numTrees|maxDepth|           accuracy|
+--------+--------+-------------------+
|      20|       7|0.46758650407687014|
+--------+--------+-------------------+
only showing top 1 row



                                                                                