## Set up the dataframe.

Read the data into a Rdd.

In [1]:
import findspark
findspark.init('/usr/local/bin/spark-2.0.1')

import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master("local") \
    .appName("Word Count") \
    .getOrCreate()

In [2]:
sc = spark.sparkContext
trainRdd = sc.textFile('input/tr.csv').cache()
testRdd = sc.textFile('input/test.csv').cache()

In [3]:
from pyspark.sql.types import *

def makeDF(rdd, data_set='train'):
    #Save the row in the rdd that contains the column names, then delete it from the rdd.
    header = rdd.first()
    header_rdd = rdd.filter(lambda l: 'margin' in l)
    rdd_no_header = rdd.subtract(header_rdd)

    #Prepare the schema and rdd to build the dataframe. 
    #Species will be a string while all columns will be floats.
    #Note if data_set == 'test', the won't have a species column
    fields = [StructField(field_name, FloatType(), True) for field_name in header.split(",")]   
    if data_set == 'train':
        fields[1].dataType = StringType()
        rdd_split = rdd_no_header.map(lambda l: l.split(",")).map(lambda l: [float(x) if i != 1 else str(x) for i,x in enumerate(l)])
    else:
        rdd_split = rdd_no_header.map(lambda l: l.split(",")).map(lambda l: [float(x) for i,x in enumerate(l)])

    schema=StructType(fields)
    
    return spark.createDataFrame(rdd_split, schema).cache()
    

In [4]:
train_df = makeDF(trainRdd)
test_df = makeDF(testRdd, data_set='test')

In [5]:
train_df.select('species').show()

+--------------------+
|             species|
+--------------------+
|Magnolia_Salicifolia|
|Betula_Austrosine...|
|  Tilia_Platyphyllos|
|     Ilex_Aquifolium|
|         Acer_Pictum|
|  Quercus_Imbricaria|
|   Quercus_Agrifolia|
|    Acer_Saccharinum|
|Quercus_Semecarpi...|
|    Cornus_Chinensis|
|     Acer_Capillipes|
|     Quercus_Pontica|
|      Viburnum_Tinus|
|         Sorbus_Aria|
|Liriodendron_Tuli...|
|Quercus_Infectori...|
|      Quercus_Texana|
|Quercus_Phillyrae...|
|   Quercus_Agrifolia|
|   Cotinus_Coggygria|
+--------------------+
only showing top 20 rows



## Prepare the data for training.

Label encode the target column in the training set.

In [6]:
from pyspark.ml.feature import IndexToString, StringIndexer
stInd = StringIndexer(inputCol='species', outputCol='speciesEnc')
stIndModel = stInd.fit(train_df)
train_df_enc = stIndModel.transform(train_df).cache()
train_df_enc.select('species', 'speciesEnc').show()

+--------------------+----------+
|             species|speciesEnc|
+--------------------+----------+
|Magnolia_Salicifolia|      38.0|
|Betula_Austrosine...|       1.0|
|  Tilia_Platyphyllos|      87.0|
|     Ilex_Aquifolium|      22.0|
|         Acer_Pictum|      82.0|
|  Quercus_Imbricaria|      31.0|
|   Quercus_Agrifolia|      32.0|
|    Acer_Saccharinum|      79.0|
|Quercus_Semecarpi...|      28.0|
|    Cornus_Chinensis|      74.0|
|     Acer_Capillipes|      43.0|
|     Quercus_Pontica|      50.0|
|      Viburnum_Tinus|       8.0|
|         Sorbus_Aria|       0.0|
|Liriodendron_Tuli...|      73.0|
|Quercus_Infectori...|      23.0|
|      Quercus_Texana|      75.0|
|Quercus_Phillyrae...|      44.0|
|   Quercus_Agrifolia|      32.0|
|   Cotinus_Coggygria|      64.0|
+--------------------+----------+
only showing top 20 rows



Create the features column that will be used to train the model.

In [7]:
from pyspark.ml.feature import VectorAssembler
cols = train_df_enc.columns
cols.remove('species')
cols.remove('speciesEnc')
cols.remove('id')
va = VectorAssembler(inputCols=cols, outputCol="features")
train_df_vec = va.transform(train_df_enc).cache()
test_df_vec = va.transform(test_df)
train_df_vec.select('features', 'speciesEnc').show()

+--------------------+----------+
|            features|speciesEnc|
+--------------------+----------+
|[0.06054700165987...|      38.0|
|[0.00195299996994...|       1.0|
|[0.00195299996994...|      87.0|
|[0.00585900014266...|      22.0|
|[0.00195299996994...|      82.0|
|[0.05078100040555...|      31.0|
|[0.01367199979722...|      32.0|
|[0.0,0.0,0.011718...|      79.0|
|[0.02734399959444...|      28.0|
|[0.03515600040555...|      74.0|
|[0.00195299996994...|      43.0|
|[0.00195299996994...|      50.0|
|[0.03710899874567...|       8.0|
|[0.0,0.0,0.009766...|       0.0|
|[0.03710899874567...|      73.0|
|[0.01757800020277...|      23.0|
|[0.0,0.0,0.111330...|      75.0|
|[0.02734399959444...|      44.0|
|[0.01171899959444...|      32.0|
|[0.04882799834012...|      64.0|
+--------------------+----------+
only showing top 20 rows



Split into train and test sets and fit the model.

In [8]:
train_set, test_set = train_df_vec.select('features', 'speciesEnc').randomSplit([.8, .2])

## Fit the model.

In [9]:
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

rf = RandomForestClassifier(labelCol="speciesEnc", predictionCol="prediction", probabilityCol="probability", rawPredictionCol="rawPrediction", featureSubsetStrategy="auto", impurity="gini", seed=None)

paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [5,10]) \
    .addGrid(rf.numTrees,  [10, 25, 50]) \
    .build()
    
    
evaluator = MulticlassClassificationEvaluator(labelCol="speciesEnc", predictionCol="prediction")
    
crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

In [10]:
cvModel = crossval.fit(train_set)

In [11]:
cvModel.avgMetrics

[0.27966368267467295,
 0.5670334232123037,
 0.4285534179110245,
 0.7127659854318015,
 0.46722145922516967,
 0.7590163554178703]

In [18]:
cvModel.bestModel.save("spark_random_forest_model_1")

## Make predictions on the test set.

In [22]:
train_df_vec.select('species').distinct().show()

+--------------------+
|             species|
+--------------------+
|      Quercus_Afares|
|   Alnus_Sieboldiana|
| Arundinaria_Simonii|
|         Acer_Pictum|
|Quercus_Semecarpi...|
|Callicarpa_Bodinieri|
|   Quercus_Alnifolia|
|   Quercus_Shumardii|
|     Acer_Circinatum|
|     Fagus_Sylvatica|
|     Tilia_Tomentosa|
|   Cotinus_Coggygria|
|     Acer_Capillipes|
|Viburnum_x_Rhytid...|
|   Quercus_Coccifera|
|     Quercus_Pontica|
|Populus_Grandiden...|
|  Cornus_Controversa|
|    Acer_Saccharinum|
|   Prunus_X_Shmittii|
+--------------------+
only showing top 20 rows



In [12]:
prediction_df = cvModel.bestModel.transform(test_df_vec).cache()

In [30]:
species_labels = stIndModel.labels
#species_list = [row.species for row in species.collect()]
#species_list.insert(0, 'id')
species_labels.insert(0,'id')

In [14]:
from decimal import *
pred_rdd = prediction_df.select('id', 'probability').rdd.map(lambda x: [x[0]] + [Decimal(a) for a in x[1]])

In [31]:
sol_fields = [StructField(field_name, DecimalType(38, 38), True) for field_name in species_labels]
sol_fields[0].dataType = FloatType()
sol_schema=StructType(sol_fields)    
pred_df = spark.createDataFrame(pred_rdd, sol_schema).cache()

In [2]:
import pandas as pd
pd_dataframe = pred_df.toPandas()

NameError: name 'pred_df' is not defined