## Set up the dataframe.

Read the training and test data into RDDs.

In [3]:
# Point findspark at the local Spark installation; this has to run
# before pyspark is imported.
import findspark
findspark.init('/usr/local/bin/spark-2.0.1')

import pyspark
from pyspark.sql import SparkSession

# Create (or reuse) a session that runs on the local master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Word Count")
    .getOrCreate()
)

In [2]:
# Load each CSV file as an RDD of raw text lines and keep it cached.
sc = spark.sparkContext
train_path, test_path = 'input/tr.csv', 'input/test.csv'
trainRdd = sc.textFile(train_path).cache()
testRdd = sc.textFile(test_path).cache()

Create the train and test dataframes.

In [3]:
from pyspark.sql.types import *

def makeDF(rdd, data_set='train'):
    """Build a cached Spark DataFrame from an RDD of CSV text lines.

    The first line of ``rdd`` is taken as the header and supplies the column
    names. Every column is parsed as a float, except that for the training
    set the second column (index 1) is kept as a string.

    Parameters
    ----------
    rdd : RDD of str
        Raw CSV lines, header first.
    data_set : str
        ``'train'`` when the second column is a string column; any other
        value means every column is numeric.

    Returns
    -------
    DataFrame
        A cached dataframe with one nullable field per header column.
    """
    # Save the row that contains the column names.
    header = rdd.first()
    column_names = header.split(",")

    # Drop the header by comparing against it directly. This is a narrow
    # filter (no shuffle), keeps the rows in file order, and does not depend
    # on the header containing any particular word. Blank lines are skipped
    # as well, since they cannot be parsed into a row.
    rdd_no_header = rdd.filter(lambda l: l != header and l.strip())

    # Note: if data_set is not 'train', there is no string column at all.
    has_string_column = data_set == 'train'

    def is_string_column(index):
        return has_string_column and index == 1

    def parse_line(line):
        return [str(x) if is_string_column(i) else float(x)
                for i, x in enumerate(line.split(","))]

    fields = [
        StructField(name, StringType() if is_string_column(i) else FloatType(), True)
        for i, name in enumerate(column_names)
    ]
    schema = StructType(fields)

    return spark.createDataFrame(rdd_no_header.map(parse_line), schema).cache()
    

In [4]:
# Build one dataframe per input file; only the training file has a
# string column.
train_df = makeDF(trainRdd, data_set='train')
test_df = makeDF(testRdd, data_set='test')

## Fit the model using a pipeline.

In [10]:
from pyspark.ml.feature import IndexToString, StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline


# Stage to encode our target variable.
stInd = StringIndexer(inputCol='species', outputCol='speciesEnc')

# Every column except the row id and the target is used as a feature.
cols = [c for c in train_df.columns if c not in ('species', 'id')]

# Stage to create a single feature column for the Spark API.
va = VectorAssembler(inputCols=cols, outputCol="features")

# Simple random forest model. The seed is fixed so that re-running the
# notebook on a fresh kernel reproduces the same forest.
rf = RandomForestClassifier(labelCol="speciesEnc",
                            predictionCol="prediction",
                            probabilityCol="probability",
                            rawPredictionCol="rawPrediction",
                            numTrees=10,
                            featureSubsetStrategy="auto",
                            seed=42)

# Index the label, assemble the features, then fit the forest.
pipeline = Pipeline(stages=[stInd, va, rf])
pipe_model = pipeline.fit(train_df)

Use the pipeline to make predictions for the test set.

In [11]:
# Run the fitted pipeline over the test dataframe and cache the result.
prediction_df = pipe_model.transform(test_df).cache() 

In [19]:
# Output column names: 'id' first, then the labels held by the first
# pipeline stage (the fitted StringIndexer).
species_labels = ['id'] + list(pipe_model.stages[0].labels)

In [20]:
from decimal import *
# Flatten each (id, probability vector) row into a single list: the id
# followed by one Decimal per vector entry.
id_and_probability = prediction_df.select('id', 'probability')
pred_rdd = id_and_probability.rdd.map(
    lambda row: [row[0]] + list(map(Decimal, row[1]))
)

In [21]:
# Schema for the output table: the first column (the id) is a float and
# every remaining column holds a decimal value.
#
# DecimalType(38, 38) leaves no digits before the decimal point, so a
# value of exactly 1.0 would be out of range. A scale of 37 keeps one
# integer digit while still using the maximum precision of 38.
sol_fields = [StructField(species_labels[0], FloatType(), True)] + [
    StructField(field_name, DecimalType(38, 37), True)
    for field_name in species_labels[1:]
]
sol_schema = StructType(sol_fields)
pred_df = spark.createDataFrame(pred_rdd, sol_schema).cache()