## Set up the dataframe.

Read the data into a Rdd.

In [None]:
import findspark
findspark.init('/usr/local/bin/spark-2.0.1')

import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master("local") \
    .appName("Word Count") \
    .getOrCreate()

In [2]:
sc = spark.sparkContext
trainRdd = sc.textFile('input/tr.csv').cache()
testRdd = sc.textFile('input/test.csv').cache()

In [3]:
from pyspark.sql.types import *

def makeDF(rdd, data_set='train'):
    #Save the row in the rdd that contains the column names, then delete it from the rdd.
    header = rdd.first()
    header_rdd = rdd.filter(lambda l: 'margin' in l)
    rdd_no_header = rdd.subtract(header_rdd)

    #Prepare the schema and rdd to build the dataframe. 
    #Species will be a string while all columns will be floats.
    #Note if data_set == 'test', the won't have a species column
    fields = [StructField(field_name, FloatType(), True) for field_name in header.split(",")]   
    if data_set == 'train':
        fields[1].dataType = StringType()
        rdd_split = rdd_no_header.map(lambda l: l.split(",")).map(lambda l: [float(x) if i != 1 else str(x) for i,x in enumerate(l)])
    else:
        rdd_split = rdd_no_header.map(lambda l: l.split(",")).map(lambda l: [float(x) for i,x in enumerate(l)])

    schema=StructType(fields)
    
    return spark.createDataFrame(rdd_split, schema).cache()
    

In [4]:
train_df = makeDF(trainRdd)
test_df = makeDF(testRdd, data_set='test')

In [5]:
train_df.select('species').show()

+--------------------+
|             species|
+--------------------+
|Magnolia_Salicifolia|
|Betula_Austrosine...|
|  Tilia_Platyphyllos|
|     Ilex_Aquifolium|
|         Acer_Pictum|
|  Quercus_Imbricaria|
|   Quercus_Agrifolia|
|    Acer_Saccharinum|
|Quercus_Semecarpi...|
|    Cornus_Chinensis|
|     Acer_Capillipes|
|     Quercus_Pontica|
|      Viburnum_Tinus|
|         Sorbus_Aria|
|Liriodendron_Tuli...|
|Quercus_Infectori...|
|      Quercus_Texana|
|Quercus_Phillyrae...|
|   Quercus_Agrifolia|
|   Cotinus_Coggygria|
+--------------------+
only showing top 20 rows



## Prepare the data for training.

Label encode the target column in the training set.

In [6]:
from pyspark.ml.feature import IndexToString, StringIndexer
stInd = StringIndexer(inputCol='species', outputCol='speciesEnc')
stIndModel = stInd.fit(train_df)
train_df_enc = stIndModel.transform(train_df).cache()
train_df_enc.select('species', 'speciesEnc').show()

+--------------------+----------+
|             species|speciesEnc|
+--------------------+----------+
|Magnolia_Salicifolia|      38.0|
|Betula_Austrosine...|       1.0|
|  Tilia_Platyphyllos|      87.0|
|     Ilex_Aquifolium|      22.0|
|         Acer_Pictum|      82.0|
|  Quercus_Imbricaria|      31.0|
|   Quercus_Agrifolia|      32.0|
|    Acer_Saccharinum|      79.0|
|Quercus_Semecarpi...|      28.0|
|    Cornus_Chinensis|      74.0|
|     Acer_Capillipes|      43.0|
|     Quercus_Pontica|      50.0|
|      Viburnum_Tinus|       8.0|
|         Sorbus_Aria|       0.0|
|Liriodendron_Tuli...|      73.0|
|Quercus_Infectori...|      23.0|
|      Quercus_Texana|      75.0|
|Quercus_Phillyrae...|      44.0|
|   Quercus_Agrifolia|      32.0|
|   Cotinus_Coggygria|      64.0|
+--------------------+----------+
only showing top 20 rows



Create the features column that will be used to train the model.

In [7]:
from pyspark.ml.feature import VectorAssembler
cols = train_df_enc.columns
cols.remove('species')
cols.remove('speciesEnc')
cols.remove('id')
va = VectorAssembler(inputCols=cols, outputCol="features")
test_df_vec = va.transform(test_df)
train_df_vec = va.transform(train_df_enc)



In [10]:
train_preds = model.transform(train_df_vec).cache()

In [11]:
train_preds = train_preds.select('species', 'speciesEnc', 'probability', 'prediction', 'id').cache()

In [12]:
ordered = train_preds.orderBy('speciesEnc')
ordered.show()

+--------------------+----------+--------------------+----------+------+
|             species|speciesEnc|         probability|prediction|    id|
+--------------------+----------+--------------------+----------+------+
|         Sorbus_Aria|       0.0|[0.32025702250384...|       0.0|1269.0|
|         Sorbus_Aria|       0.0|[0.27247980908622...|       0.0|1395.0|
|         Sorbus_Aria|       0.0|[0.26762364696292...|       0.0| 676.0|
|         Sorbus_Aria|       0.0|[0.26059767814940...|       0.0| 798.0|
|         Sorbus_Aria|       0.0|[0.27195400272790...|       0.0|1176.0|
|         Sorbus_Aria|       0.0|[0.24442803333658...|       0.0| 867.0|
|         Sorbus_Aria|       0.0|[0.32194640168715...|       0.0| 714.0|
|         Sorbus_Aria|       0.0|[0.28066739565063...|       0.0| 741.0|
|         Sorbus_Aria|       0.0|[0.31794566724273...|       0.0|1200.0|
|         Sorbus_Aria|       0.0|[0.35770505345511...|       0.0|1160.0|
|Betula_Austrosine...|       1.0|[2.89855072463768.

In [13]:
from pyspark.ml.feature import IndexToString
converter = IndexToString(inputCol="prediction", outputCol="predictedSpecies", labels=stIndModel.labels)
train_preds = converter.transform(train_preds)
train_preds.show()

+--------------------+----------+--------------------+----------+------+--------------------+
|             species|speciesEnc|         probability|prediction|    id|    predictedSpecies|
+--------------------+----------+--------------------+----------+------+--------------------+
|Magnolia_Salicifolia|      38.0|[0.00225803640496...|      38.0| 256.0|Magnolia_Salicifolia|
|Betula_Austrosine...|       1.0|[4.14078674948240...|       1.0| 488.0|Betula_Austrosine...|
|  Tilia_Platyphyllos|      87.0|[0.04214145977591...|      87.0| 239.0|  Tilia_Platyphyllos|
|     Ilex_Aquifolium|      22.0|[0.00884596880601...|      22.0| 600.0|     Ilex_Aquifolium|
|         Acer_Pictum|      82.0|[5.16099416355170...|      82.0|1417.0|         Acer_Pictum|
|  Quercus_Imbricaria|      31.0|[0.00168786949319...|      31.0| 497.0|  Quercus_Imbricaria|
|   Quercus_Agrifolia|      32.0|[0.0,0.0043520438...|      32.0| 387.0|   Quercus_Agrifolia|
|    Acer_Saccharinum|      79.0|[0.00309976178616...|      

In [21]:
from decimal import *
train_pred_rdd = train_preds.select('id', 'probability').rdd.map(lambda x: [int(x[0])] + [Decimal(a) for a in x[1]])
train_pred_df = spark.createDataFrame(train_pred_rdd, sol_schema).cache()

In [15]:
train_pred_df = train_pred_df.toPandas()

NameError: name 'train_pred_df' is not defined

In [None]:
train_pred_df[train_pred_df.id == 488.0]

In [22]:
from pyspark.mllib.regression import IsotonicRegression, IsotonicRegressionModel
from pyspark.sql import SQLContext
tuple_schema = StructType([
    StructField("label", IntegerType(), True),
    StructField("prob", DecimalType(), True)])

def createTuple(x):
    l = 1 if int(x[0]) == int(x[1]) else 0
    return (int(l), x[2], 1)
create_tup_udf = spark.udf.register('createTuple', createTuple, tuple_schema)

r = train_preds.select('speciesEnc', 'probability').rdd.flatMap(lambda x: [ (x[0], i,v) for i,v in enumerate(x[1])])
r = r.map(createTuple)

In [23]:
r.take(40)

[(0, 0.0022580364049624262, 1),
 (0, 0.00031055900621118014, 1),
 (0, 0.0010979210106167206, 1),
 (0, 0.025422172265558873, 1),
 (0, 0.00056618768666760679, 1),
 (0, 0.017757206421942853, 1),
 (0, 0.00028824149630102264, 1),
 (0, 0.0026821921449109187, 1),
 (0, 0.020860990651688057, 1),
 (0, 0.13887524335730353, 1),
 (0, 0.00033628872221720657, 1),
 (0, 0.0, 1),
 (0, 0.001091582681752436, 1),
 (0, 0.00072082645893713616, 1),
 (0, 0.0069026535994297887, 1),
 (0, 0.00039621601527950467, 1),
 (0, 0.065386414249853017, 1),
 (0, 0.01674034295690878, 1),
 (0, 0.0018911318150448585, 1),
 (0, 0.021462056489179072, 1),
 (0, 0.0047086607713002172, 1),
 (0, 0.00024844720496894411, 1),
 (0, 0.0008091803141903486, 1),
 (0, 0.0023828237068874501, 1),
 (0, 0.00030521907759759422, 1),
 (0, 0.0075682644293074653, 1),
 (0, 0.0096708196982019309, 1),
 (0, 0.001013107148171594, 1),
 (0, 0.00092806398806123485, 1),
 (0, 0.011955368002624821, 1),
 (0, 0.005523176094991763, 1),
 (0, 0.0059294681436197613, 1)

In [None]:
from pyspark.mllib.regression import IsotonicRegression, IsotonicRegressionModel
model = IsotonicRegression.train(r)




## Make predictions on the test set.

In [16]:
from pyspark.ml.classification import RandomForestClassificationModel
model = RandomForestClassificationModel.load('spark_random_forest_model_1')

In [17]:
prediction_df = model.transform(test_df_vec).cache()

In [18]:
species_list = stIndModel.labels
species_list.insert(0, 'id')

In [19]:
from decimal import *
pred_rdd = prediction_df.select('id', 'probability').rdd.map(lambda x: [int(x[0])] + [Decimal(a) for a in x[1]])

In [20]:
sol_fields = [StructField(field_name, DecimalType(38, 38), True) for field_name in species_list]
sol_fields[0].dataType = IntegerType()
sol_schema=StructType(sol_fields)    
pred_df = spark.createDataFrame(pred_rdd, sol_schema).cache()

In [60]:
import pandas as pd
pd_dataframe = pred_df.toPandas()
sorted_by_cols = pd_dataframe.sort_index(axis=1)
cols = list(sorted_by_cols.columns.values)
cols.remove('id')
cols.insert(0, 'id')
submission_df = sorted_by_cols[cols]


In [63]:
submission_df.head()

Unnamed: 0,id,Acer_Capillipes,Acer_Circinatum,Acer_Mono,Acer_Opalus,Acer_Palmatum,Acer_Pictum,Acer_Platanoids,Acer_Rubrum,Acer_Rufinerve,...,Salix_Fragilis,Salix_Intergra,Sorbus_Aria,Tilia_Oliveri,Tilia_Platyphyllos,Tilia_Tomentosa,Ulmus_Bergmanniana,Viburnum_Tinus,Viburnum_x_Rhytidophylloides,Zelkova_Serrata
0,653,0.0009497517295471,0.0001242236024844,0.0735485012515827,0.003666386014692,0.0,0.0003725175268127,0.0009708174491071,0.0006646538688359,0.0006149260308128,...,0.0229131095546451,0.0069712470993434,0.0024700045901298,0.0004865693785681,0.0009501522003061,0.0024922004671824,0.0069346084880999,0.0145347938285348,0.0002431610942249,0.0011532973653149
1,1415,0.0027626921966028,0.0164143680730762,0.0012179205982591,0.0053664126790505,0.0446424394319131,0.0282525781557218,0.0019119568888301,0.0012339017416354,0.0051963073805983,...,0.000906341941811,0.0047185981943132,0.0019363605107129,0.0035602801263632,0.0218211278908363,0.0118850296477178,0.0017289098191144,0.0024172170895701,0.0157532514433448,0.0042461130081604
2,736,0.0122107251170175,0.0060366066038544,0.0017225434174627,0.0147974642644026,0.0047238005291629,0.0011529134358831,0.0016130521477017,0.0251378368931531,0.019607916586033,...,0.000906341941811,0.0051449236767858,0.0269945681641919,0.1409176774865901,0.0739198912744781,0.0080437566175463,0.0049986328018336,0.003856370728564,0.0,0.0167794203562845
3,1193,0.009518701714868,0.0017635955869387,0.0018624035573229,0.01751085136808,0.0003002434912905,0.0021397166013696,0.0031082623638421,0.0203640721772074,0.0076401153371724,...,0.0013714582208808,0.0073311952266687,0.0176178881361809,0.0193821751939777,0.0072457942852889,0.0085249777175052,0.004812652399709,0.0051747564636376,0.0,0.0193963045529825
4,1139,0.0044971655991224,0.0013920006211037,0.0115061860781492,0.0132270212433855,0.0003002434912905,0.0022879194623384,0.0025985496775521,0.0025050359824569,0.0037029008692826,...,0.0025278948710269,0.0151765465349912,0.0039226631173986,0.0057208338104203,0.005279343442959,0.0093010828097284,0.0061725778136908,0.0987308483017898,0.0019739303249941,0.0058493002860076


Isotonic Regression
-figure out log loss.
-figure out isotonic regression.
-use log loss to see if isotonic regression is working
-Isotonic Regression takes tuples of values (label, feature, weight)
-label is equal to 1 or 0, based on if is is the predicted label.
-run isotonic regression. then split it up again.



In [64]:
submission_df.to_csv('spark_sol_2.csv', index=False)