In [1]:
from pyspark.sql import SQLContext

from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler, Normalizer
from pyspark.ml.evaluation import RegressionEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, GBTClassifier


from pyspark.mllib.util import MLUtils
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.tree import RandomForest, RandomForestModel

import os
from math import log

In [2]:
# Build the SQLContext used by the CSV-loading cells below.
# NOTE(review): `sc` is not defined anywhere in this notebook; presumably the
# PySpark kernel/shell injects the SparkContext under that name — confirm.
sqlContext = SQLContext(sc)

## Functions

In [3]:
# Log Loss metric
def logloss(df, eps=1e-15):
    """Return the mean binary log loss of the predictions held in ``df``.

    Parameters:
        df: DataFrame of predictions; each row must expose ``indexedLabel``
            (read as y and used as a 0/1 weight) and ``probability``, whose
            element at index 1 is read as p.
        eps: probabilities are clipped to [eps, 1 - eps] before taking logs,
            so a prediction of exactly 0 or 1 cannot make ``log`` fail.

    Returns:
        -(1/n) * sum(y*log(p) + (1-y)*log(1-p)) over the n rows of ``df``.

    Raises:
        ValueError: if ``df`` has no rows.
    """
    def rowLoss(r):
        y = r.indexedLabel
        p = min(max(r.probability[1], eps), 1.0 - eps)
        # Carry a row counter next to the loss so one pass yields sum and count.
        return (y * log(p) + (1 - y) * log(1 - p), 1)

    # The count comes from `df` itself (not from a notebook-level DataFrame),
    # so the metric is correct for whichever prediction set is passed in.
    total, n = df.rdd.map(rowLoss).fold(
        (0.0, 0), lambda a, b: (a[0] + b[0], a[1] + b[1]))
    if n == 0:
        raise ValueError("logloss() needs at least one row")
    return -1 * total / n

## Load

In [4]:
# Load the Kaggle training CSV through the spark-csv data source (header row
# enabled, schema inference enabled) and mark the DataFrame for caching.
kaggleTrain = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
    .load('kaggle/train.csv')
    .cache()
)
# Print an empty line so the cell does not end on an expression.
print("")




In [5]:
# Load the Kaggle test CSV with the same reader options as the training file
# and mark the DataFrame for caching.
kaggleTest = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
    .load('kaggle/test.csv')
    .cache()
)
# Print an empty line so the cell does not end on an expression.
print("")




In [6]:
# Report the number of rows in each Kaggle dataset.
print("Kaggle Train count: %d" % kaggleTrain.count())
print("Kaggle Test count:  %d" % kaggleTest.count())

Kaggle Train count: 114321
Kaggle Test count:  114393


In [7]:
# Group the column names of the training set by the string form of their data
# type. ID and target are reported but left out of the grouping.
columnsDict = {}
for field in kaggleTrain.schema.fields:
    typeKey = str(field.dataType)
    colName = field.name

    if colName == 'ID':
        print("We have the ID columns, type: " + typeKey)
    elif colName == 'target':
        print("We have the target columns, type: " + typeKey)
    else:
        columnsDict.setdefault(typeKey, []).append(colName)

# Summary: one line per data type with the number of columns of that type.
print("")
for typeName in columnsDict:
    print(typeName + " " + str(len(columnsDict[typeName])))

We have the ID columns, type: IntegerType
We have the target columns, type: IntegerType

StringType 19
DoubleType 108
IntegerType 4


In [8]:
# Collect the frequent values of the "target" column; the recorded output
# shows the two values 1 and 0.
kaggleTrain.stat.freqItems(["target"]).collect()

[Row(target_freqItems=[1, 0])]

In [9]:
# Cross-tabulating "target" against itself leaves the per-class row counts on
# the diagonal (see the table below), which gives a quick look at class balance.
kaggleTrain.stat.crosstab("target", "target").show()

+-------------+-----+-----+
|target_target|    0|    1|
+-------------+-----+-----+
|            1|    0|87021|
|            0|27300|    0|
+-------------+-----+-----+



## Split

In [10]:
# Randomly split the labelled Kaggle data with weights 0.6 / 0.4 and a fixed
# seed (1234); the first part is used for training, the second for local testing.
splits = kaggleTrain.randomSplit(weights=[0.6, 0.4], seed=1234)
train, test = splits

## Deal with missing values

In [11]:
# Apply the same null replacement (constant 0.5) to the training split, the
# local test split and the Kaggle test set.
# NOTE(review): presumably only numeric columns are affected because the fill
# value is a float — confirm against the DataFrameNaFunctions.fill docs.
trainWithoutNull, testWithoutNull, kaggleTestWithoutNull = (
    frame.na.fill(0.5) for frame in (train, test, kaggleTest)
)

In [12]:
# Repartition
#trainWithoutNull = trainWithoutNull.repartition(20)
#testWithoutNull = testWithoutNull.repartition(20)

In [13]:
# Show how many partitions back the null-filled training split (4 in the recorded run).
trainWithoutNull.rdd.getNumPartitions()

4

In [14]:
# Mark the three null-filled DataFrames for caching; the fit and transform
# cells further down read them.
trainWithoutNull.cache()
testWithoutNull.cache()
# Last expression of the cell, so its DataFrame repr is what the notebook displays.
kaggleTestWithoutNull.cache()

DataFrame[ID: int, v1: double, v2: double, v3: string, v4: double, v5: double, v6: double, v7: double, v8: double, v9: double, v10: double, v11: double, v12: double, v13: double, v14: double, v15: double, v16: double, v17: double, v18: double, v19: double, v20: double, v21: double, v22: string, v23: double, v24: string, v25: double, v26: double, v27: double, v28: double, v29: double, v30: string, v31: string, v32: double, v33: double, v34: double, v35: double, v36: double, v37: double, v38: int, v39: double, v40: double, v41: double, v42: double, v43: double, v44: double, v45: double, v46: double, v47: string, v48: double, v49: double, v50: double, v51: double, v52: string, v53: double, v54: double, v55: double, v56: string, v57: double, v58: double, v59: double, v60: double, v61: double, v62: int, v63: double, v64: double, v65: double, v66: string, v67: double, v68: double, v69: double, v70: double, v71: string, v72: int, v73: double, v74: string, v75: string, v76: double, v77: double

## Pipeline

In [15]:
# Create Label
# Index the raw "target" column into "indexedLabel", the label column the
# classifier and the evaluation code are configured with.
# NOTE(review): StringIndexer presumably assigns indices by label frequency, so
# index 0 may stand for target == 1 (the majority class) rather than
# target == 0 — confirm before interpreting "prediction" or "probability".
labelIndexer = StringIndexer(inputCol="target", outputCol="indexedLabel")

In [16]:
# Create Feature vector
# Only the columns grouped under "DoubleType" in columnsDict are assembled into
# the "features" vector; the string and integer columns are not used as features.
assembler = VectorAssembler(
    inputCols=columnsDict["DoubleType"],
    outputCol="features")

#output = assembler.transform(trainWithoutNull)
#output.schema
#trainFeat = trainWithoutNull.withColumn("label", trainWithoutNull.target.cast("Double"))

In [17]:
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
#featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)

In [18]:
# Derive "normFeatures" from "features" with a Normalizer configured with p=1.0;
# "normFeatures" is the column the classifier reads.
# NOTE(review): presumably this rescales each row vector to unit L1 norm — confirm
# that per-row normalization is intended for a tree-based model.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)

In [19]:
# Configure the classifier. The active model is a random forest; the variable
# keeps the name `gbt` because the pipeline cell references it, and the
# decision-tree / gradient-boosted alternatives are kept below for quick swapping.
# A fixed seed (the same 1234 used for the train/test split) makes reruns comparable.
# NOTE(review): the recorded run with maxDepth=20 ended with the JVM dying during
# fit (Py4JError "Answer from Java side is empty"), presumably from memory
# exhaustion. Depth is reduced to 10; raise it again only with more
# driver/executor memory.
gbt = RandomForestClassifier(featuresCol="normFeatures", labelCol="indexedLabel",
                             numTrees=20, maxDepth=10, seed=1234)
#gbt = DecisionTreeClassifier(featuresCol="normFeatures", labelCol="indexedLabel")
#gbt = GBTClassifier(featuresCol="normFeatures", labelCol="indexedLabel", maxIter=10)

In [20]:
# Chain the stages in a Pipeline, in order: feature assembly, label indexing,
# normalization, then the classifier held in `gbt`.
pipeline = Pipeline(stages=[assembler, labelIndexer, normalizer, gbt])

In [21]:
# Fit every pipeline stage on the null-filled training split.
# NOTE(review): the output recorded for this cell shows the call failing with a
# Py4JError / Py4JNetworkError (lost connection to the Java side), so the cells
# below were never executed in the saved run.
model = pipeline.fit(trainWithoutNull)

ERROR:py4j.java_gateway:Error while sending or receiving.
Traceback (most recent call last):
  File "/home/scollot/spark-1.6.0-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 746, in send_command
    raise Py4JError("Answer from Java side is empty")
Py4JError: Answer from Java side is empty
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server
Traceback (most recent call last):
  File "/home/scollot/spark-1.6.0-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 690, in start
    self.socket.connect((self.address, self.port))
  File "/home/scollot/anaconda/lib/python2.7/socket.py", line 228, in meth
    return getattr(self._sock,name)(*args)
error: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server
Traceback (most recent call last):
  File "/home/scollot/spark-1.6.0-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 690, in start
   

Py4JNetworkError: An error occurred while trying to connect to the Java server

## Evaluation

In [None]:
# Show the last fitted pipeline stage (the classifier); summary only.
print(model.stages[-1])

In [None]:
def evaluation(df):
    """Print a confusion table, three sample predictions and the precision of ``df``.

    ``df`` is queried for the "indexedLabel", "prediction" and "probability"
    columns, so it must contain all three. Nothing is returned.
    """
    # Confusion table: indexed label against predicted label.
    confusion = df.stat.crosstab("indexedLabel", "prediction")
    confusion.show()

    sampleRows = df.select("prediction", "indexedLabel", "probability").take(3)
    print(sampleRows)
    print("")

    precision = MulticlassClassificationEvaluator(
        labelCol="indexedLabel",
        predictionCol="prediction",
        metricName="precision",
    ).evaluate(df)
    print("Precision = %g" % (precision))


### Training

In [None]:
# Make predictions.
# Run the fitted pipeline over the same data it was trained on; the resulting
# metrics describe fit on the training split, not generalization.
trainPredictions = model.transform(trainWithoutNull)

In [None]:
# Confusion table, sample rows and precision for the training predictions.
evaluation(trainPredictions)

In [None]:
# Log loss of the predictions made on the training split.
print("Logloss on Training: %s" % logloss(trainPredictions))

### Testing

In [None]:
# Make predictions.
# Run the fitted pipeline over the held-out split produced by randomSplit.
testPredictions = model.transform(testWithoutNull)

In [None]:
# Confusion table, sample rows and precision for the held-out predictions.
evaluation(testPredictions)

In [None]:
# Log loss of the predictions made on the held-out split.
print("Logloss on Testing: %s" % logloss(testPredictions))

## Make predictions and save

In [None]:
# Score the null-filled Kaggle test set with the fitted pipeline.
predictions = model.transform(kaggleTestWithoutNull)

In [None]:
# Cross-tabulating "prediction" against itself is used here as a quick count of
# how many rows received each predicted class.
predictions.stat.crosstab("prediction", "prediction").show()

In [None]:
# Peek at the first three (ID, prediction) rows; "probability" and
# "rawPrediction" are further columns that could be selected here.
predictions.select("ID", "prediction").take(3) # "probability", "rawPrediction"

In [None]:
# Write the submission file: one row per ID with the predicted probability
# that target == 1, under the column name "PredictedProb".
outputFile = "results/prediction.csv"

# The classifier works on StringIndexer indices, not on raw targets, so the
# "prediction" column is an indexed class and not a probability. Look up which
# index the fitted indexer gave to the raw label "1" and export that entry of
# the "probability" vector. stages[1] is the fitted labelIndexer (second stage
# of the pipeline).
positiveIndex = model.stages[1].labels.index("1")

submission = sqlContext.createDataFrame(
    predictions.rdd.map(lambda r: (r.ID, float(r.probability[positiveIndex]))),
    ["ID", "PredictedProb"])

# mode("overwrite") replaces a previous export, so no shell `rm -rf` is needed
# and the path may live on any filesystem Spark can write to.
submission.repartition(1) \
    .write.format('com.databricks.spark.csv') \
    .option("header", "true") \
    .mode("overwrite") \
    .save(outputFile)