# Resample DataFrame

In [1]:
!pkill -9 java

In [2]:
from code.common import *

In [3]:
import time
import os
import pprint
from pyspark import SparkContext, SQLContext

In [4]:
for file in os.listdir('data'): 
    if 'packed' in file:
        print(file)

train.parquet.normed.filled.masked-60000.encode.picked-987.packed
valid.parquet.normed.filled.masked-60000.encode.picked-987.packed
tests.parquet.normed.filled.masked-60000.encode.picked-987.packed


### Resample the Train data

In [6]:
from pyspark import SparkContext, SQLContext
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

#### Using Masked 100 Train Data

In [5]:
train = sqlContext.read.parquet('data/criteo.parquet.df.train.normed.filled.masked-100.encode.packed')

In [6]:
neg_count = train.filter(train['label']==0).count()
neg_count

27275676

In [7]:
pos_count = train.filter(train['label']==1).count()
pos_count

9394851

In [8]:
total_count = pos_count + neg_count
print(f'Total number of train examples is {total_count}')
print (f'Percentage of negative examples is {neg_count*100/total_count}')
print (f'Percentage of postive examples is {pos_count*100/total_count}')

Total number of train examples is 36670527
Percentage of negative examples is 74.38037637146584
Percentage of postive examples is 25.619623628534164


### Oversampling

In [9]:
# Number of extra data points
extra = neg_count - pos_count
print(f'Extra data points = {extra}')

# Resample rate
resample_rate = extra/pos_count
resample_rate
print(f'The resample rate is {resample_rate}')

Extra data points = 17880825
The resample rate is 1.90325796545363


In [10]:
extra_positives = train.filter(train['label']==1).sample(withReplacement=True, fraction=1.9)

In [11]:
extra_positives.show(5)

+-----+--------------------+------------------+
|label|            features|            weight|
+-----+--------------------+------------------+
|  1.0|(160342,[0,1,2,3,...|0.7438037637146584|
|  1.0|(160342,[0,1,2,4,...|0.7438037637146584|
|  1.0|(160342,[0,1,2,3,...|0.7438037637146584|
|  1.0|(160342,[0,1,2,3,...|0.7438037637146584|
|  1.0|(160342,[0,1,2,4,...|0.7438037637146584|
+-----+--------------------+------------------+
only showing top 5 rows



In [12]:
oversampled_train = train.union(extra_positives)
oversampled_train = oversampled_train.sample(withReplacement = False, fraction = 1.0)

In [13]:
sampled_neg_count = oversampled_train.filter(oversampled_train['label']==0).count()
sampled_pos_count = oversampled_train.filter(oversampled_train['label']==1).count()

sampled_total_count = sampled_pos_count + sampled_neg_count
print(f'Total number of train examples is {sampled_total_count}')
print (f'Percentage of negative examples is {sampled_neg_count*100/sampled_total_count}')
print (f'Percentage of postive examples is {sampled_pos_count*100/sampled_total_count}')

Total number of train examples is 54520340
Percentage of negative examples is 50.0284407617414
Percentage of postive examples is 49.9715592382586


In [14]:
oversampled_train.select('features').first().features.size

160342

In [15]:
del extra_positives, train

##### Save the Oversampled df

In [16]:
oversampled_train.write.parquet('data/criteo.parquet.df.train.normed.filled.masked-100.encode.packed.oversampled')

##### Training the model without weighting

In [5]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation     import BinaryClassificationEvaluator

In [19]:
estimator = LogisticRegression(featuresCol='features', labelCol='label', maxIter = 10, regParam = 0.0, family = 'binomial')

In [20]:
evaluator = BinaryClassificationEvaluator()

In [46]:
start_train = time.time()
model = estimator.fit(oversampled_train)
end_train = time.time()

In [53]:
print(f'Time to train is {(end_train-start_train)/60}')

Time to train is 9.488627183437348


##### Testing the model on Dev Set

In [21]:
dev = sqlContext.read.parquet('data/criteo.parquet.df.dev.normed.filled.masked-100.encode.packed')

In [57]:
# Make Evaluations
start_prediction = time.time()
transformed_train = model.transform(oversampled_train)
transformed_dev = model.transform(dev)
end_prediction = time.time()

In [56]:
print(f'Time to predict is {(end_prediction - start_prediction)/60}')

Time to predict is 0.002859099706013997


In [58]:
auc_train = evaluator.evaluate(transformed_train)
auc_dev = evaluator.evaluate(transformed_dev)

In [60]:
print(f'Logistic Regression - AUC on train is: {auc_train * 100:.2f}')
print(f'Logistic Regression - AUC on dev is: {auc_dev * 100:.2f}')

Logistic Regression - AUC on train is: 78.43
Logistic Regression - AUC on dev is: 78.04


### Use Regularization and Cross Validation

In [4]:
oversampled_train = sqlContext.read.parquet('data/criteo.parquet.df.train.normed.filled.masked-100.encode.packed.oversampled')

AnalysisException: 'Path does not exist: file:/home/jovyan/work/book/data/criteo.parquet.df.train.normed.filled.masked-100.encode.packed.oversampled;'

In [7]:
dev = sqlContext.read.parquet('data/criteo.parquet.df.dev.normed.filled.masked-100.encode.packed')

In [8]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation     import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [7]:
estimator = LogisticRegression(featuresCol='features', labelCol='label', maxIter = 10, regParam = 0.01, family = 'binomial')
evaluator = BinaryClassificationEvaluator()

In [11]:
start_train = time.time()
model = estimator.fit(oversampled_train)
end_train = time.time()

In [17]:
regularization_values = [0.01, 0.1, 0.5, 1.0]

start_reg_search = time.time()

for reg_value in regularization_values:
    
    estimator = LogisticRegression(featuresCol='features', labelCol='label', 
                                   maxIter = 10, regParam = reg_value, family = 'binomial')
    
    model = estimator.fit(oversampled_train)
    
    # Make Evaluations

    transformed_train = model.transform(oversampled_train)
    transformed_dev = model.transform(dev)
    end_prediction = time.time()

    # Make Evaluations
    transformed_train = model.transform(oversampled_train)
    transformed_dev = model.transform(dev)


    # Get the AUC
    auc_train = evaluator.evaluate(transformed_train)
    auc_dev = evaluator.evaluate(transformed_dev)

    # Print the AUC
    print(f'Logistic Regression - Regulaization: {reg_value} - AUC on train is: {auc_train * 100:.2f}')
    print(f'Logistic Regression - Regularization: {reg_value} - AUC on dev is: {auc_dev * 100:.2f}')
    
end_reg_search = time.time()

print(f'Completed Regulaization parameter search in {(end_reg_search - start_reg_search)/60} minutes')

Logistic Regression - Regulaization: 0.01 - AUC on train is: 78.13
Logistic Regression - Regularization: 0.01 - AUC on dev is: 77.80
Logistic Regression - Regulaization: 0.1 - AUC on train is: 77.82
Logistic Regression - Regularization: 0.1 - AUC on dev is: 77.55
Logistic Regression - Regulaization: 0.5 - AUC on train is: 76.86
Logistic Regression - Regularization: 0.5 - AUC on dev is: 76.66
Logistic Regression - Regulaization: 1.0 - AUC on train is: 76.23
Logistic Regression - Regularization: 1.0 - AUC on dev is: 76.06
Completed Regulaization parameter search in 14.312608154614766 minutes


In [18]:
paramGrid = (ParamGridBuilder()
             .addGrid(estimator.regParam, [0.01, 0.1, 1.0])
             #.addGrid(estimator.maxIter, [5, 10])
             .build())

In [None]:
cv = CrossValidator(estimator=estimator, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# train using the crossvalidator
start_cv_train = time.time()
cvModel = cv.fit(oversampled_train)
end_cv_train = time.time()

In [None]:
print(f'Training with the cross validator took {(end_cv_train - start_cv_train)/60):.2f} seconds')

In [None]:
# Make Predictions on train and dev
train_predictions = cvModel.transform(oversampled_train)
test_predictions = cvModel.transform(dev)

# Calculate the AUC for train and dev
auc_train = evaluator.evaluate(train_predictions)
auc_dev = evaluator.evaluate(test_predictions)

print(f'Logistic Regression - AUC on train is: {auc_train * 100:.2f}')
print(f'Logistic Regression - AUC on dev is: {auc_dev * 100:.2f}')

#### Using Random Forests

In [None]:
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)

start_train = time.time()
model = rf.fit(oversampled_train)
end_train = time.time()

evaluator = BinaryClassificationEvaluator()

In [None]:
train_predictions = model.transform(oversampled_data)
test_predictions = model.transform(dev)
end_predictions = time.time()

# Calculate the AUC for train and dev
auc_train = evaluator.evaluate(train_predictions)
auc_dev = evaluator.evaluate(test_predictions)

print(f'Random Forests - AUC on train is: {auc_train * 100:.2f}')
print(f'Random Forests - AUC on dev is: {auc_dev * 100:.2f}')
print(f'Time from training to prediction is {(end_predictions-start_train)/60:.2f} minutes')

### Undersampling

In [5]:
train = sqlContext.read.parquet('data/criteo.parquet.df.train.normed.filled.masked-100.encode.packed')

neg_count = train.filter(train['label']==0).count()

pos_count = train.filter(train['label']==1).count()

In [6]:
# Resample rate
resample_rate = pos_count/neg_count
resample_rate
print(f'The resample rate is {resample_rate}')

The resample rate is 0.34444062907918394


In [7]:
new_negatives = train.filter(train['label']==0).sample(withReplacement=False, fraction =0.344)

In [13]:
undersampled_train = train.filter(train['label']==1).union(new_negatives)
undersampled_train = undersampled_train.sample(withReplacement = False, fraction = 1.0)

In [14]:
sampled_neg_count = undersampled_train.filter(train['label']==0).count()
sampled_pos_count = undersampled_train.filter(train['label']==1).count()

sampled_total_count = sampled_pos_count + sampled_neg_count
print(f'Total number of train examples is {sampled_total_count}')
print (f'Percentage of negative examples is {sampled_neg_count*100/sampled_total_count}')
print (f'Percentage of postive examples is {sampled_pos_count*100/sampled_total_count}')

Total number of train examples is 18773665
Percentage of negative examples is 49.95728857418091
Percentage of postive examples is 50.04271142581909


In [9]:
undersampled_train.write.parquet('data/criteo.parquet.df.train.normed.filled.masked-100.encode.packed.undersampled')

In [15]:
undersampled_train.cache()
del train

#### Training the model

In [6]:
dev = sqlContext.read.parquet('data/criteo.parquet.df.dev.normed.filled.masked-100.encode.packed')

from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [16]:
estimator = LogisticRegression(featuresCol='features', labelCol='label', maxIter = 10, regParam = 0.0, family = 'binomial')

evaluator = BinaryClassificationEvaluator()


start_train = time.time()
model = estimator.fit(undersampled_train)
end_train = time.time()

In [18]:
# Make Evaluations
start_prediction = time.time()
transformed_train = model.transform(undersampled_train)
transformed_dev = model.transform(dev)
end_prediction = time.time()

auc_train = evaluator.evaluate(transformed_train)
auc_dev = evaluator.evaluate(transformed_dev)


print(f'Logistic Regression - AUC on train is: {auc_train * 100:.2f}')
print(f'Logistic Regression - AUC on dev is: {auc_dev * 100:.2f}')
print(f'Completed in {(end_prediction - start_train)/60:.2f} minutes')

Logistic Regression - AUC on train is: 78.19
Logistic Regression - AUC on dev is: 77.76
Completed in 3.24 minutes


#### Random Forests

In [None]:
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)

start_train = time.time()
model = rf.fit(undersampled_train)
end_train = time.time()

In [None]:
train_predictions = model.transform(oversampled_data)
test_predictions = model.transform(dev)
end_predictions = time.time()

# Calculate the AUC for train and dev
auc_train = evaluator.evaluate(train_predictions)
auc_dev = evaluator.evaluate(test_predictions)

print(f'Random Forests - AUC on train is: {auc_train * 100:.2f}')
print(f'Random Forests - AUC on dev is: {auc_dev * 100:.2f}')
print(f'Time from training to prediction is {(end_predictions-start_train)/60:.2f} minutes')

### Using Evaluate function

In [5]:
train = sqlContext.read.parquet('data/criteo.parquet.df.train.normed.filled.masked-100.encode.packed')

#dev = sqlContext.read.parquet('data/criteo.parquet.df.dev.normed.filled.masked-100.encode.packed')

In [7]:
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)

In [8]:
start_train = time.time()
model = rf.fit(train)
end_train = time.time()

print(f"Training took {(end_train - start_train)/60} minutes")

Py4JJavaError: An error occurred while calling o36.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 21 in stage 9.0 failed 1 times, most recent failure: Lost task 21.0 in stage 9.0 (TID 288, localhost, executor driver): java.io.IOException: No space left on device
	at sun.nio.ch.FileDispatcherImpl.write0(Native Method)
	at sun.nio.ch.FileDispatcherImpl.write(FileDispatcherImpl.java:60)
	at sun.nio.ch.IOUtil.writeFromNativeBuffer(IOUtil.java:93)
	at sun.nio.ch.IOUtil.write(IOUtil.java:65)
	at sun.nio.ch.FileChannelImpl.write(FileChannelImpl.java:211)
	at org.apache.spark.storage.CountingWritableChannel.write(DiskStore.scala:332)
	at java.nio.channels.Channels.writeFullyImpl(Channels.java:78)
	at java.nio.channels.Channels.writeFully(Channels.java:101)
	at java.nio.channels.Channels.access$000(Channels.java:61)
	at java.nio.channels.Channels$1.write(Channels.java:174)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
	at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:220)
	at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:173)
	at java.io.ObjectOutputStream$BlockDataOutputStream.drain(ObjectOutputStream.java:1877)
	at java.io.ObjectOutputStream$BlockDataOutputStream.writeByte(ObjectOutputStream.java:1915)
	at java.io.ObjectOutputStream.writeFatalException(ObjectOutputStream.java:1576)
	at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:351)
	at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:43)
	at org.apache.spark.serializer.SerializationStream.writeAll(Serializer.scala:140)
	at org.apache.spark.serializer.SerializerManager.dataSerializeStream(SerializerManager.scala:174)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1$$anonfun$apply$7.apply(BlockManager.scala:1174)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1$$anonfun$apply$7.apply(BlockManager.scala:1172)
	at org.apache.spark.storage.DiskStore.put(DiskStore.scala:69)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1172)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:286)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:743)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:742)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:742)
	at org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:567)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:201)
	at org.apache.spark.ml.classification.RandomForestClassifier$$anonfun$train$1.apply(RandomForestClassifier.scala:142)
	at org.apache.spark.ml.classification.RandomForestClassifier$$anonfun$train$1.apply(RandomForestClassifier.scala:120)
	at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185)
	at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:120)
	at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:46)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException: No space left on device
	at sun.nio.ch.FileDispatcherImpl.write0(Native Method)
	at sun.nio.ch.FileDispatcherImpl.write(FileDispatcherImpl.java:60)
	at sun.nio.ch.IOUtil.writeFromNativeBuffer(IOUtil.java:93)
	at sun.nio.ch.IOUtil.write(IOUtil.java:65)
	at sun.nio.ch.FileChannelImpl.write(FileChannelImpl.java:211)
	at org.apache.spark.storage.CountingWritableChannel.write(DiskStore.scala:332)
	at java.nio.channels.Channels.writeFullyImpl(Channels.java:78)
	at java.nio.channels.Channels.writeFully(Channels.java:101)
	at java.nio.channels.Channels.access$000(Channels.java:61)
	at java.nio.channels.Channels$1.write(Channels.java:174)
	at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
	at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
	at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:220)
	at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:173)
	at java.io.ObjectOutputStream$BlockDataOutputStream.drain(ObjectOutputStream.java:1877)
	at java.io.ObjectOutputStream$BlockDataOutputStream.writeByte(ObjectOutputStream.java:1915)
	at java.io.ObjectOutputStream.writeFatalException(ObjectOutputStream.java:1576)
	at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:351)
	at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:43)
	at org.apache.spark.serializer.SerializationStream.writeAll(Serializer.scala:140)
	at org.apache.spark.serializer.SerializerManager.dataSerializeStream(SerializerManager.scala:174)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1$$anonfun$apply$7.apply(BlockManager.scala:1174)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1$$anonfun$apply$7.apply(BlockManager.scala:1172)
	at org.apache.spark.storage.DiskStore.put(DiskStore.scala:69)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1172)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:286)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
