/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.connect.client

import java.io.InputStream
import java.net.URI
import java.nio.file.{Files, Path, Paths}
import java.util.zip.{CheckedInputStream, CRC32}

import scala.collection.mutable
import scala.concurrent.Promise
import scala.concurrent.duration.Duration
import scala.util.control.NonFatal

import Artifact._
import com.google.protobuf.ByteString
import io.grpc.ManagedChannel
import io.grpc.stub.StreamObserver

import org.apache.spark.connect.proto
import org.apache.spark.connect.proto.AddArtifactsResponse
import org.apache.spark.connect.proto.AddArtifactsResponse.ArtifactSummary
import org.apache.spark.util.{ThreadUtils, Utils}

/**
* The Artifact Manager is responsible for handling and transferring artifacts from the local
* client to the server (local/remote).
* @param userContext
*   The user context attached to every request sent to the server.
* @param channel
*   The gRPC channel used to communicate with the Spark Connect server.
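*
* A minimal usage sketch (assuming a `ManagedChannel` and `UserContext` have already been
* built by the surrounding client; the jar path below is hypothetical):
* {{{
*   val artifactManager = new ArtifactManager(userContext, channel)
*   artifactManager.addArtifact("/path/to/my-udfs.jar")
* }}}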
*/
class ArtifactManager(userContext: proto.UserContext, channel: ManagedChannel) {
// Using the midpoint recommendation of 32KiB for chunk size as specified in
// https://github.com/grpc/grpc.github.io/issues/371.
private val CHUNK_SIZE: Int = 32 * 1024
private[this] val stub = proto.SparkConnectServiceGrpc.newStub(channel)
/**
* Add a single artifact to the session.
*
* Currently only local files with extensions .jar and .class are supported.
*/
def addArtifact(path: String): Unit = {
addArtifact(Utils.resolveURI(path))
}
private def parseArtifacts(uri: URI): Seq[Artifact] = {
// Currently only local files with extensions .jar and .class are supported.
uri.getScheme match {
case "file" =>
val path = Paths.get(uri)
val artifact = path.getFileName.toString match {
case jar if jar.endsWith(".jar") =>
newJarArtifact(path.getFileName, new LocalFile(path))
case cf if cf.endsWith(".class") =>
newClassArtifact(path.getFileName, new LocalFile(path))
case other =>
throw new UnsupportedOperationException(s"Unsuppoted file format: $other")
}
Seq[Artifact](artifact)
case other =>
throw new UnsupportedOperationException(s"Unsupported scheme: $other")
}
}
/**
* Add a single artifact to the session.
*
* Currently only local files with extensions .jar and .class are supported.
*/
def addArtifact(uri: URI): Unit = addArtifacts(parseArtifacts(uri))
/**
* Add multiple artifacts to the session.
*
* Currently only local files with extensions .jar and .class are supported.
*/
def addArtifacts(uris: Seq[URI]): Unit = addArtifacts(uris.flatMap(parseArtifacts))
/**
* Add a number of artifacts to the session.
*/
private def addArtifacts(artifacts: Iterable[Artifact]): Unit = {
val promise = Promise[Seq[ArtifactSummary]]
val responseHandler = new StreamObserver[proto.AddArtifactsResponse] {
private val summaries = mutable.Buffer.empty[ArtifactSummary]
override def onNext(v: AddArtifactsResponse): Unit = {
v.getArtifactsList.forEach { summary =>
summaries += summary
}
}
override def onError(throwable: Throwable): Unit = {
promise.failure(throwable)
}
override def onCompleted(): Unit = {
promise.success(summaries.toSeq)
}
}
val stream = stub.addArtifacts(responseHandler)
val currentBatch = mutable.Buffer.empty[Artifact]
var currentBatchSize = 0L
def addToBatch(dep: Artifact, size: Long): Unit = {
currentBatch += dep
currentBatchSize += size
}
def writeBatch(): Unit = {
addBatchedArtifacts(currentBatch.toSeq, stream)
currentBatch.clear()
currentBatchSize = 0
}
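// Illustration (hypothetical sizes, CHUNK_SIZE = 32 KiB): artifacts of 10 KiB, 15 KiB and
// 20 KiB are sent as two batched requests ([10 KiB, 15 KiB] and [20 KiB]) because adding the
// third artifact would push the batch over 32 KiB, while a 1 MiB artifact is always sent
// separately through addChunkedArtifact.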
artifacts.iterator.foreach { artifact =>
val data = artifact.storage
val size = data.size
if (size > CHUNK_SIZE) {
// Payload can either be a batch OR a single chunked artifact. Write batch if non-empty
// before chunking current artifact.
if (currentBatch.nonEmpty) {
writeBatch()
}
addChunkedArtifact(artifact, stream)
} else {
if (currentBatchSize + size > CHUNK_SIZE) {
writeBatch()
}
addToBatch(artifact, size)
}
}
if (currentBatch.nonEmpty) {
writeBatch()
}
stream.onCompleted()
ThreadUtils.awaitResult(promise.future, Duration.Inf)
// TODO(SPARK-42658): Handle responses containing CRC failures.
}
/**
* Add a batch of artifacts to the stream. All the artifacts in this call are packaged into a
* single [[proto.AddArtifactsRequest]].
*/
private def addBatchedArtifacts(
artifacts: Seq[Artifact],
stream: StreamObserver[proto.AddArtifactsRequest]): Unit = {
val builder = proto.AddArtifactsRequest
.newBuilder()
.setUserContext(userContext)
artifacts.foreach { artifact =>
val in = new CheckedInputStream(artifact.storage.asInstanceOf[LocalData].stream, new CRC32)
try {
val data = proto.AddArtifactsRequest.ArtifactChunk
.newBuilder()
.setData(ByteString.readFrom(in))
.setCrc(in.getChecksum.getValue)
builder.getBatchBuilder
.addArtifactsBuilder()
.setName(artifact.path.toString)
.setData(data)
.build()
} catch {
case NonFatal(e) =>
stream.onError(e)
throw e
} finally {
in.close()
}
}
stream.onNext(builder.build())
}
/**
* Read up to `CHUNK_SIZE` bytes from an [[InputStream]] and convert them into a
* protobuf-compatible [[ByteString]].
* @param in
*   the input stream to read from.
* @return
*   a [[ByteString]] holding at most `CHUNK_SIZE` bytes, empty once the stream is exhausted.
*/
private def readNextChunk(in: InputStream): ByteString = {
val buf = new Array[Byte](CHUNK_SIZE)
var bytesRead = 0
var count = 0
while (count != -1 && bytesRead < CHUNK_SIZE) {
count = in.read(buf, bytesRead, CHUNK_SIZE - bytesRead)
if (count != -1) {
bytesRead += count
}
}
if (bytesRead == 0) ByteString.empty()
else ByteString.copyFrom(buf, 0, bytesRead)
}
/**
* Add an artifact in chunks to the stream. The artifact's data is spread out over multiple
* [[proto.AddArtifactsRequest requests]].
*/
private def addChunkedArtifact(
artifact: Artifact,
stream: StreamObserver[proto.AddArtifactsRequest]): Unit = {
val builder = proto.AddArtifactsRequest
.newBuilder()
.setUserContext(userContext)
val in = new CheckedInputStream(artifact.storage.asInstanceOf[LocalData].stream, new CRC32)
try {
// First RPC contains the `BeginChunkedArtifact` payload (`begin_chunk`).
// Subsequent RPCs contain the `ArtifactChunk` payload (`chunk`).
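// For example (hypothetical size), a 70 KiB artifact with 32 KiB chunks is sent as three
// requests: one `begin_chunk` carrying the metadata and the first 32 KiB of data, followed
// by two `chunk` requests carrying the rest.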
val artifactChunkBuilder = proto.AddArtifactsRequest.ArtifactChunk.newBuilder()
var dataChunk = readNextChunk(in)
// Integer division that rounds up to the nearest whole number.
def getNumChunks(size: Long): Long = (size + (CHUNK_SIZE - 1)) / CHUNK_SIZE
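// e.g. getNumChunks(1) == 1, getNumChunks(32768) == 1, getNumChunks(32769) == 2.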
builder.getBeginChunkBuilder
.setName(artifact.path.toString)
.setTotalBytes(artifact.size)
.setNumChunks(getNumChunks(artifact.size))
.setInitialChunk(
artifactChunkBuilder
.setData(dataChunk)
.setCrc(in.getChecksum.getValue))
stream.onNext(builder.build())
in.getChecksum.reset()
builder.clearBeginChunk()
dataChunk = readNextChunk(in)
// Consume stream in chunks until there is no data left to read.
while (!dataChunk.isEmpty) {
artifactChunkBuilder.setData(dataChunk).setCrc(in.getChecksum.getValue)
builder.setChunk(artifactChunkBuilder.build())
stream.onNext(builder.build())
in.getChecksum.reset()
builder.clearChunk()
dataChunk = readNextChunk(in)
}
} catch {
case NonFatal(e) =>
stream.onError(e)
throw e
} finally {
in.close()
}
}
}

class Artifact private (val path: Path, val storage: LocalData) {
require(!path.isAbsolute, s"Bad path: $path")
lazy val size: Long = storage match {
case localData: LocalData => localData.size
}
}

object Artifact {
val CLASS_PREFIX: Path = Paths.get("classes")
val JAR_PREFIX: Path = Paths.get("jars")
def newJarArtifact(fileName: Path, storage: LocalData): Artifact = {
newArtifact(JAR_PREFIX, ".jar", fileName, storage)
}
def newClassArtifact(fileName: Path, storage: LocalData): Artifact = {
newArtifact(CLASS_PREFIX, ".class", fileName, storage)
}
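// For example, given some LocalData `data` and a hypothetical file name,
// newJarArtifact(Paths.get("my-lib.jar"), data) resolves (via newArtifact below) to an
// Artifact at the relative path "jars/my-lib.jar".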
private def newArtifact(
prefix: Path,
requiredSuffix: String,
fileName: Path,
storage: LocalData): Artifact = {
require(!fileName.isAbsolute)
require(fileName.toString.endsWith(requiredSuffix))
new Artifact(prefix.resolve(fileName), storage)
}
/**
* Payload stored on this machine.
*/
sealed trait LocalData {
def stream: InputStream
def size: Long
}
/**
* Payload stored in a local file.
*/
class LocalFile(val path: Path) extends LocalData {
override def size: Long = Files.size(path)
override def stream: InputStream = Files.newInputStream(path)
}
}