In [47]:
from pyspark.sql import SparkSession

# Tunable inputs for the run, grouped up front.
N = 100                  # number of files to process - will be read as input
region = 'eu-central-1'  # used to build the S3 endpoint host name below

# Single-threaded local Spark session; getOrCreate() reuses a running session.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("NORD_Task")
    .getOrCreate()
)
sc = spark.sparkContext

# Hadoop settings applied in order: route s3:// through the S3A connector,
# use anonymous credentials, and point S3A at the regional endpoint.
hadoop_conf = sc._jsc.hadoopConfiguration()
for _option, _setting in (
    ("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider"),
    ("fs.s3a.endpoint", f"s3.{region}.amazonaws.com"),
):
    hadoop_conf.set(_option, _setting)

# Optional tuning left disabled, see
# http://blog.encomiabile.it/2015/10/29/apache-spark-amazon-s3-and-apache-mesos/
# hadoop_conf.set("fs.s3a.connection.maximum", "100000")

### Note 
In general, I can see two approaches to loading the file data:
   - Approach 1.
     - `spark.read.format('binaryFile').option("pathGlobFilter","<path-glob>").load(<s3-bucket>)`. This solution reads all files, together with their metadata, into a single DataFrame (path, modification time, length, content)
     - parse the content of each file in an appropriate transformation of the resulting DataFrame
   - The advantage is that you get a parallelized DataFrame, and the content of each file is read lazily while that file is being processed. So, in theory, on a big enough cluster Spark should take care of distribution and performance for you. The problem appears when you have to read millions of files of unknown size: you may end up with huge memory and performance issues. This problem is shown e.g. in [this blog article](https://wrightturn.wordpress.com/2015/07/22/getting-spark-data-from-aws-s3-using-boto-and-pyspark/). Although it is pretty old, I did not find any more recent solution to the issue. It also describes the second approach.
   - Approach 2.
       - list all the objects (files) you are interested in from the S3 bucket into some collection (but without parallelizing it)
       - create a parallelized DataFrame based on that collection
       - read and process the file content as part of the transformations
   - The bottleneck might be that you have to iterate over millions of files, so the size of the collection to be processed (on one node) might be huge.
   
As I am not able to test on a large data set and a big enough Spark cluster which approach is more efficient, I am going to use the approach described in the [mentioned article](https://wrightturn.wordpress.com/2015/07/22/getting-spark-data-from-aws-s3-using-boto-and-pyspark). However, instead of using boto3 to list all objects, I will use [`hadoop.fs.path.getFilesystem.globStatus`](https://stackoverflow.com/a/67050173/2018369) because boto3 [does not seem to be the most effective way](https://stackoverflow.com/q/69920805/2018369) to get the file list.

I was also considering one more approach, but I could not find a good way to implement it. The idea was to create a DataFrame similar to the one created by `spark.read.format('binaryFile').option("pathGlobFilter","<path-glob>").load(<s3-bucket>)`, but containing only the prefix of each file (the first 1024 or 2048 bytes). This way we would have a DataFrame (path, mod time, length, PE headers): we could process the header of each file to get all the required PE metadata apart from imports/exports, and in the next step we could load the appropriate sections of the file to get the imports/exports.


In [41]:
# Glob over both top-level folders ("0" and "1"), keeping only names that end
# in a dot followed by exactly three characters.
_bucket_uri = 's3a://s3-nord-challenge-data'
nord_path = f'{_bucket_uri}/[01]/*.???'
# Alternative without the extension filter:
#nord_path = 's3a://s3-nord-challenge-data/[01]/*'

In [61]:
# Glob patterns for the two folders: "0" feeds clean_files, "1" feeds malware_files.
clean_path = 's3a://s3-nord-challenge-data/0/*.???'
malware_path = 's3a://s3-nord-challenge-data/1/*.???'

# Build the Hadoop Path objects first so the filesystem can be derived from one
# of them. The previous version called getFileSystem() on `hadoopPath`, a name
# no cell defines, so it only worked with leftover kernel state.
cleanPath = sc._jvm.org.apache.hadoop.fs.Path(clean_path)
malwarePath = sc._jvm.org.apache.hadoop.fs.Path(malware_path)

# Both patterns live in the same bucket, so one filesystem handle serves both.
hadoopFs = cleanPath.getFileSystem(hadoop_conf)

# globStatus lists the matching objects without reading their content.
clean_files = hadoopFs.globStatus(cleanPath)
malware_files = hadoopFs.globStatus(malwarePath)

In [65]:
import random

# Fixed seed so a fresh "Restart & Run All" selects the same files every time.
SAMPLE_SEED = 42
rng = random.Random(SAMPLE_SEED)

print(len(malware_files))

# Split N between the two folders. Integer division replaces int(N/2); the odd
# remainder goes to the malware half so the total is N rather than N - 1.
# Each half is capped at the number of files actually listed, because
# sample() raises ValueError when asked for more items than the population has.
clean_count = min(N // 2, len(clean_files))
malware_count = min(N - N // 2, len(malware_files))
files_to_process = rng.sample(clean_files, clean_count) + rng.sample(malware_files, malware_count)

14652


In [68]:
# Inspect the first listed clean file: raw URI path, bare file name, and length.
stat = clean_files[0]
first_path = stat.getPath()
print(first_path.toUri().getRawPath())
print(first_path.getName())
print(stat.getLen())

s3a://s3-nord-challenge-data/0/00ELuByj9iSRf5Rx11Ypl15N6kS2FXmW.dll
/0/00ELuByj9iSRf5Rx11Ypl15N6kS2FXmW.dll
00ELuByj9iSRf5Rx11Ypl15N6kS2FXmW.dll
3919


In [76]:
# put files into dataFrame

from pyspark.sql.types import LongType, StructType, StructField, StringType

# One row per selected file.
#   path   - raw URI path of the object (as returned by toUri().getRawPath())
#   length - value of getLen(); stored as a number so it can be compared,
#            sorted and aggregated (it was previously declared as a string)
#   type   - text after the last '.' in the file name
schema = StructType([
    StructField('path', StringType(), True),
    StructField('length', LongType(), True),
    StructField('type', StringType(), True)
])
data = [
    (f.getPath().toUri().getRawPath(), f.getLen(), f.getPath().getName().split('.')[-1])
    for f in files_to_process
]
filesDF = spark.createDataFrame(data=data, schema=schema)
filesDF.show()

[('/0/PXhzIGhqUe6rGif12xVsSSt4sBeSJ2QP.dll', 670032, 'dll'), ('/0/Bmg9xtsxAAfvmSHQmEOp1OiaQp76Q7Jy.dll', 1905152, 'dll'), ('/0/rIfed0htmYlwubNx7v7c0rNHMHg9PPJG.dll', 97872, 'dll'), ('/0/cHrgjOhFryTnqPuGDcOxU1rdFNVsGMvo.dll', 1658368, 'dll'), ('/0/PfFrpPSMATAEELBUMmb68AdAOek5qvQS.exe', 1128544, 'exe'), ('/0/408ak9Mx7JVIyRpOp0ZxZsthSibqf6Kw.exe', 129024, 'exe'), ('/0/PNc2CQjAsZ4ui7jCvoOpbDBJy3dTVZ55.dll', 17408, 'dll'), ('/0/nD1Kl1GVOnkSJI3erL0DK6PE5rTABSrM.exe', 86528, 'exe'), ('/0/E84kbW9mr7pDYYoinYOrEWn5YiQ05TQV.dll', 443571, 'dll'), ('/0/ZMfx4HRFZ952QimmrcxBWaCIQ2czgI8j.dll', 53840, 'dll'), ('/0/pp0fQ9i9yHfZYNonkeksqBYl3OByG9qd.dll', 492941, 'dll'), ('/0/H3xeXcrAV9D5w6CAcRbW6IqPe8sXWpb6.dll', 22833, 'dll'), ('/0/QYHi94rnlyMMNHVsuV1bg0Kif3WRw0fT.exe', 15389, 'exe'), ('/0/X3gMVFz8bns0XmBoNG3g4xhomgftBJhH.dll', 446976, 'dll'), ('/0/g6JNoxodcclCON5cDCH7nOLzY4tJEX2D.dll', 115, 'dll'), ('/0/usnNIDzEr7bRLPcynYmRGf9fu25PVCrU.dll', 357888, 'dll'), ('/0/a2yPEtghms1nVB2heexvCWJYIlu0O5mu.dll', 5

In [84]:
import boto3

ModuleNotFoundError: No module named 'boto3'

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 38896)
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/local/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/local/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/local/lib/python3.10/socketserver.py", line 747, in __init__
    self.handle()
  File "/usr/local/lib/python3.10/site-packages/pyspark/accumulators.py", line 262, in handle
    poll(accum_updates)
  File "/usr/local/lib/python3.10/site-packages/pyspark/accumulators.py", line 235, in poll
    if func():
  File "/usr/local/lib/python3.10/site-packages/pyspark/accumulators.py", line 239, in accum_updates
    num_updates = read_