In [1]:
from pyspark.sql import SparkSession

# Local Spark session (one worker thread) used by every cell below.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("NORD_Task")
    .getOrCreate()
)
sc = spark.sparkContext

# Bucket holding the files to analyse and the AWS region used to build the endpoint.
S3_BUCKET = 's3-nord-challenge-data'
S3_REGION = 'eu-central-1'

# Hadoop S3A settings: route the s3 scheme through the S3A file system, use the
# anonymous credentials provider and talk to the regional endpoint.
hadoop_conf = sc._jsc.hadoopConfiguration()
for conf_key, conf_value in (
    ("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider"),
    ("fs.s3a.endpoint", f"s3.{S3_REGION}.amazonaws.com"),
):
    hadoop_conf.set(conf_key, conf_value)
# see http://blog.encomiabile.it/2015/10/29/apache-spark-amazon-s3-and-apache-mesos/
# hadoop_conf.set("fs.s3a.connection.maximum", "100000")


#number of files to process - will be read as input
N = 100

:: loading settings :: url = jar:file:/usr/local/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
com.amazonaws#aws-java-sdk added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
mysql#mysql-connector-java added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-5df16b46-a813-4e06-8684-8c75d68af04f;1.0
	confs: [default]
	found com.amazonaws#aws-java-sdk;1.11.901 in central
	found com.amazonaws#aws-java-sdk-appregistry;1.11.901 in central
	found com.amazonaws#aws-java-sdk-core;1.11.901 in central
	found commons-logging#commons-logging;1.1.3 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found commons-codec#commons-codec;1.11 in central
	found software.amazon.ion#ion-java;1.0.2 in central
	found com.fasterxml.jackson.core#jackson-databind;2.6.7.3 in central
	found com.fasterxml.jackson.core#jackson-annotations;2.6.0 in central
	found com.fasterxml.jacks

	found com.amazonaws#aws-java-sdk-cloud9;1.11.901 in central
	found com.amazonaws#aws-java-sdk-serverlessapplicationrepository;1.11.901 in central
	found com.amazonaws#aws-java-sdk-alexaforbusiness;1.11.901 in central
	found com.amazonaws#aws-java-sdk-resourcegroups;1.11.901 in central
	found com.amazonaws#aws-java-sdk-comprehend;1.11.901 in central
	found com.amazonaws#aws-java-sdk-translate;1.11.901 in central
	found com.amazonaws#aws-java-sdk-sagemaker;1.11.901 in central
	found com.amazonaws#aws-java-sdk-iotjobsdataplane;1.11.901 in central
	found com.amazonaws#aws-java-sdk-sagemakerruntime;1.11.901 in central
	found com.amazonaws#aws-java-sdk-kinesisvideo;1.11.901 in central
	found io.netty#netty-codec-http;4.1.48.Final in central
	found io.netty#netty-common;4.1.48.Final in central
	found io.netty#netty-buffer;4.1.48.Final in central
	found io.netty#netty-transport;4.1.48.Final in central
	found io.netty#netty-resolver;4.1.48.Final in central
	found io.netty#netty-codec;4.1.48.Fi

downloading https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.901/aws-java-sdk-bundle-1.11.901.jar ...
	[SUCCESSFUL ] com.amazonaws#aws-java-sdk-bundle;1.11.901!aws-java-sdk-bundle.jar (307133ms)
downloading https://repo1.maven.org/maven2/org/wildfly/openssl/wildfly-openssl/1.0.7.Final/wildfly-openssl-1.0.7.Final.jar ...
	[SUCCESSFUL ] org.wildfly.openssl#wildfly-openssl;1.0.7.Final!wildfly-openssl.jar (600ms)
downloading https://repo1.maven.org/maven2/com/mysql/mysql-connector-j/8.0.31/mysql-connector-j-8.0.31.jar ...
	[SUCCESSFUL ] com.mysql#mysql-connector-j;8.0.31!mysql-connector-j.jar (3947ms)
downloading https://repo1.maven.org/maven2/com/google/protobuf/protobuf-java/3.19.4/protobuf-java-3.19.4.jar ...
	[SUCCESSFUL ] com.google.protobuf#protobuf-java;3.19.4!protobuf-java.jar(bundle) (2206ms)
:: resolution report :: resolve 8084ms :: artifacts dl 314064ms
	:: modules in use:
	com.amazonaws#aws-java-sdk;1.11.901 from central in [default]
	com.amazonaws#aws-java

	254 artifacts copied, 0 already retrieved (379614kB/438ms)
22/12/04 04:13:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


### Note 
In general I can see 2 approaches to load files data:
   - Approach 1.
     - `spark.read.format('binaryFile').option("pathGlobFilter","<path-glob>").load(<s3-bucket>)`. This solution reads all files, together with their metadata, into a single DataFrame (path, modification time, length, content).
     - parse the content of each file in an appropriate transformation of the resulting DataFrame
   - The advantage is that you get a parallelized DataFrame, and the content of each file is read lazily while that file is being processed. So, in theory, on a big enough cluster Spark should take care of distribution and performance for you. The problem appears when you have to read millions of files of unknown size: you may end up with huge memory and performance issues. This problem is shown e.g. in [this blog article](https://wrightturn.wordpress.com/2015/07/22/getting-spark-data-from-aws-s3-using-boto-and-pyspark/). Although it's pretty old, I did not find any more recent solution to the issue. It also describes the second approach.
   - Approach 2.
       - list all the objects you are interested in from the S3 bucket into a collection (but without parallelizing it)
       - create a parallelized DataFrame based on that collection
       - read and process the file content as part of the transformations
   - The bottleneck might be that you have to iterate over millions of files, so the size of the collection to be processed (on one node) might be huge.
   
As I am not able to test on a large set of data and a big enough Spark cluster which approach is more efficient, I am going to use the approach described in the [mentioned article](https://wrightturn.wordpress.com/2015/07/22/getting-spark-data-from-aws-s3-using-boto-and-pyspark). However, instead of using boto3 for listing all objects, I will use [`hadoop.fs.path.getFilesystem.globStatus`](https://stackoverflow.com/a/67050173/2018369), because boto3 [does not seem to be the most efficient way](https://stackoverflow.com/q/69920805/2018369) to get the file list.

I was also considering one more approach, which, however, I could not find a good way to implement. The idea was to create a DataFrame similar to the one created by `spark.read.format('binaryFile').option("pathGlobFilter","<path-glob>").load(<s3-bucket>)`, but containing only a prefix of each file (the first 1024 or 2048 bytes). This way we would have a DataFrame (path, mod time, length, PE headers): we could process the header of each file to get all the required PE metadata apart from imports/exports, and in the next step load the appropriate sections of the file to get the imports/exports.


In [2]:
# Glob patterns (relative to the bucket root) for the two file groups; `*.???`
# matches names that end in a dot followed by exactly three characters.
clean_path = '/0/*.???'
malware_path = '/1/*.???'

# JVM-side Hadoop Path class, reached through the py4j gateway.
HadoopPath = sc._jvm.org.apache.hadoop.fs.Path

# Listing for the "clean" pattern: resolve the file system for the path, then
# expand the glob into an array of file-status objects.
clean_uri = f's3a://{S3_BUCKET}{clean_path}'
cleanPath = HadoopPath(clean_uri)
cFs = cleanPath.getFileSystem(hadoop_conf)
clean_files = cFs.globStatus(cleanPath)

# Same steps for the "malware" pattern.
malware_uri = f's3a://{S3_BUCKET}{malware_path}'
malwarePath = HadoopPath(malware_uri)
mFs = malwarePath.getFileSystem(hadoop_conf)
malware_files = mFs.globStatus(malwarePath)

22/12/04 04:14:54 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


In [None]:
import random

# Fixed seed so that Restart & Run All picks the same files every time;
# set to None to draw a fresh sample on each run.
SAMPLE_SEED = 42
rng = random.Random(SAMPLE_SEED)

print(len(malware_files))

# Split N between the two groups (the clean group gets the extra file when N is
# odd) and never request more files than a listing holds: random.sample raises
# ValueError when the sample is larger than the population.
n_malware = min(N // 2, len(malware_files))
n_clean = min(N - N // 2, len(clean_files))
files_to_process = rng.sample(clean_files, n_clean) + rng.sample(malware_files, n_malware)
print(len(files_to_process))

In [None]:
# Sanity check on the first clean-file status: raw URI path, file name, length.
stat = clean_files[0]
stat_path = stat.getPath()
print(stat_path.toUri().getRawPath(), stat_path.getName(), stat.getLen(), sep='\n')

In [None]:
# put files into dataFrame

# pyspark.sql.types has no `IntType`; the integer types are IntegerType / LongType.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType

# One row per sampled file. `size` carries the integer returned by getLen(),
# so it must be a numeric column: createDataFrame rejects an int for a
# StringType field during schema verification.
schema = StructType([
    StructField('path', StringType(), True),
    StructField('size', LongType(), True),
    StructField('type', StringType(), True)
])

# (raw URI path, length, text after the last '.' in the file name)
data = [
    (f.getPath().toUri().getRawPath(), f.getLen(), f.getPath().getName().split('.')[-1])
    for f in files_to_process
]

# make sure we don't have duplicates
filesDF = spark.createDataFrame(data=data, schema=schema).distinct()
filesDF.show()

In [None]:
# create table in database if not exists
#
# Spark SQL cannot run DDL against MySQL, so the table is created through the
# JDBC writer instead: appending an EMPTY DataFrame creates the table when it
# is missing and leaves an existing table (and its rows) untouched.
# NOTE(review): column DEFAULT clauses cannot be expressed this way; all
# columns are created nullable without defaults.
import os

from pyspark.sql.types import IntegerType, LongType, StringType, StructField, StructType

jdbc_url = 'jdbc:mysql://db/nord_files'

# Connection settings shared by every JDBC read/write of the results table.
# Credentials come from the environment when set; the fallbacks are the values
# this cell used before and are only suitable for a local sandbox database.
jdbc_options = {
    'url': jdbc_url,
    'driver': 'com.mysql.cj.jdbc.Driver',  # Connector/J 8.x driver class
    'dbtable': 'files_info',
    'user': os.environ.get('MYSQL_USER', 'root'),
    'password': os.environ.get('MYSQL_PASSWORD', 'password'),
}

# Column layout of the results table.
files_info_schema = StructType([
    StructField('path', StringType(), True),
    StructField('size', LongType(), True),
    StructField('type', StringType(), True),
    StructField('architecture', StringType(), True),
    StructField('imports', IntegerType(), True),
    StructField('exports', IntegerType(), True)
])

(
    spark.createDataFrame([], files_info_schema)
    .write.format('jdbc')
    .options(**jdbc_options)
    # Use bounded VARCHAR columns instead of the writer's default string mapping.
    .option('createTableColumnTypes',
            'path VARCHAR(1024), type VARCHAR(16), architecture VARCHAR(32)')
    .mode('append')
    .save()
)


In [None]:
# process files 

from s3file_reader import S3FileReader
from pe_parser import PeParser
#import importlib
#importlib.reload(S3FileReader)

reader = S3FileReader(S3_BUCKET, S3_REGION)

# Result schema: the filesDF columns followed by three nullable string columns.
extra_fields = [
    StructField(field_name, StringType(), True)
    for field_name in ('architecture', 'imports', 'exports')
]
schema = StructType(filesDF.schema.fields + extra_fields)


def _with_short_meta(row):
    """Return the row's values followed by the values from PeParser.get_short_meta().

    Presumably get_short_meta() yields (architecture, imports, exports) to line
    up with the schema above - TODO confirm against pe_parser.
    """
    file_path = row['path']
    parser = PeParser(file_path, reader.get_file_stream(file_path), row['size'])
    return (*row, *parser.get_short_meta())


#TODO: This should be fixed
parsed = filesDF.rdd.map(_with_short_meta)

parsedDF = parsed.toDF(schema)


In [None]:
# process files 

# Imported here so this cell does not depend on names from an earlier cell;
# note that the integer type is IntegerType (pyspark has no `IntType`).
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

from s3file_reader import S3FileReader
from pe_parser import PeParser
#import importlib
#importlib.reload(S3FileReader)

reader = S3FileReader(S3_BUCKET, S3_REGION)

# Result schema: the filesDF columns plus the PE metadata columns.
schema = StructType(filesDF.schema.fields+[
    StructField('architecture', StringType(), True),
    StructField('imports', IntegerType(), True),
    StructField('exports', IntegerType(), True)
])
#TODO: This should be fixed
# For every row, append the values from PeParser(...).get_short_meta() to the
# row's own values.
# NOTE(review): assumes get_short_meta() returns (architecture, imports, exports)
# in that order, with integer counts - confirm against pe_parser.
parsed = filesDF.rdd.map(
    lambda x: (
        *x,
        *PeParser(x['path'], reader.get_file_stream(x['path']), x['size']).get_short_meta(),
    )
)

parsedDF = parsed.toDF(schema)


In [None]:
#TODO: store to database