In [1]:
# Requirement 1:
    # Read from .xls
    # Write to .xls

# Excel Python codecs: (Read/Write)
    # xlrd, xlwt : For xls format decoding/encoding
    # calamine : Rust implementation with Python bindings, for decoding xls, xlsx, xlsm, xlsb, xla, xlam
    # openpyxl : For '.xlsx', '.xlsm', '.xltx', '.xltm' decoding/encoding
    # pandas: Library providing a high-level API built on top of the libraries above (and more).

# Apache Spark "DataSource" compatible codecs:
    # Crealytics Spark Excel - "com.crealytics:spark-excel_2.12:3.5.1_0.20.4"

# Most libraries interpret .xlsx as "flat" data,
# without support for operations/functions (formulas) and derived cells. (TODO: verify)

In [2]:
# Requirement 2:
  # Call api

# HTTP REST API clients:
  # requests: HTTP REST library (Use with Spark as a Python UDF?)

# Apache Spark "DataSource" compatible HTTP REST API:


In [3]:
# Requirement 3:
    # Do debugging
    # Fix errors
    # Record error handling

# Logging in Python:
    # logging library
    # print() statements

In [4]:
# Requirement 4:
    # CRUD operations (add, edit, delete) with database, Python, web UI

# Unsure how to implement HTTP servers in a distributed manner.

In [5]:
import os

from pyspark.sql import SparkSession

In [45]:
# MongoDB connection configuration.
# The URI must carry the "mongodb://" (or "mongodb+srv://") scheme; a bare
# "host:port" string is not a valid MongoDB connection string.
# Set MONGODB_URI in the environment to target another deployment without
# editing the notebook (and to keep real credentials out of it).
secrets = {
  'mongodb':{
    'connectionString': os.environ.get("MONGODB_URI", "mongodb://localhost:1001")
  }
}
connectionString = secrets['mongodb']['connectionString']

# Catalyst optimizer extensions for connectors (none registered at the moment):
extensionClasses = [
]

# Maven coordinates resolved by Spark at session start-up:
mvnPackages = [
  "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0", # MongoDB-Spark connector
  "com.crealytics:spark-excel_2.12:3.5.1_0.20.4", # Excel file format decoder
]

# Read options for Spark DataFrameReader Excel Format:

excelReadOpts = {
  
  # 'dataAddress': "A1", # Optional, default: "A1"

  'header': 'true', # Required
  'treatEmptyValuesAsNulls': 'false', # Optional, default: true

  # # Optional, default: false, where errors will be converted to null. 
  # # If true, any ERROR cell values (e.g. #N/A) will be converted to 
  # # the zero values of the column's data type.
  # 'setErrorCellsToFallbackValues': 'true',
  
  # Optional, default: false, If true, format the cells 
  # without rounding and scientific notations
  'usePlainNumberFormat': 'false',
  
  'inferSchema': 'false', # Optional, default: false
  'addColorColumns': 'true', # Optional, default: false

  # Optional, default: yyyy-mm-dd hh:mm:ss[.fffffffff]
  'timestampFormat': 'MM-dd-yyyy HH:mm:ss', 

  # # Optional, default None. If set, uses a streaming reader 
  # # which can help with big files (will fail if used with .xls format files)
  # 'maxRowsInMemory': 20,

  # # Optional, default None. 
  # # See https://poi.apache.org/apidocs/5.0/org/apache/poi/util/IOUtils.html#setByteArrayMaxOverride-int-
  # 'maxByteArraySize': 2147483647,

  # Optional, default None. Number of bytes at which a 
  # zip entry is regarded as too large for holding in 
  # memory and the data is put in a temp file instead
  'tempFileThreshold': 10000000,

  # Optional, default: 10. If set and if schema inferred, 
  # number of rows to infer schema from
  'excerptSize': 10,
  
  # # Optional, default None. 
  # # Requires unlimited strength JCE for older JVMs
  # 'workbookPassword': 'pass'
}

# Python dependencies to ship alongside the job: .zip Python packages ('egg')
# or .py file dependencies.
# Author's note: this mechanism does not accept packages built as "wheels",
# and therefore cannot include dependencies with native code.
# NOTE(review): this list is not passed to the session builder below yet, and
# the single entry is an empty placeholder.
pyPackagePaths = [
  "", # For making REST API calls
]

# Spark Session configuration:
# The same MongoDB URI is used for reads and writes; previously only the read
# URI was set even though a write database is configured.
spark = (
  SparkSession
  .builder
  .master('local[8]')
  .appName("API->DB")
  .config("spark.executor.memory", "4g")
  .config("spark.driver.memory", "8g")
  .config("spark.jars.packages", ','.join(mvnPackages))
  .config("spark.sql.extensions", ','.join(extensionClasses))
  .config("spark.mongodb.read.connection.uri", connectionString)
  .config("spark.mongodb.write.connection.uri", connectionString)
  .config("spark.mongodb.read.database", "api")
  .config("spark.mongodb.write.database", "api")
  .config("spark.mongodb.read.readPreference.name", "nearest")
  .getOrCreate()
)


In [49]:
# Load one worksheet of an Excel workbook (local path or HDFS) into a DataFrame.

# Workbook location, and the sheet/cell at which the data region starts.
readPath = "venkata_tasks/SampleData.xls"
dataAddress = "'SalesOrders'!A1"

# Alternative sample workbook:
# readPath = "venkata_tasks/fsi-2006.xlsx"
# dataAddress = "A1"

# The explicit dataAddress is set first, then the shared Excel read options
# are applied on top of it.
df = (
  spark.read
  .format('excel')
  .option("dataAddress", dataAddress)
  .options(**excelReadOpts)
  .load(readPath)
)

df.printSchema()

root
 |-- OrderDate: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Rep: string (nullable = true)
 |-- Item: string (nullable = true)
 |-- Units: string (nullable = true)
 |-- Unit Cost: string (nullable = true)
 |-- Total: string (nullable = true)



In [50]:
# Number of rows in the DataFrame loaded from the Excel sheet.
# Leftover exploratory query, kept for reference; "country" is not among the
# columns printed for the SalesOrders sheet, so it presumably targets the
# alternative workbook — TODO confirm or remove:
# df.select("country").orderBy("country").show(10,truncate=False)
df.count()

24/07/08 23:35:29 WARN ExcelHeaderChecker: Number of column in Excel header is not equal to number of fields in the schema:
 Header length: 7, schema size: 0
Excel file: file:///Users/sounak/Personal/assignments/venkata_tasks/SampleData.xls


43