# spark-csv-to-parquet

Converts a CSV file with a header row to Parquet using Apache Spark.

In [11]:
# Runs before the pyspark imports in the next cell.
# NOTE(review): findspark.init() is presumably what makes the local Spark
# installation's pyspark package importable -- confirm against findspark docs.
import findspark
findspark.init()

In [12]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os
import sys
import logging
import re

In [13]:
# Component parameters. Each one is read from the environment variable of
# the same name and falls back to the default shown when it is not set.

# source CSV file name, relative to data_dir
data_csv = os.getenv('data_csv', 'data.csv')

# destination parquet file name, relative to data_dir
output_data_parquet = os.getenv('output_data_parquet', 'data.parquet')

# Spark master URL; the default runs Spark in local mode
master = os.getenv('master', "local[*]")

# temporary data storage directory used for local execution
data_dir = os.getenv('data_dir', '../data/')

In [14]:
# override parameters received from a potential call using %run magic
def parse_parameters(argv):
    """Return the ``key=value`` overrides found in *argv*.

    An argument is accepted when the text before its first ``=`` is a plain
    identifier (letters, digits and underscores, not starting with a digit).
    Everything after that first ``=`` is taken verbatim as the value, so
    values may themselves contain ``=``, quotes or any other character.
    Arguments without ``=`` or with a non-identifier key (for example
    ``--ip=127.0.0.1``) are ignored.

    Args:
        argv: iterable of command-line argument strings.

    Returns:
        A list of ``(key, value)`` string pairs in the order they appeared.
    """
    overrides = []
    for argument in argv:
        key, separator, value = argument.partition('=')
        if separator and re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', key):
            overrides.append((key, value))
    return overrides


# Human-readable form of each accepted override, e.g. 'data_csv="input.csv"'.
parameters = [
    '{}="{}"'.format(key, value) for key, value in parse_parameters(sys.argv)
]

for parameter in parameters:
    logging.warning('Parameter: %s', parameter)

# Bind every override as a plain string. The values are assigned directly and
# never passed to exec(), so argument text cannot run as code. Later
# duplicates of the same key win, matching in-order assignment.
globals().update(dict(parse_parameters(sys.argv)))

In [15]:
# File whose presence means the conversion has already been done. It defaults
# to output_data_parquet, the destination used by the final write, so the
# existence check and the write cannot point at different files. An explicit
# 'data_parquet' environment variable still takes precedence.
data_parquet = os.environ.get('data_parquet', output_data_parquet)
# Always bind skip (True or False) so that it is defined even when the file
# does not exist.
skip = os.path.exists(data_dir + data_parquet)

In [16]:
# skip is True exactly when data_dir + data_parquet already exists on disk;
# the Spark cells below only run when it is False.
skip = os.path.exists(data_dir + data_parquet)

In [17]:
if not skip:
    # Get the existing SparkContext or create one configured for `master`,
    # then get or create the SparkSession.
    sc = SparkContext.getOrCreate(conf=SparkConf().setMaster(master))
    spark = SparkSession.builder.getOrCreate()

In [18]:
if not skip:
    # Read the source CSV with the 'header' option enabled.
    df = (
        spark.read
        .option('header', 'true')
        .csv(data_dir + data_csv)
    )

In [19]:
if not skip:
    # Write the DataFrame as parquet to data_dir + output_data_parquet.
    df.write.parquet(
        data_dir + output_data_parquet
    )