In [0]:
# Reading Data from Files
import datetime
import pyspark.sql.functions as F
from pyspark.sql import Row
import pandas as pd
from pyspark.sql.types import *

In [0]:
%fs ls /public/retail_db

In [0]:
# DDL-formatted schema string: one "name TYPE" entry per column of the orders data.
# order_date is declared as DATE in this version of the schema.
schema = """
    order_id INT,
    order_date DATE,
    order_customer_id INT,
    order_status STRING
"""

In [0]:
# Read the CSV files under /public/retail_db/orders, applying the explicit schema
# through DataFrameReader.schema() before calling csv().
orders = spark.read.schema(schema).csv('/public/retail_db/orders')

In [0]:
# Preview the rows that were read.
orders.show()

In [0]:
# Print the column names and data types of the DataFrame.
orders.printSchema()

In [0]:
%fs ls /public/retail_db_json

In [0]:
# Read the JSON orders folder; no schema is passed to the reader here.
# NOTE: this rebinds `orders`, replacing the CSV-backed DataFrame created earlier.
orders = spark.read.json('/public/retail_db_json/orders')

In [0]:
# Preview the JSON-backed DataFrame.
orders.show()

In [0]:
import getpass

# Login name of the OS user running the notebook; later cells interpolate it
# into the /user/... paths they read from and write to.
username = getpass.getuser()

In [0]:
# Source folder holding the JSON data sets, and the per-user target folder for
# their Parquet copies.
# The target uses the /user/{username}/{username}/ prefix because that is the
# prefix the later Parquet read cells in this notebook use; a hard-coded
# 'root' segment only matches those readers when the user happens to be 'root'.
input_dir = '/public/retail_db_json'
output_dir = f'/user/{username}/{username}/retail_db_parquet'

In [0]:
dbutils.fs.ls(input_dir)

In [0]:
for file_details in dbutils.fs.ls(input_dir):
    if not ('.git' in file_details.path or file_details.path.endswith('.sql')):
        print(f'Converting data in {file_details.path} folder from json to parquet')
        df = spark.read.json(file_details.path)
        df.coalesce(1).write.parquet(f'{output_dir}/{file_details.path.split("/")[-2]}', mode='overwrite')

In [0]:
dbutils.fs.ls(output_dir)

In [0]:
dbutils.fs.ls(output_dir + '/orders')

In [0]:
orders = spark.read.parquet(output_dir + '/orders')

orders.show()

In [0]:
# Copy all the data with comma separator to pipe separator
input_dir = '/public/retail_db'
output_dir = f'/user/{username}/root/retail_db_pipe'

In [0]:
dbutils.fs.ls('/public/retail_db')

In [0]:
for file_details in dbutils.fs.ls(input_dir):
    if not ('.git' in file_details.path or file_details.path.endswith('.sql')):
        print(f'Converting data in {file_details.path} folder from comma separated to pipe separated')
        df = spark.read.csv(file_details.path)
        df.coalesce(1).write.mode('overwrite').csv(f'{output_dir}/{file_details.path.split("/")[-2]}', sep='|')

In [0]:
orders = spark.read.schema(schema).csv(f'/user/{username}/root/retail_db_pipe/orders', sep='|')

In [0]:
orders.show()

In [0]:
# Show the type of the object returned by spark.read.
type(spark.read)

In [0]:
# Display the `spark` object itself.
spark

In [0]:
%fs ls /public/retail_db/orders

In [0]:
%fs ls /public/retail_db_json/orders

In [0]:
# Load the CSV orders folder with the text reader to look at the raw lines;
# truncate=False asks show() not to shorten long values.
spark.read.text('/public/retail_db/orders').show(truncate=False)

In [0]:
# Load the JSON orders folder with the JSON reader and show it without truncation.
spark.read.json('/public/retail_db_json/orders').show(truncate=False)

In [0]:
# Reading CSV files
# No schema and no options are passed here, so the reader's defaults apply.
# NOTE: this rebinds `orders`.
orders = spark.read.csv('/public/retail_db/orders')

In [0]:
# Column names of the DataFrame read with defaults.
orders.columns

In [0]:
# (column name, data type) pairs of the DataFrame read with defaults.
orders.dtypes

In [0]:
# Built-in documentation for the reader's schema() method.
help(spark.read.schema)

In [0]:
# Built-in documentation for the reader's csv() method.
help(spark.read.csv)

In [0]:
# DDL-formatted schema string; order_date is declared as TIMESTAMP here.
# NOTE: this rebinds `schema`.
schema = """
    order_id INT,
    order_date TIMESTAMP,
    order_customer_id INT,
    order_status STRING
"""

In [0]:
# Spelling 1: supply the schema through DataFrameReader.schema(), then call csv().
spark.read.schema(schema).csv('/public/retail_db/orders').show()

In [0]:
# Spelling 2: supply the same schema as the `schema` keyword argument of csv().
spark.read.csv('/public/retail_db/orders', schema=schema).show()

In [0]:
# Built-in documentation for StructField.
help(StructField)

In [0]:
# The schema expressed programmatically: a StructType holding one StructField
# (name + data type object) per column. NOTE: this rebinds `schema`.
schema = StructType([
    StructField('order_id', IntegerType()),
    StructField('order_date', TimestampType()),
    StructField('order_customer_id', IntegerType()),
    StructField('order_status', StringType())
])

In [0]:
# Spelling 1: pass the StructType through DataFrameReader.schema().
spark.read.schema(schema).csv('/public/retail_db/orders').show()

In [0]:
# Spelling 2: pass the StructType as the `schema` keyword argument of csv().
spark.read.csv('/public/retail_db/orders', schema=schema).show()

In [0]:
# Built-in documentation for StructField (shown again before using `nullable`).
help(StructField)

In [0]:
# Same four fields, each declared with nullable=False.
# NOTE(review): no cell between this definition and the next reassignment of
# `schema` passes it to a reader, so this variant appears to be unused here.
schema = StructType([
    StructField('order_id', IntegerType(), nullable=False),
    StructField('order_date', TimestampType(), nullable=False),
    StructField('order_customer_id', IntegerType(), nullable=False),
    StructField('order_status', StringType(), nullable=False)
])

In [0]:
# Column names to assign, in order, to DataFrames read without a schema.
columns = ['order_id', 'order_date', 'order_customer_id', 'order_status']

In [0]:
# Let the reader infer the column types (inferSchema set via option()), then
# rename the columns by position with toDF(*columns).
spark.read.option('inferSchema', True).csv('/public/retail_db/orders').toDF(*columns)

In [0]:
# Same read without toDF(), so the column names are whatever the reader assigns.
spark.read.option('inferSchema', True).csv('/public/retail_db/orders')

In [0]:
# inferSchema passed as a keyword argument of csv() instead of through option().
spark.read.csv('/public/retail_db/orders', inferSchema=True).toDF(*columns)

In [0]:
import getpass
# Login name of the OS user running the notebook; used in the /user/... paths below.
# NOTE(review): repeats the earlier username cell — presumably so this section
# can be run on its own; confirm.
username = getpass.getuser()

In [0]:
# DDL-formatted schema string with order_date as TIMESTAMP.
# NOTE: this rebinds `schema`, replacing the StructType defined before it.
schema = """
    order_id INT,
    order_date TIMESTAMP,
    order_customer_id INT,
    order_status STRING
"""

In [0]:
# Read the pipe-delimited orders with the explicit schema; sep='|' tells the
# CSV reader which delimiter to split on.
spark.read.schema(schema).csv(f'/user/{username}/{username}/retail_db_pipe/orders', sep='|').show()

In [0]:
# Same read with the schema passed as a keyword argument of csv().
spark.read.csv(f'/user/{username}/{username}/retail_db_pipe/orders', sep='|', schema=schema).show()

In [0]:
# Neither sep nor schema is passed here, so the reader's default delimiter is
# applied to the pipe-delimited files. NOTE: this rebinds `orders`.
orders = spark.read.csv(f'/user/{username}/{username}/retail_db_pipe/orders')

In [0]:
# Inspect what the default-delimiter read produced.
orders.show()

In [0]:
orders = spark.read.csv(
    f'/user/{username}/{username}/retail_db_pipe/orders',
    sep='|',
    header=True,
    inferSchema=True
) \
.toDF(*columns)

In [0]:
orders.show()

In [0]:
spark.read.format('csv').load(
    f'/user/{username}/{username}/retail_db_pipe/orders',
    sep='|',
    header=True,
    inferSchema=True
) \
.toDF(*columns)

In [0]:
help(spark.read.option)

In [0]:
help(spark.read.options)

In [0]:
orders = spark.read \
    .option('sep', '|') \
    .option('header', None) \
    .option('inferSchema', True) \
    .csv(f'/user/{username}/{username}/retail_db_pipe/orders') \
    .toDF(*columns)

In [0]:
orders.show()

In [0]:
spark.read.options(sep='|', header=True, inferSchema=True).csv(f'/user/{username}/{username}/retail_db_pipe/orders').toDF(*columns).show()

In [0]:
options = {
    'sep': '|',
    'header': True,
    'inferSchema': True

}

spark.read.options(**options).csv(f'/user/{username}/{username}/retail_db_pipe/orders').toDF(*columns).show()

In [0]:
# Reading JSON Files
# Shortcut spelling: the reader's json() method. No schema is passed.
df = spark.read.json('/public/retail_db_json/orders')

In [0]:
# Generic spelling: choose the format by name, then load(). This rebinds `df`.
df = spark.read.format('json').load('/public/retail_db_json/orders')

In [0]:
# Files that back this DataFrame.
df.inputFiles()

In [0]:
# (column name, data type) pairs produced by the schema-less JSON read.
df.dtypes

In [0]:
df.show()

In [0]:
# DDL-formatted schema string with order_date as TIMESTAMP.
# NOTE: this rebinds `schema`.
schema = """
    order_id INT,
    order_date TIMESTAMP,
    order_customer_id INT,
    order_status STRING
"""

In [0]:
# Read the JSON orders again, this time with the explicit schema applied
# through DataFrameReader.schema().
spark.read.schema(schema).json('/public/retail_db_json/orders').show()

In [0]:
# Reading Parquet Files
# Built-in documentation for the reader's parquet() method.
help(spark.read.parquet)

In [0]:
# Load the orders folder from the per-user retail_db_parquet directory.
# No schema is passed to the reader. NOTE: this rebinds `df`.
df = spark.read.parquet(f'/user/{username}/{username}/retail_db_parquet/orders')

In [0]:
# Files that back this DataFrame.
df.inputFiles()

In [0]:
# (column name, data type) pairs of the Parquet-backed DataFrame.
df.dtypes

In [0]:
df.show()

In [0]:
spark.read.parquet(f'/user/{username}/{username}/retail_db_parquet/orders', schema=schema).show()

In [0]:
orders.withColumn('order_date', F.col('order_date').cast('Timestamp')).show()