In [7]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
# Obtain the Spark session for this notebook under the app name "Create-DataFrame"
spark = (
    SparkSession.builder
    .appName("Create-DataFrame")
    .getOrCreate()
)

In [2]:
# Directory that holds the sample product files read by this notebook
DATA_DIR = "/opt/data/"

In [14]:
# List the entries in the data directory to see which input files are available
os.listdir(DATA_DIR)

['products.csv', 'products_singleline.json']

In [4]:
# Build the CSV path with os.path.join so it stays valid whether or not
# DATA_DIR ends with a path separator.
file_path = os.path.join(DATA_DIR, 'products.csv')

# Read the CSV, taking column names from the first line. No schema is given
# and inferSchema is not enabled, so every column is loaded as a string.
df = spark.read.csv(file_path, header=True)

In [5]:
# Print the column names, types and nullability of the DataFrame
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- price: string (nullable = true)



In [6]:
# Preview the first five rows
df.show(n=5)

+---+--------------------+---------------+--------+------+
| id|                name|       category|quantity| price|
+---+--------------------+---------------+--------+------+
|  1|           iPhone 12|    Electronics|      10|899.99|
|  2|     Nike Air Max 90|       Clothing|      25|119.99|
|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|
|  4|    The Great Gatsby|          Books|      50| 12.99|
|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|
+---+--------------------+---------------+--------+------+
only showing top 5 rows



In [8]:
# Explicit schema for the products file: one nullable field per column,
# listed in the same order as the columns appear in the CSV.
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ("id", IntegerType()),
        ("name", StringType()),
        ("category", StringType()),
        ("quantity", IntegerType()),
        ("price", DoubleType()),
    )
])

In [10]:
# Re-read the CSV, this time applying the explicit schema instead of
# loading every column as a string.
df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv(file_path)
)

In [11]:
# Column names and types of the current DataFrame
df.printSchema()

# Preview of the first five rows
df.show(n=5)

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: double (nullable = true)

+---+--------------------+---------------+--------+------+
| id|                name|       category|quantity| price|
+---+--------------------+---------------+--------+------+
|  1|           iPhone 12|    Electronics|      10|899.99|
|  2|     Nike Air Max 90|       Clothing|      25|119.99|
|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|
|  4|    The Great Gatsby|          Books|      50| 12.99|
|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|
+---+--------------------+---------------+--------+------+
only showing top 5 rows



In [12]:
# Re-read the CSV and let Spark infer the column types from the data
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(file_path)
)

In [13]:
# Column names and types of the current DataFrame
df.printSchema()

# Preview of the first five rows
df.show(n=5)

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: double (nullable = true)

+---+--------------------+---------------+--------+------+
| id|                name|       category|quantity| price|
+---+--------------------+---------------+--------+------+
|  1|           iPhone 12|    Electronics|      10|899.99|
|  2|     Nike Air Max 90|       Clothing|      25|119.99|
|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|
|  4|    The Great Gatsby|          Books|      50| 12.99|
|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|
+---+--------------------+---------------+--------+------+
only showing top 5 rows



In [16]:
%%bash
# Print the first 10 lines of the JSON file; uses the standard `-n` option
# rather than the obsolescent `-10` shorthand.
head -n 10 /opt/data/products_singleline.json

{"id":1,"name":"iPhone 12","category":"Electronics","quantity":10,"price":899.99}
{"id":2,"name":"Nike Air Max 90","category":"Clothing","quantity":25,"price":119.99}
{"id":3,"name":"KitchenAid Stand Mixer","category":"Home Appliances","quantity":5,"price":299.99}
{"id":4,"name":"The Great Gatsby","category":"Books","quantity":50,"price":12.99}
{"id":5,"name":"L'Oreal Paris Mascara","category":"Beauty","quantity":100,"price":9.99}
{"id":6,"name":"Yoga Mat","category":"Sports","quantity":30,"price":29.99}
{"id":7,"name":"Samsung 4K Smart TV","category":"Electronics","quantity":8,"price":799.99}
{"id":8,"name":"Levi's Jeans","category":"Clothing","quantity":15,"price":49.99}
{"id":9,"name":"Dyson Vacuum Cleaner","category":"Home Appliances","quantity":3,"price":399.99}
{"id":10,"name":"Harry Potter Series","category":"Books","quantity":20,"price":15.99}


In [17]:
# Build the JSON path with os.path.join so it stays valid whether or not
# DATA_DIR ends with a path separator.
json_file_path = os.path.join(DATA_DIR, "products_singleline.json")

# Read the JSON file; no schema is supplied, so Spark derives one from the data.
df = spark.read.json(json_file_path)

In [18]:
# Column names and types of the current DataFrame
df.printSchema()

# Preview of the first five rows
df.show(n=5)

root
 |-- category: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- quantity: long (nullable = true)

+---------------+---+--------------------+------+--------+
|       category| id|                name| price|quantity|
+---------------+---+--------------------+------+--------+
|    Electronics|  1|           iPhone 12|899.99|      10|
|       Clothing|  2|     Nike Air Max 90|119.99|      25|
|Home Appliances|  3|KitchenAid Stand ...|299.99|       5|
|          Books|  4|    The Great Gatsby| 12.99|      50|
|         Beauty|  5|L'Oreal Paris Mas...|  9.99|     100|
+---------------+---+--------------------+------+--------+
only showing top 5 rows



In [22]:
# List the top-level entries of the filesystem root
# NOTE(review): looks like an exploratory/debugging cell — consider removing it from the final notebook
os.listdir('/')

['lib',
 'lib64',
 'media',
 'var',
 'root',
 'srv',
 'run',
 'dev',
 'usr',
 'boot',
 'bin',
 'sys',
 'libx32',
 'tmp',
 'home',
 'opt',
 'mnt',
 'etc',
 'lib32',
 'sbin',
 'proc',
 '.dockerenv']

In [None]:
# Note (not Python): docker-compose volume mapping
#   - ./notebook:/opt/notebook