In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain a Spark session named "DataFrame-Ex" (getOrCreate reuses an existing
# session when there is one instead of starting another).
spark = (
    SparkSession.builder
    .appName("DataFrame-Ex")
    .getOrCreate()
)

In [3]:
# First pass: read the CSV with only the header option set and no schema,
# then print the column types Spark assigns.
file_path = "data/SampleData/TCB_2018_2020.csv"
df = spark.read.option("header", True).csv(file_path)
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- Close: string (nullable = true)
 |-- Avg: string (nullable = true)
 |-- Volume: string (nullable = true)



In [4]:
# Preview the first five rows of the loaded CSV.
df.show(n=5)

+----------+-----+-----+-----+-----+------+---------+
|      Date| High|  Low| Open|Close|   Avg|   Volume|
+----------+-----+-----+-----+-----+------+---------+
|2018-06-04|105.0|102.4|102.4|102.4|102.49|2811840.0|
|2018-06-05|106.0| 96.0| 99.1| 96.0|100.19|1689500.0|
|2018-06-06| 96.0| 91.0| 95.0| 92.0| 92.98|1901680.0|
|2018-06-07| 98.4| 93.1| 94.5| 98.4|  97.0|1476540.0|
|2018-06-08|105.2| 99.5|101.0|105.2|103.83|2008500.0|
+----------+-----+-----+-----+-----+------+---------+
only showing top 5 rows



In [5]:
from pyspark.sql.types import StructType, StructField, DoubleType, DateType
# Explicit column types for the CSV: one date column followed by six
# double-precision columns. Every field is nullable, and the field order is
# Date, High, Low, Open, Close, Avg, Volume.
schema = StructType(
    [StructField("Date", DateType(), True)]
    + [
        StructField(column, DoubleType(), True)
        for column in ("High", "Low", "Open", "Close", "Avg", "Volume")
    ]
)

In [6]:
# Second pass: re-read the same CSV, this time applying the explicit schema
# instead of leaving column types to Spark.
df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv(file_path)
)

In [7]:
# Print the column types of the re-read DataFrame to check that the explicit
# schema was applied.
df.printSchema()

root
 |-- Date: date (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Open: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Avg: double (nullable = true)
 |-- Volume: double (nullable = true)



In [24]:
# Load the iris JSON file with the multiLine option enabled.
# Note: this rebinds `df`, so the CSV DataFrame is no longer reachable by that name.
json_file_path = "data/SampleData/iris.json"
df = spark.read.option("multiLine", True).json(json_file_path)

In [25]:
# Print the column names and types of the DataFrame read from the JSON file.
df.printSchema()

root
 |-- petalLength: double (nullable = true)
 |-- petalWidth: double (nullable = true)
 |-- sepalLength: double (nullable = true)
 |-- sepalWidth: double (nullable = true)
 |-- species: string (nullable = true)



In [27]:
# Preview the first five rows of the JSON-backed DataFrame.
df.show(n=5)

+-----------+----------+-----------+----------+-------+
|petalLength|petalWidth|sepalLength|sepalWidth|species|
+-----------+----------+-----------+----------+-------+
|        1.4|       0.2|        5.1|       3.5| setosa|
|        1.4|       0.2|        4.9|       3.0| setosa|
|        1.3|       0.2|        4.7|       3.2| setosa|
|        1.5|       0.2|        4.6|       3.1| setosa|
|        1.4|       0.2|        5.0|       3.6| setosa|
+-----------+----------+-----------+----------+-------+
only showing top 5 rows

