In [11]:
from pyspark.sql.types import StructType, StructField, StringType, LongType
from pyspark.sql import Row
from pyspark.sql import SparkSession

# Start (or reuse) a local Spark session that may use every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("5 Basic Structured Operations")
    .getOrCreate()
)

# Hand-written schema: two nullable string columns followed by one
# non-nullable long column.
schema_fields = [
    StructField("some", StringType(), nullable=True),
    StructField("colu", StringType(), nullable=True),
    StructField("name", LongType(), nullable=False),
]
my_schema = StructType(schema_fields)

# One positional Row; its three values line up with the three schema fields.
my_row = Row("Hello", None, 1)

# Build a single-row DataFrame from the Row and the explicit schema.
my_df = spark.createDataFrame([my_row], my_schema)

# Display the data first, then the schema Spark attached to it.
my_df.show()
my_df.printSchema()

+-----+----+----+
| some|colu|name|
+-----+----+----+
|Hello|null|   1|
+-----+----+----+

root
 |-- some: string (nullable = true)
 |-- colu: string (nullable = true)
 |-- name: long (nullable = false)



In [None]:
# Read data:
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType

spark = SparkSession.builder.master("local[*]").appName("5 Basic Structured Operations").getOrCreate()

# Location of the 2015 flight-summary JSON file. It is defined once so that
# both reads below are guaranteed to target the same file. Set the
# FLIGHT_DATA_JSON environment variable to run the notebook on another
# machine; when it is unset the original local path is used.
flight_data_json = os.environ.get(
    "FLIGHT_DATA_JSON",
    "D:\\GitLocal\\Spark-The-Definitive-Guide\\data\\flight-data\\json\\2015-summary.json",
)

# Schema-on-read: no schema is supplied, so Spark derives one from the file.
# Kept under its own name so it is not silently overwritten by the
# schema-enforced frame created further down.
df_inferred = spark.read.format("json").load(flight_data_json)

# Schema enforcement:
# Explicit schema with two nullable string columns and a long `count` column
# that is declared non-nullable and carries a small metadata dictionary.
# NOTE(review): Spark's file-based readers may relax nullable=False to
# nullable=true on load; confirm against the printSchema() output below.
my_manual_schema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
    StructField("count", LongType(), False, metadata={"hello":"world"})
])

# `df` is the schema-enforced frame; the later cells operate on this one.
df = spark.read.format("json").schema(my_manual_schema).load(flight_data_json)
df.show()
df.printSchema()


In [15]:
from pyspark.sql.functions import col, expr, column
# Alias the column inside the SQL expression, then rename it back with
# Column.alias so the displayed header is the original column name again.
destination_expr = expr("DEST_COUNTRY_NAME as destination")
renamed_back = destination_expr.alias("DEST_COUNTRY_NAME")
df.select(renamed_back).show()

+--------------------+
|   DEST_COUNTRY_NAME|
+--------------------+
|       United States|
|       United States|
|       United States|
|               Egypt|
|       United States|
|       United States|
|       United States|
|          Costa Rica|
|             Senegal|
|             Moldova|
|       United States|
|       United States|
|              Guyana|
|               Malta|
|            Anguilla|
|             Bolivia|
|       United States|
|             Algeria|
|Turks and Caicos ...|
|       United States|
+--------------------+
only showing top 20 rows



In [16]:
# Keep every existing column ("*") and append a boolean column that compares
# the destination country name with the origin country name.
within_country_flag = "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as within_country"
df.selectExpr("*", within_country_flag).show(3)

+-----------------+-------------------+-----+--------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|within_country|
+-----------------+-------------------+-----+--------------+
|    United States|            Romania|   15|         false|
|    United States|            Croatia|    1|         false|
|    United States|            Ireland|  344|         false|
+-----------------+-------------------+-----+--------------+
only showing top 3 rows

