# In-Class Lab: PySpark CSV and SQL functions

# Setup

In [526]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Extra Spark settings, applied to the builder in the order listed.
# NOTE(review): with a local[*] master the executors presumably run inside the
# driver JVM, so spark.executor.memory may have no effect here — confirm.
spark_settings = [
    ("spark.driver.host", "localhost"),
    ("spark.executor.memory", "4g"),
    ("spark.driver.memory", "2g"),
]

# Start from a local-mode builder named after this lab, then layer the settings on.
session_builder = SparkSession.builder.appName("PySpark with CSV").master("local[*]")
for setting_key, setting_value in spark_settings:
    session_builder = session_builder.config(setting_key, setting_value)

# Reuse an already-running session if one exists; otherwise create it.
spark = session_builder.getOrCreate()

In [527]:
# Show which Python executable the SparkContext is configured to use.
python_executable = spark.sparkContext.pythonExec
print(python_executable)

python3


## 1. Read the csv file in PySpark
## 2. Create the schema.

In [528]:
# Explicit schema for the zipcode CSV: every column is nullable.
schema = StructType(
    [
        StructField("RecordNumber", IntegerType(), True),
        StructField("Country", StringType(), True),
        StructField("City", StringType(), True),
        StructField("Zipcode", IntegerType(), True),
        StructField("State", StringType(), True),
    ]
)

# Load the CSV with its header row, applying the schema above instead of
# letting Spark infer the column types.
df_with_schema = (
    spark.read
    .format('csv')
    .option('header', True)
    .schema(schema)
    .load('simple-zipcodes.csv')
)

In [529]:
# Print the DataFrame's schema as a tree to check the column names, types and
# nullability that were applied when the CSV was loaded.
df_with_schema.printSchema()

root
 |-- RecordNumber: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- State: string (nullable = true)



## 3. Create a temporary view ‘customer_demographics’ from the CSV data.

In [530]:
# Register the DataFrame under the name 'customer_demographics' so the
# spark.sql queries below can refer to it; an existing temp view with the
# same name is replaced.
df_with_schema.createOrReplaceTempView('customer_demographics')

## 4. Select Country and City from the table.

In [531]:
# Project only the Country and City columns from the registered view,
# then display the resulting rows.
country_city_df = spark.sql(""" SELECT Country, City FROM customer_demographics""")
country_city_df.show()

+-------+-------------------+
|Country|               City|
+-------+-------------------+
|     US|        PARC PARQUE|
|     US|PASEO COSTA DEL SUR|
|     US|       BDA SAN LUIS|
|     US|               HOLT|
|     US|          HOMOSASSA|
|     US|  CINGULAR WIRELESS|
|     US|         FORT WORTH|
|     US|           FT WORTH|
|     US|        SPRUCE PINE|
|     US|           ASH HILL|
|     US|    URB EUGENE RICE|
|     US|               MESA|
|     US|               MESA|
|     US|           HILLIARD|
|     US|             HOLDER|
|     US|      SECT LANAUSSE|
|     US|      SPRING GARDEN|
|     US|        SPRINGVILLE|
|     US|           ASHEBORO|
|     US|           ASHEBORO|
+-------+-------------------+



## 5. Select records of all the cities that start with ‘F’ or ‘S’.

In [532]:
# Cities whose name begins with 'F' or with 'S' (prefix match via LIKE).
cities_f_or_s_query = """   SELECT 
                    City
                FROM
                    customer_demographics
                WHERE
                    City LIKE 'F%'
                    or City LIKE 'S%' 
                """
cities_f_or_s_df = spark.sql(cities_f_or_s_query)
cities_f_or_s_df.show()

+-------------+
|         City|
+-------------+
|   FORT WORTH|
|     FT WORTH|
|  SPRUCE PINE|
|SECT LANAUSSE|
|SPRING GARDEN|
|  SPRINGVILLE|
+-------------+



## 6. Select all the cities which have a 3-digit zipcode.

In [533]:
# Zipcode is declared as an integer in the schema, so cast it to a string
# explicitly before applying the regex instead of relying on implicit
# coercion. The anchored pattern keeps only values that are exactly three
# digits long; the selected Zipcode column itself stays an integer.
spark.sql("""   SELECT 
                    City, Zipcode
                FROM
                    customer_demographics
                WHERE
                    CAST(Zipcode AS STRING) RLIKE '^[0-9]{3}$'
                """).show()

+-------------------+-------+
|               City|Zipcode|
+-------------------+-------+
|        PARC PARQUE|    704|
|PASEO COSTA DEL SUR|    704|
|       BDA SAN LUIS|    709|
|    URB EUGENE RICE|    704|
|      SECT LANAUSSE|    704|
+-------------------+-------+



## 7. Order the cities by zipcode in descending order.

In [534]:
# All columns of every record, sorted from the highest zipcode to the lowest.
# `result` is kept as a named DataFrame because the write step reuses it.
zipcode_desc_query = """   SELECT 
                    *
                FROM
                    customer_demographics
                ORDER BY Zipcode DESC
                """
result = spark.sql(zipcode_desc_query)
result.show()

+------------+-------+-------------------+-------+-----+
|RecordNumber|Country|               City|Zipcode|State|
+------------+-------+-------------------+-------+-----+
|       39828|     US|               MESA|  85210|   AZ|
|       39827|     US|               MESA|  85209|   AZ|
|       61392|     US|         FORT WORTH|  76177|   TX|
|       61393|     US|           FT WORTH|  76177|   TX|
|       61391|     US|  CINGULAR WIRELESS|  76166|   TX|
|       54354|     US|      SPRING GARDEN|  36275|   AL|
|       54356|     US|        SPRUCE PINE|  35585|   AL|
|       54355|     US|        SPRINGVILLE|  35146|   AL|
|       49348|     US|          HOMOSASSA|  34487|   FL|
|       49346|     US|             HOLDER|  34445|   FL|
|       49347|     US|               HOLT|  32564|   FL|
|       49345|     US|           HILLIARD|  32046|   FL|
|       76513|     US|           ASHEBORO|  27204|   NC|
|       76512|     US|           ASHEBORO|  27203|   NC|
|       76511|     US|         

## 8. Write the result to a csv file ‘Customer_demographics.csv’

In [535]:
# Persist the sorted result as CSV with a header row, replacing any previous
# output at this path. Note: Spark's CSV writer produces a directory named
# "Customer_demographics" containing part files, not a single .csv file.
(
    result.write
    .mode("overwrite")
    .option("header", "true")
    .csv("Customer_demographics")
)