## Create a DataFrame with first_name and age columns and four rows of data

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Start (or reuse) the Spark session that the rest of the notebook relies on.
spark = SparkSession.builder \
    .appName("Create DataFrame") \
    .getOrCreate()

# Four (first_name, age) records.
data = [('ABC', 30),
        ('XYZ', 25),
        ('PQR', 35),
        ('GHI', 28)]

# Build the DataFrame directly from the local list; column types are inferred
# from the Python values. Going through spark.sparkContext.parallelize() and
# Row objects is unnecessary here, and the RDD API is not available on
# Spark Connect sessions.
df = spark.createDataFrame(data, schema=['first_name', 'age'])

df.show()

+----------+---+
|first_name|age|
+----------+---+
|       ABC| 30|
|       XYZ| 25|
|       PQR| 35|
|       GHI| 28|
+----------+---+



## View the contents of the DataFrame

In [0]:
# Print the DataFrame's rows as a text table.
df.show()

+----------+---+
|first_name|age|
+----------+---+
|       ABC| 30|
|       XYZ| 25|
|       PQR| 35|
|       GHI| 28|
+----------+---+



## Add a column life_stage to the DataFrame that returns “child” if the age is 12 or under, “teenager” if the age is between 13 and 19, and “adult” if the age is 20 or older.
(Note: You can refer to the Spark API reference documentation for the available functions.)

In [0]:
from pyspark.sql.functions import when

# Derive life_stage from age. The when() branches are evaluated in order, so
# each upper bound alone defines its bracket and no age can fall between two
# brackets. There is deliberately no otherwise(): a missing (NULL) age matches
# no branch and yields a NULL life_stage instead of being labelled 'adult'.
df = df.withColumn(
    'life_stage',
    when(df['age'] < 13, 'child')
    .when(df['age'] < 20, 'teenager')
    .when(df['age'] >= 20, 'adult'))
df.show()


+----------+---+----------+
|first_name|age|life_stage|
+----------+---+----------+
|       ABC| 30|     adult|
|       XYZ| 25|     adult|
|       PQR| 35|     adult|
|       GHI| 28|     adult|
+----------+---+----------+



## Save the DataFrame in a named Parquet table and then access the table using the table name

In [0]:
# Persist the DataFrame as the table 'employee'. The storage format is set to
# Parquet explicitly: without format(), the table uses the session's default
# data source, which is not guaranteed to be Parquet.
# mode('overwrite') replaces any existing 'employee' table, so the cell can be
# re-run safely.
df.write.format('parquet').mode('overwrite').saveAsTable('employee')

# Read the table back by name to confirm it was written.
spark.sql("SELECT * FROM employee").show()

+----------+---+----------+
|first_name|age|life_stage|
+----------+---+----------+
|       XYZ| 25|     adult|
|       PQR| 35|     adult|
|       GHI| 28|     adult|
|       ABC| 30|     adult|
+----------+---+----------+



## Use a SQL query to insert a few more rows of data into the table

In [0]:
# Rows to append, in (first_name, age, life_stage) order.
additional_data = [('STV', 22, 'adult'), ('WXY', 16, 'teenager')]

# Expose the new rows to SQL through a temporary view.
additional_df = spark.createDataFrame(
    additional_data,
    ['first_name', 'age', 'life_stage'],
)
additional_df.createOrReplaceTempView("additional_data")

# Append the staged rows to the employee table with INSERT ... SELECT.
insert_statement = """
    INSERT INTO employee (first_name, age, life_stage)
    SELECT first_name, age, life_stage FROM additional_data
"""
spark.sql(insert_statement)

# Show the table again to confirm the rows were appended.
spark.sql("SELECT * FROM employee").show()

+----------+---+----------+
|first_name|age|life_stage|
+----------+---+----------+
|       WXY| 16|  teenager|
|       XYZ| 25|     adult|
|       PQR| 35|     adult|
|       GHI| 28|     adult|
|       STV| 22|     adult|
|       ABC| 30|     adult|
+----------+---+----------+



## Write a SQL query that returns the teenagers

In [0]:
# Select only the rows whose life_stage is 'teenager'.
teenagers_query = """
    SELECT * FROM employee
    WHERE life_stage = 'teenager'
"""

# Run the query, then display the matching rows.
teenagers_df = spark.sql(teenagers_query)
teenagers_df.show()

+----------+---+----------+
|first_name|age|life_stage|
+----------+---+----------+
|       WXY| 16|  teenager|
+----------+---+----------+

