# Structured Spark API Examples


In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("SimpleApp").getOrCreate()
spark.conf.set("spark.sql.shuffle.partitions", "5")


In [2]:
studentsdf = spark.read.csv("../data/StudentsPerformance.csv",header='true')

In [35]:
studentsdf.show(5)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group B|          bachelor's degree|    standard|                   none|        72|           72|           74|
|female|       group C|               some college|    standard|              completed|        69|           90|           88|
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|
|  male|       group A|         associate's degree|free/reduced|                   none|        47|           57|           44|
|  male|       group C|               some college|    standard|                   none|        76|     

In [6]:
df = spark.range(100).toDF("number")
df.select(df["number"]+10)
df.show(10)

+------+
|number|
+------+
|     0|
|     1|
|     2|
|     3|
|     4|
|     5|
|     6|
|     7|
|     8|
|     9|
+------+
only showing top 10 rows



In [36]:
df.filter(df['number'] > 5).show(5)

+------+
|number|
+------+
|     6|
|     7|
|     8|
|     9|
|    10|
+------+
only showing top 5 rows



In [19]:
studentsdf.printSchema()

root
 |-- gender: string (nullable = true)
 |-- race/ethnicity: string (nullable = true)
 |-- parental level of education: string (nullable = true)
 |-- lunch: string (nullable = true)
 |-- test preparation course: string (nullable = true)
 |-- math score: string (nullable = true)
 |-- reading score: string (nullable = true)
 |-- writing score: string (nullable = true)



In [28]:
# Infer the schema, and register the DataFrame as a table.  WHERE cast('math score' as int) >= 90"
studentsdf.createOrReplaceTempView("studentsql")
maths = spark.sql("SELECT cast('math score' as int) as age FROM studentsql WHERE cast('math score' as int) >= 90" )
maths.show()

+---+
|age|
+---+
+---+



In [29]:
studentsdf.schema

StructType(List(StructField(gender,StringType,true),StructField(race/ethnicity,StringType,true),StructField(parental level of education,StringType,true),StructField(lunch,StringType,true),StructField(test preparation course,StringType,true),StructField(math score,StringType,true),StructField(reading score,StringType,true),StructField(writing score,StringType,true)))

In [34]:
from pyspark.sql.types import StructField, StructType, StringType, LongType

mySchema = StructType([
StructField("gender", StringType(), True),
StructField("race/ethnicity", StringType(), True),
StructField("parental level of education", LongType(), False),
StructField("lunch", StringType(), True), 
StructField("test preparation course", StringType(), True),
StructField("math score", LongType(), True), 
StructField("reading score", LongType(), True), 
StructField("writing score", LongType(), True)
])
studentsdfmyschema = spark.read.format("csv").schema(mySchema).load("../data/StudentsPerformance.csv")
studentsdfmyschema.printSchema()
studentsdfmyschema.drop('parental level of education').show(5)


root
 |-- gender: string (nullable = true)
 |-- race/ethnicity: string (nullable = true)
 |-- parental level of education: long (nullable = true)
 |-- lunch: string (nullable = true)
 |-- test preparation course: string (nullable = true)
 |-- math score: long (nullable = true)
 |-- reading score: long (nullable = true)
 |-- writing score: long (nullable = true)

+------+--------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|       lunch|   test preparation ...|      null|         null|         null|
|female|       group B|    standard|                   none|        72|           72|           74|
|female|       group C|    standard|              completed|        69|           90|           88|
|female|       group B|    standard

In [37]:
studentsdfmyschema.columns

['gender',
 'race/ethnicity',
 'parental level of education',
 'lunch',
 'test preparation course',
 'math score',
 'reading score',
 'writing score']

In [39]:
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, LongType
myManualSchema = StructType([
StructField("first_name", StringType(), True),
StructField("last_name", StringType(), True),
StructField("age", LongType(), False)
])
myRow = Row("siva", "selvam", 30)
myDf = spark.createDataFrame([myRow], myManualSchema)
myDf.show()

+----------+---------+---+
|first_name|last_name|age|
+----------+---------+---+
|      siva|   selvam| 30|
+----------+---------+---+



In [41]:
studentsdfmyschema.selectExpr("count('math score')").show()

+-----------------+
|count(math score)|
+-----------------+
|             1001|
+-----------------+



In [58]:
from pyspark.sql.functions import col
studentsdfmyschema.where(col("math score") < 90).show()
#studentsdfmyschema = studentsdfmyschema.filter(" <> ")
#studentsdfmyschema.show(5)

+------+--------------+---------------------------+-----+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+-----+-----------------------+----------+-------------+-------------+
+------+--------------+---------------------------+-----+-----------------------+----------+-------------+-------------+



In [67]:
cnt = spark.sql("SELECT count(*) FROM studentsql where gender='male'" )
cnt.show()

+--------+
|count(1)|
+--------+
|     482|
+--------+



In [69]:
studentsdfmyschema.sort("gender").show()

+------+--------------+---------------------------+-----+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+-----+-----------------------+----------+-------------+-------------+
+------+--------------+---------------------------+-----+-----------------------+----------+-------------+-------------+



In [71]:
netflix = spark.read.csv("../data/netflix_titles.csv",header='true')
netflix.show(5)

+-------+-------+-----+-----------------+--------------------+-------------+-----------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|title|         director|                cast|      country|       date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+-----+-----------------+--------------------+-------------+-----------------+------------+------+---------+--------------------+--------------------+
|     s1|TV Show|   3%|             null|João Miguel, Bian...|       Brazil|  August 14, 2020|        2020| TV-MA|4 Seasons|International TV ...|In a future where...|
|     s2|  Movie| 7:19|Jorge Michel Grau|Demián Bichir, Hé...|       Mexico|December 23, 2016|        2016| TV-MA|   93 min|Dramas, Internati...|After a devastati...|
|     s3|  Movie|23:59|     Gilbert Chan|Tedd Chan, Stella...|    Singapore|December 20, 2018|        2011|     R|   78 min|Horror Movies, In...|When an army recr...

In [72]:
netflix.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



In [75]:
netflixview= netflix.createOrReplaceTempView("netflixview")
spark.sql("select title, rating, release_year from netflixview where release_year = 2020").show(5)

+--------------------+------+------------+
|               title|rating|release_year|
+--------------------+------+------------+
|                  3%| TV-MA|        2020|
|​SAINT SEIYA: Kni...| TV-14|        2020|
|            (Un)Well| TV-MA|        2020|
|              #Alive| TV-MA|        2020|
|            #blackAF| TV-MA|        2020|
+--------------------+------+------------+
only showing top 5 rows



In [78]:
netflix.rdd.getNumPartitions()

1

In [77]:
netflix.repartition(3)
netflix.rdd.getNumPartitions()

1

In [107]:
netflix.where(col("release_year")==2020).where(col("director").isNotNull()).select("title","director","country","release_year").show(5)


+-------------------+----------------+-----------+------------+
|              title|        director|    country|release_year|
+-------------------+----------------+-----------+------------+
|             #Alive|          Cho Il|South Korea|        2020|
|   #cats_the_mewvie|Michael Margolis|     Canada|        2020|
|#FriendButMarried 2|   Rako Prijanto|  Indonesia|        2020|
|            Òlòtūré|   Kenneth Gyang|    Nigeria|        2020|
|         100% Halal|   Jastis Arimba|  Indonesia|        2020|
+-------------------+----------------+-----------+------------+
only showing top 5 rows



In [114]:
from pyspark.sql.functions import instr
releaseYearFilter = col("release_year") == 2020
directorFilter = instr(col("director"), "Kenneth") >= 1

netflix\
.where(col("director").isNotNull())\
.withColumn("directYear", (releaseYearFilter | directorFilter))\
.where("directYear")\
.select("director", "country","release_year").show(5)

+----------------+-----------+------------+
|        director|    country|release_year|
+----------------+-----------+------------+
|          Cho Il|South Korea|        2020|
|Michael Margolis|     Canada|        2020|
|   Rako Prijanto|  Indonesia|        2020|
|   Kenneth Gyang|    Nigeria|        2020|
|   Jastis Arimba|  Indonesia|        2020|
+----------------+-----------+------------+
only showing top 5 rows



In [131]:
from pyspark.sql.functions import lit, round, bround
netflix.select(round(lit("2.5")), bround(lit("2.5"))).show(2)

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|          3.0|           2.0|
|          3.0|           2.0|
+-------------+--------------+
only showing top 2 rows



In [132]:
from pyspark.sql.functions import lit, round, bround
netflix.select(round(lit("2.5")), bround(lit("2.5"))).show(2)


+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|          3.0|           2.0|
|          3.0|           2.0|
+-------------+--------------+
only showing top 2 rows



In [133]:
netflix.describe().show()

+-------+--------------------+-------------+---------------------------------+--------------------+--------------------+----------------+---------------+------------------+-----------------+-------------+---------------+--------------------+
|summary|             show_id|         type|                            title|            director|                cast|         country|     date_added|      release_year|           rating|     duration|      listed_in|         description|
+-------+--------------------+-------------+---------------------------------+--------------------+--------------------+----------------+---------------+------------------+-----------------+-------------+---------------+--------------------+
|  count|                7789|         7788|                             7787|                5398|                7070|            7280|           7777|              7787|             7780|         7787|           7786|                7786|
|   mean|                null|  

## String Function

In [136]:
from pyspark.sql.functions import initcap,upper,lower
netflix.select(initcap(col("description"))).show(6)

+--------------------+
|initcap(description)|
+--------------------+
|In A Future Where...|
|After A Devastati...|
|When An Army Recr...|
|In A Postapocalyp...|
|A Brilliant Group...|
|A Genetics Profes...|
+--------------------+
only showing top 6 rows



In [138]:
netflix.select(col("description"), upper(col("description")),lower(col("description"))).show(6)

+--------------------+--------------------+--------------------+
|         description|  upper(description)|  lower(description)|
+--------------------+--------------------+--------------------+
|In a future where...|IN A FUTURE WHERE...|in a future where...|
|After a devastati...|AFTER A DEVASTATI...|after a devastati...|
|When an army recr...|WHEN AN ARMY RECR...|when an army recr...|
|In a postapocalyp...|IN A POSTAPOCALYP...|in a postapocalyp...|
|A brilliant group...|A BRILLIANT GROUP...|a brilliant group...|
|A genetics profes...|A GENETICS PROFES...|a genetics profes...|
+--------------------+--------------------+--------------------+
only showing top 6 rows



In [140]:
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
ltrim(lit(" HELLO ")).alias("ltrim"),
rtrim(lit(" HELLO ")).alias("rtrim"),
trim(lit(" HELLO ")).alias("trim"),
lpad(lit("HELLO"), 3, " ").alias("lp"),
rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)

+------+------+-----+---+----------+
| ltrim| rtrim| trim| lp|        rp|
+------+------+-----+---+----------+
|HELLO | HELLO|HELLO|HEL|HELLO     |
|HELLO | HELLO|HELLO|HEL|HELLO     |
+------+------+-----+---+----------+
only showing top 2 rows



In [145]:
maxSql = spark.sql("""
SELECT  release_year,count(*) as total_movie_count
FROM netflixview
GROUP BY release_year
""")
maxSql.show()

totSql = spark.sql("""SELECT  count(*) as total_movies FROM netflixview""")
totSql.show()



+-----------------+-----------------+
|     release_year|total_movie_count|
+-----------------+-----------------+
|             2020|              867|
|             2008|              125|
|             2019|              992|
|             1997|               30|
|             2015|              540|
|             2012|              219|
|             2000|               34|
|             2003|               49|
|             1980|                9|
|             1991|               17|
|             1992|               18|
|             2021|               30|
|             1967|                5|
|             1968|                5|
| Marquell Manning|                1|
|             1987|                7|
| Kristen Johnston|                1|
|             1942|                2|
|             1963|                2|
|             2017|             1010|
+-----------------+-----------------+
only showing top 20 rows

+------------+
|total_movies|
+------------+
|        7789|
+-

In [146]:
from pyspark.sql.functions import current_date, current_timestamp

dateDF = spark.range(10)\
.withColumn("today", current_date())\
.withColumn("now", current_timestamp())

dateDF.createOrReplaceTempView("dateTable")
dateDF.printSchema()

root
 |-- id: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)



In [147]:
dateDF.show()

+---+----------+--------------------+
| id|     today|                 now|
+---+----------+--------------------+
|  0|2021-05-14|2021-05-14 19:53:...|
|  1|2021-05-14|2021-05-14 19:53:...|
|  2|2021-05-14|2021-05-14 19:53:...|
|  3|2021-05-14|2021-05-14 19:53:...|
|  4|2021-05-14|2021-05-14 19:53:...|
|  5|2021-05-14|2021-05-14 19:53:...|
|  6|2021-05-14|2021-05-14 19:53:...|
|  7|2021-05-14|2021-05-14 19:53:...|
|  8|2021-05-14|2021-05-14 19:53:...|
|  9|2021-05-14|2021-05-14 19:53:...|
+---+----------+--------------------+



In [151]:
from pyspark.sql.functions import coalesce
netflix.select(coalesce(col("show_id"))).show()

+-----------------+
|coalesce(show_id)|
+-----------------+
|               s1|
|               s2|
|               s3|
|               s4|
|               s5|
|               s6|
|               s7|
|               s8|
|               s9|
|              s10|
|              s11|
|              s12|
|              s13|
|              s14|
|              s15|
|              s16|
|              s17|
|              s18|
|              s19|
|              s20|
+-----------------+
only showing top 20 rows



In [152]:
import pandas as pd
df = pd.DataFrame({"first":range(200), "second":range(50,250)})

sparkDF = spark.createDataFrame(df)


newPDF = sparkDF.toPandas()
newPDF.head()


Unnamed: 0,first,second
0,0,50
1,1,51
2,2,52
3,3,53
4,4,54


In [158]:
#netflix.show(10)
netflix.na.drop("any")
netflix.show(10)

+-------+-------+-----+-----------------+--------------------+-------------+-----------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|title|         director|                cast|      country|       date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+-----+-----------------+--------------------+-------------+-----------------+------------+------+---------+--------------------+--------------------+
|     s1|TV Show|   3%|             null|João Miguel, Bian...|       Brazil|  August 14, 2020|        2020| TV-MA|4 Seasons|International TV ...|In a future where...|
|     s2|  Movie| 7:19|Jorge Michel Grau|Demián Bichir, Hé...|       Mexico|December 23, 2016|        2016| TV-MA|   93 min|Dramas, Internati...|After a devastati...|
|     s3|  Movie|23:59|     Gilbert Chan|Tedd Chan, Stella...|    Singapore|December 20, 2018|        2011|     R|   78 min|Horror Movies, In...|When an army recr...

In [159]:
from pyspark.sql.functions import struct
complexDF = netflix.select(struct("Description", "title").alias("complex"))
complexDF.createOrReplaceTempView("complexDF")

In [160]:
spark.sql("select * from complexDF").show(5)

+--------------------+
|             complex|
+--------------------+
|{In a future wher...|
|{After a devastat...|
|{When an army rec...|
|{In a postapocaly...|
|{A brilliant grou...|
+--------------------+
only showing top 5 rows



In [161]:
from pyspark.sql.functions import split
netflix.select(split(col("Description"), " ")).show(2)

+-------------------------+
|split(Description,  , -1)|
+-------------------------+
|     [In, a, future, w...|
|     [After, a, devast...|
+-------------------------+
only showing top 2 rows



In [167]:
from pyspark.sql.functions import from_json,to_json
from pyspark.sql.types import *

parseSchema = StructType((
StructField("show_id",StringType(),True),
StructField("Description",StringType(),True)))

netflix.selectExpr("(show_id, Description) as myStruct")\
.select(to_json(col("myStruct")).alias("newJSON"))\
.select(from_json(col("newJSON"), parseSchema), col("newJSON")).show(5)

+--------------------+--------------------+
|  from_json(newJSON)|             newJSON|
+--------------------+--------------------+
|{s1, In a future ...|{"show_id":"s1","...|
|{s2, After a deva...|{"show_id":"s2","...|
|{s3, When an army...|{"show_id":"s3","...|
|{s4, In a postapo...|{"show_id":"s4","...|
|{s5, A brilliant ...|{"show_id":"s5","...|
+--------------------+--------------------+
only showing top 5 rows



In [166]:
pd.set_option('display.expand_frame_repr', False)

In [170]:
netflix.limit(5).toPandas().head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...
