<a href="https://colab.research.google.com/github/sasansharifipour/Spark_Class/blob/main/Spark_SQL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop2.7"

import findspark
findspark.init()

In [7]:
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession under this application name
# (getOrCreate() hands back an existing session when there is one).
spark = (
    SparkSession.builder
    .appName("pySpark Sql basic example")
    .getOrCreate()
)

In [9]:
import os

# Load the people.json sample that ships inside the Spark distribution.
# The location is derived from SPARK_HOME (set in the setup cell) instead of
# repeating the absolute install path, so the two cannot drift apart.
df = spark.read.json(
    os.path.join(os.environ["SPARK_HOME"], "examples/src/main/resources/people.json")
)
# Last expression: displays the DataFrame's repr (column names and types), not its rows.
df

DataFrame[age: bigint, name: string]

In [10]:
# Print the DataFrame's rows as a text table.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [5]:
# Print the schema as a tree: column name, type and nullability.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [None]:
# select() returns a new DataFrame; the cell displays its repr (columns and types), not its rows.
df.select('name')

DataFrame[name: string]

In [None]:
# Project the 'name' column and print its rows.
df.select('name').show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+



In [None]:
# Columns come out in the order they are passed to select(): name, then age.
df.select('name', 'age').show()

+-------+----+
|   name| age|
+-------+----+
|Michael|null|
|   Andy|  30|
| Justin|  19|
+-------+----+



In [None]:
# Same projection with the arguments swapped: age, then name.
df.select('age', 'name').show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [None]:
# '*' selects every column of the DataFrame.
df.select('*').show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [None]:
# A column can also be passed as df['name'] rather than as a string name.
df.select(df['name']).show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+



In [None]:
# Column expressions can be computed inside select(); the derived column is
# labelled "(age + 1)", and a null age stays null.
df.select(df['name'], df['age'] + 1).show()

+-------+---------+
|   name|(age + 1)|
+-------+---------+
|Michael|     null|
|   Andy|       31|
| Justin|       20|
+-------+---------+



In [None]:
# Keep only rows whose age is strictly greater than 19 (age == 19 does not qualify).
df_filtered = df.filter(df['age'] > 19)
# Last expression: displays the filtered DataFrame's repr, not its rows.
df_filtered

DataFrame[age: bigint, name: string]

In [None]:
# Print the rows that passed the age filter.
df_filtered.show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [None]:
# The source DataFrame still prints all of its rows: filter() produced a
# separate DataFrame and left `df` as it was.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [None]:
# Build a single-column DataFrame from spark.range(500) and name the column 'number'.
number_df = spark.range(500).toDF('number')
# show() prints only the first 20 rows here.
number_df.show()

+------+
|number|
+------+
|     0|
|     1|
|     2|
|     3|
|     4|
|     5|
|     6|
|     7|
|     8|
|     9|
|    10|
|    11|
|    12|
|    13|
|    14|
|    15|
|    16|
|    17|
|    18|
|    19|
+------+
only showing top 20 rows



In [None]:
# Add 10 to every value; the derived column is labelled "(number + 10)".
number_df.select(number_df['number'] + 10).show()

+-------------+
|(number + 10)|
+-------------+
|           10|
|           11|
|           12|
|           13|
|           14|
|           15|
|           16|
|           17|
|           18|
|           19|
|           20|
|           21|
|           22|
|           23|
|           24|
|           25|
|           26|
|           27|
|           28|
|           29|
+-------------+
only showing top 20 rows



In [None]:
# Keep only the even numbers (remainder 0 when divided by 2).
number_df.filter(number_df['number'] % 2 == 0).show()

+------+
|number|
+------+
|     0|
|     2|
|     4|
|     6|
|     8|
|    10|
|    12|
|    14|
|    16|
|    18|
|    20|
|    22|
|    24|
|    26|
|    28|
|    30|
|    32|
|    34|
|    36|
|    38|
+------+
only showing top 20 rows



In [None]:
# Count the rows through the DataFrame's .rdd attribute.
number_df.rdd.count()

500

In [None]:
# Fetch the first 10 elements through .rdd; each element is a Row.
number_df.rdd.take(10)

[Row(number=0),
 Row(number=1),
 Row(number=2),
 Row(number=3),
 Row(number=4),
 Row(number=5),
 Row(number=6),
 Row(number=7),
 Row(number=8),
 Row(number=9)]

In [None]:
# collect() returns the DataFrame's rows as a Python list of Row objects;
# the column produced by spark.range() is named 'id'.
spark.range(10).collect()

[Row(id=0),
 Row(id=1),
 Row(id=2),
 Row(id=3),
 Row(id=4),
 Row(id=5),
 Row(id=6),
 Row(id=7),
 Row(id=8),
 Row(id=9)]

In [None]:
# Build a Row directly from positional values (no field names given).
from pyspark.sql import Row
myRow = Row("Hello", None, 1, False)
# Last expression: displays the object's class.
type(myRow)

pyspark.sql.types.Row

In [None]:
# Row values can be read by position; index 0 is the first value.
myRow[0]

'Hello'

In [None]:
# Index 2 is the third value.
myRow[2]

1

In [None]:
# Display the Row's repr.
myRow

<Row('Hello', None, 1, False)>

In [12]:
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, LongType

# Explicit three-column schema, assembled field by field.
# Third argument of add() is the nullable flag: name and family may be null,
# age may not.
myManualSchema = (
    StructType()
    .add("name", StringType(), True)
    .add("family", StringType(), True)
    .add("age", LongType(), False)
)

In [13]:
# Display the schema object's repr.
myManualSchema

StructType(List(StructField(name,StringType,true),StructField(family,StringType,true),StructField(age,LongType,false)))

In [14]:
# Three positional values, in the same order as the schema's fields (name, family, age).
# Note: this rebinds `myRow`, replacing the four-value Row created earlier in the notebook.
myRow = Row("Hello", None, 1)

In [15]:
# Display the Row's repr.
myRow

<Row('Hello', None, 1)>

In [16]:
# Create a one-row DataFrame from the Row, passing myManualSchema as the explicit schema.
myDf = spark.createDataFrame([myRow], myManualSchema)

In [17]:
# Display the DataFrame's repr (column names and types).
myDf

DataFrame[name: string, family: string, age: bigint]

In [18]:
# Print the single row; the None value is rendered as null.
myDf.show()

+-----+------+---+
| name|family|age|
+-----+------+---+
|Hello|  null|  1|
+-----+------+---+



In [19]:
# Build a Row whose positional values are the four cat attribute labels.
# NOTE(review): the cell that defines cat1..cat5 builds them with bare Row(...)
# and does not reference `cats`; presumably `cats` was meant as a field-name
# template for those rows - confirm the intent.
from pyspark.sql import Row
cats = Row("Name", "Nickname", "Location", "Treat")

In [20]:
# Display the Row's repr.
cats

<Row('Name', 'Nickname', 'Location', 'Treat')>

In [21]:
# One positional Row per cat; the four values are the cat's name, nickname,
# location and treat, in that order.
cat1, cat2, cat3, cat4, cat5 = (
    Row(*cat_values)
    for cat_values in [
        ('Dakota', 'Sweetie', 'house', 'salmon'),
        ('George', 'Grumpy', 'apt', 'liver'),
        ('Karrot', 'Biggiek', 'condo', 'chicken'),
        ('Tigress', 'Claw', 'street', 'trout'),
        ('Kitty', 'Meow', 'house', 'salmon'),
    ]
)

In [22]:
# Print one of the cat rows to inspect its contents.
print(cat3)

<Row('Karrot', 'Biggiek', 'condo', 'chicken')>


In [23]:
# Create Row elements

# One keyword-built Row per shelter, with fields `id` (a string) and `name`.
shelter1, shelter2, shelter3, shelter4 = (
    Row(id=shelter_id, name=shelter_name)
    for shelter_id, shelter_name in [
        ('1', 'CatColony'),
        ('2', 'Mauhaus'),
        ('3', 'BigCatHouse'),
        ('4', 'WindowCats'),
    ]
)

In [24]:
# Display one shelter Row; keyword-built Rows show their field names in the repr.
shelter2

Row(id='2', name='Mauhaus')

In [27]:
# Create Row elements

# Pair each shelter Row with the list of cat Rows housed there, giving nested
# Rows with fields `shelter` and `cats`.
shelterWithCats1, shelterWithCats2, shelterWithCats3, shelterWithCats4 = (
    Row(shelter=home, cats=residents)
    for home, residents in [
        (shelter1, [cat1, cat2]),
        (shelter2, [cat3, cat4]),
        (shelter3, [cat5, cat4, cat1]),
        (shelter4, [cat2, cat3]),
    ]
)

In [28]:
# Gather the four nested Rows into one list, the input for createDataFrame().
shelterWithCats = [shelterWithCats1, shelterWithCats2, shelterWithCats3, shelterWithCats4]

In [29]:
# Build a DataFrame from the list of nested Rows. No schema is passed here;
# the columns take the Rows' keyword field names (shelter, cats).
dframe = spark.createDataFrame(shelterWithCats)

In [30]:
# Print the nested rows; long cell values are cut off with "..." in the table.
dframe.show()

+----------------+--------------------+
|         shelter|                cats|
+----------------+--------------------+
|  [1, CatColony]|[[Dakota, Sweetie...|
|    [2, Mauhaus]|[[Karrot, Biggiek...|
|[3, BigCatHouse]|[[Kitty, Meow, ho...|
| [4, WindowCats]|[[George, Grumpy,...|
+----------------+--------------------+



In [34]:
# Stop the SparkSession created near the top of the notebook.
spark.stop()