<a href="https://colab.research.google.com/github/priyadharshini13/pyspark/blob/main/PySpark_LearningSpark_Ch3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 35 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 59.5 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=c8f3f9eb5005cb622c4dd496be070f1edcd790565ad970474472d6d906e31532
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


In [4]:
# SparkSession is the entry point to the DataFrame API
from pyspark.sql import SparkSession

# Build (or reuse) a session named 'LearningPySpark' running on a local[4] master
spark = (
    SparkSession.builder
    .appName('LearningPySpark')
    .master('local[4]')
    .getOrCreate()
)

In [5]:
# Load the sample README as text; the resulting DataFrame has a single 'value' column
readme_path = '/content/sample_data/README.md'
df = spark.read.text(readme_path)
# truncate=False prints every value in full instead of shortening it
df.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                    |
+---------------------------------------------------------------------------------------------------------------------------------------------------------+
|This directory includes a few sample datasets to get you started.                                                                                        |
|                                                                                                                                                         |
|*   `california_housing_data*.csv` is California housing data from the 1990 US                                                                           |
|    Census; more information is available at:                  

In [6]:
# Number of rows in the README DataFrame
df.count()

19

In [7]:
# Rows whose text mentions '.csv'
st = df.where(df['value'].contains('.csv'))
print(st.count())

# Rows whose text contains a literal '*'
st1 = df.where(df['value'].contains('*'))
print(st1.count())

2
3


In [8]:
# 1. Defining schemas for a DataFrame
# There are two ways to do it:
#   1. programmatically, with StructType / StructField
#   2. with a DDL (Data Definition Language) string

# 1. Programmatic definition
from pyspark.sql.types import *
schema = StructType([
    StructField('author', StringType(), False),
    StructField('title', StringType(), False),
    StructField('pages', IntegerType(), False),
])

# 2. The same schema expressed as a DDL string
schema1 = 'author STRING, title STRING, pages INT'

# When no schema is supplied, Spark infers it from the data

# DDL schema for the blog records created below
schema_ddl = "`Id` INT, `First` STRING, `Last` STRING, `Url` STRING,`Published` STRING, `Hits` INT, `Campaigns` ARRAY<STRING>"

# Static sample data: one inner list per blog author, fields in schema_ddl order
data = [
    [1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter", "LinkedIn"]],
    [2, "Brooke", "Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter", "LinkedIn"]],
    [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web", "twitter", "FB", "LinkedIn"]],
    [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568, ["twitter", "FB"]],
    [5, "Matei", "Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web", "twitter", "FB", "LinkedIn"]],
    [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter", "LinkedIn"]],
]

blogs = spark.createDataFrame(data, schema=schema_ddl)

# vertical=True prints one record per block, one field per line;
# blogs.show() would print the usual tabular layout instead
blogs.show(vertical=True)

-RECORD 0-------------------------
 Id        | 1                    
 First     | Jules                
 Last      | Damji                
 Url       | https://tinyurl.1    
 Published | 1/4/2016             
 Hits      | 4535                 
 Campaigns | [twitter, LinkedIn]  
-RECORD 1-------------------------
 Id        | 2                    
 First     | Brooke               
 Last      | Wenig                
 Url       | https://tinyurl.2    
 Published | 5/5/2018             
 Hits      | 8908                 
 Campaigns | [twitter, LinkedIn]  
-RECORD 2-------------------------
 Id        | 3                    
 First     | Denny                
 Last      | Lee                  
 Url       | https://tinyurl.3    
 Published | 6/7/2019             
 Hits      | 7659                 
 Campaigns | [web, twitter, FB... 
-RECORD 3-------------------------
 Id        | 4                    
 First     | Tathagata            
 Last      | Das                  
 Url       | https:/

In [9]:
# Eager-evaluation settings for the notebook REPL, see
# https://spark.apache.org/docs/3.2.0/api/python/getting_started/quickstart_df.html
for conf_key, conf_value in (
    ('spark.sql.repl.eagerEval.enabled', True),
    ('spark.sql.repl.eagerEval.maxNumRows', 3),
):
    spark.conf.set(conf_key, conf_value)

In [10]:
# Default tabular rendering of the blogs DataFrame (long cell values are shortened with '...')
blogs.show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [11]:
# Inspect the DataFrame's schema: column names, data types and nullability
blogs.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Published: string (nullable = true)
 |-- Hits: integer (nullable = true)
 |-- Campaigns: array (nullable = true)
 |    |-- element: string (containsNull = true)



# DATAFRAMES
A DataFrame can be created  
1. from pandas
2. from an RDD of tuples - `parallelize`
3. from rows - `Row(1, ..)`
4. by defining a schema
    1. using DDL = 'a STRING, b INT'
    2. programmatically = StructType([StructField(...)])

In [12]:
# Sample data reused by the next few cells
columns = ["language", "users_count"]
data = [("Java", "20000"), ("Python", "100000"), ("Scala", "3000")]
rdd = spark.sparkContext.parallelize(data)

# 1. From an RDD
# 1.1 From an existing RDD via toDF()
# With no column names supplied the columns get the default names _1, _2
df = rdd.toDF()
print('Pyspark dataframe from RDD without column names')
df.show()
df.printSchema()

print('*****************************************************')

# Passing the names to toDF() labels the columns explicitly
df1 = rdd.toDF(schema=columns)
print('Pyspark dataframe from RDD with column names')
df1.show()
df1.printSchema()

Pyspark dataframe from RDD without column names
+------+------+
|    _1|    _2|
+------+------+
|  Java| 20000|
|Python|100000|
| Scala|  3000|
+------+------+

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)

*****************************************************
Pyspark dataframe from RDD with column names
+--------+-----------+
|language|users_count|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+

root
 |-- language: string (nullable = true)
 |-- users_count: string (nullable = true)



In [13]:
# 1.2 From an existing RDD through SparkSession.createDataFrame,
#     then naming the columns with toDF()
df_frmCreateDF = (
    spark.createDataFrame(rdd)
    .toDF(*columns)
)
df_frmCreateDF.show()

+--------+-----------+
|language|users_count|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+



In [14]:
# 2. From a Python list
# 2.1 - createDataFrame(list) followed by toDF() to name the columns
df_frmList = (
    spark.createDataFrame(data)
    .toDF(*columns)
)
df_frmList.show()

+--------+-----------+
|language|users_count|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+



In [15]:
# 2.2 - Using the Row type
from pyspark.sql import Row

# Build a real list rather than a one-shot `map` iterator: an iterator is exhausted
# after createDataFrame consumes it, so reusing `row_data` later would see no rows.
row_data = [Row(*record) for record in data]
df_frmList_2 = spark.createDataFrame(row_data, columns)
df_frmList_2.show()

+--------+-----------+
|language|users_count|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+



In [16]:
# 2.3 Using an explicit schema
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# One tuple per person, fields in the same order as the schema below
data2 = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

# Every field is declared nullable (third argument True)
schema = StructType([
    StructField("firstname", StringType(), True),
    StructField("middlename", StringType(), True),
    StructField("lastname", StringType(), True),
    StructField("id", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True),
])

df_frmSchema = spark.createDataFrame(data2, schema)
df_frmSchema.printSchema()
df_frmSchema.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|James    |          |Smith   |36636|M     |3000  |
|Michael  |Rose      |        |40288|M     |4000  |
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Jen      |Mary      |Brown   |     |F     |-1    |
+---------+----------+--------+-----+------+------+



In [17]:
import pandas as pd

# CSV of fire calls hosted in the LearningSparkV2 repository (chapter 3 data)
url = 'https://raw.githubusercontent.com/databricks/LearningSparkV2/master/chapter3/data/sf-fire-calls.csv'

# index_col=0 uses the first CSV column as the pandas index
df = pd.read_csv(url, index_col=0)

# Preview the first five rows
print(df.head(n=5))

           UnitID  IncidentNumber  ...          RowID     Delay
CallNumber                         ...                         
20110016      T13         2003235  ...  020110016-T13  2.950000
20110022      M17         2003241  ...  020110022-M17  4.700000
20110023      M41         2003242  ...  020110023-M41  2.433333
20110032      E11         2003250  ...  020110032-E11  1.500000
20110043      B04         2003259  ...  020110043-B04  3.483333

[5 rows x 27 columns]


  interactivity=interactivity, compiler=compiler, result=result)


In [18]:
# 3. Using datasources
# The two attempts below read the fire-calls CSV straight from its URL with spark.read; both are disabled.
# NOTE(review): presumably disabled because spark.read could not load an https:// URL directly — confirm,
# and consider downloading the file first and reading the local copy.

# Attempt 1: DataFrameReader.csv with header/delimiter options
# df_csv = spark.read.option('header', True).option('delimiter',',').csv("https://raw.githubusercontent.com/databricks/LearningSparkV2/master/chapter3/data/sf-fire-calls.csv")
# df_csv.show()
# chapter3/data/sf-fire-calls.csv
# https://raw.githubusercontent.com/databricks/LearningSparkV2/master/chapter3/data/sf-fire-calls.csv

# Attempt 2: generic format('csv').load(...)
# dff = spark.read.format('csv').option('header', True).load('https://raw.githubusercontent.com/databricks/LearningSparkV2/master/chapter3/data/sf-fire-calls.csv')
# dff.show()

Dec 3, 2021
# Columns and Expression

In [19]:
from pyspark.sql.functions import lit

# 1. lit() wraps a Python literal in a Column object
print('1. Creating column class object using lit()')
column_obj = lit('column_example')
print(column_obj)
print('**********************************')

# 2. The names passed to toDF() become the DataFrame's column names
#    (note that the first name deliberately contains a dot)
print('# 2. Creating column in a dataframe - mentioning the column names in toDF---')
data = [("James", 23), ("Ann", 40)]
df = (
    spark.createDataFrame(data)
    .toDF('names.firstname', 'age')
)
df.printSchema()

1. Creating column class object using lit()
Column<'column_example'>
**********************************
# 2. Creating column in a dataframe - mentioning the column names in toDF---
root
 |-- names.firstname: string (nullable = true)
 |-- age: long (nullable = true)



Accessing columns
  1. Using the DataFrame object (`df.age` or `df['age']`)
  2. Using bracket indexing with backticks (needed when the column name contains a dot)
  3. Using col()
  4. Using col() with backticks

In [20]:
from pyspark.sql.functions import col

# Each entry refers to a column in a different way; select() accepts all of them
column_refs = [
    'age',                      # plain column name passed as a string
    df.age,                     # 1. attribute access on the DataFrame
    df['age'],                  # 1. item access on the DataFrame
    df['`names.firstname`'],    # 2. item access, backticks because the name contains a dot
    col('age'),                 # 3. col()
    col('`names.firstname`'),   # 4. col() with backticks
]

# Show the selected column once per reference style, in the order listed above
for column_ref in column_refs:
    df.select(column_ref).show()


+---+
|age|
+---+
| 23|
| 40|
+---+

+---+
|age|
+---+
| 23|
| 40|
+---+

+---+
|age|
+---+
| 23|
| 40|
+---+

+---------------+
|names.firstname|
+---------------+
|          James|
|            Ann|
+---------------+

+---+
|age|
+---+
| 23|
| 40|
+---+

+---------------+
|names.firstname|
+---------------+
|          James|
|            Ann|
+---------------+



Arithmetic operation of column

In [21]:
data = [(100, 2, 1), (200, 3, 4), (300, 4, 4)]
df = spark.createDataFrame(data).toDF("col1", "col2", "col3")

# Arithmetic between columns yields a new Column expression; alias() names the result
addition = (df['col1'] + df['col2']).alias('Addition(Col1+Col2)')
subtraction = (df['col2'] - df['col1']).alias('Subtraction(Col2-col1)')
df.select(addition).show()
df.select(subtraction).show()

# Other operators that can be tried the same way:
#   arithmetic:  df.col1 + df.col2,  df.col1 - df.col2,  df.col1 * df.col2,
#                df.col1 / df.col2,  df.col1 % df.col2
#   comparison:  df.col2 > df.col3,  df.col2 < df.col3,  df.col2 == df.col3

+-------------------+
|Addition(Col1+Col2)|
+-------------------+
|                102|
|                203|
|                304|
+-------------------+

+----------------------+
|Subtraction(Col2-col1)|
+----------------------+
|                   -98|
|                  -197|
|                  -296|
+----------------------+



Column functions

In [22]:
# Sort the rows by col2, largest value first
descending_col2 = df['col2'].desc()
df.sort(descending_col2).show()

+----+----+----+
|col1|col2|col3|
+----+----+----+
| 300|   4|   4|
| 200|   3|   4|
| 100|   2|   1|
+----+----+----+



# Row

Different ways of creating Row

1. Using PySpark.sql - Row
    1. With named argument --> Row(id = 1, name="Priya")
    2. Without named argument -->  Row(1, 'Priya')
2. Create Custom class from Row
3. Using Row class on PySpark RDD
4. Using Row class - Dataframe
    1. named argument
    2. unnamed arguments
5. Nested row


Different ways of accessing rows.
1. For accessing rows with unnamed argument - Use index

  r1 = Row(1, 'Priya') ==> r1[0] # 1, r1[1] # Priya

2. For accessing rows with Named argument 
    1. Use key  ==>   print("r1['id']-->", r1['id'])
    2. Use dot operator  ==> print(r2.country.europe)



In [59]:
# Different ways of accessing a Row
r1 = [Row(1, 'John'), Row(2, 'Smith')]
print(r1)

# Positional access: first each whole Row, then the second field of each Row
for unnamed_row in r1:
    print(unnamed_row)
for unnamed_row in r1:
    print(unnamed_row[1])

print('--------------------Named argument access--------------')
r2 = Row(id=1, name='John', country=Row(europe='England', us='Dallas'))

# Key lookup for the top-level fields, attribute access into the nested Row
for field_name in ('id', 'name'):
    print(r2[field_name])
print(r2.country.europe)



[<Row(1, 'John')>, <Row(2, 'Smith')>]
<Row(1, 'John')>
<Row(2, 'Smith')>
John
Smith
--------------------Named argument access--------------
1
John
England


In [28]:
# 1. Using pyspark.sql - Row
from pyspark.sql import Row

# One Row built without named arguments, one built with them
blog_row = Row(1, "Reynold", "Xin", "https://tinyurl.6", 255568, "3/2/2015", ["twitter", "LinkedIn"])
blog1_row = Row(id=1, firstname="Reynold")

# Unnamed fields are reached by index; negative indexes count from the end
print(blog_row[1])
print(blog_row[-1])
print(blog_row[-1][-1])

# Named fields can be read by key or as attributes
print('--------------------------------------')
print('Named argument row:')
print('blog1_row----', blog1_row)
print("blog1_row['firstname']----", blog1_row['firstname'])
print('blog1_row.id-----', blog1_row.id)

# Compare how the two kinds of Row are displayed
print('without named argument - ', blog_row)
print('With named argument - ', blog1_row)

Reynold
['twitter', 'LinkedIn']
LinkedIn
--------------------------------------
Named argument row:
blog1_row---- Row(id=1, firstname='Reynold')
blog1_row['firstname']---- Reynold
blog1_row.id----- 1
without named argument -  <Row(1, 'Reynold', 'Xin', 'https://tinyurl.6', 255568, '3/2/2015', ['twitter', 'LinkedIn'])>
With named argument -  Row(id=1, firstname='Reynold')


In [24]:
# 2. Create a custom class from Row
# Row("Name", "Age") acts as a factory: calling it with values yields a Row with those field names
Person = Row("Name", "Age")
p1, p2 = Person("James", 40), Person("Robert", 50)
print(p1)
print(p2.Name)

Row(Name='James', Age=40)
Robert


In [25]:
# 3. Using the Row class with a PySpark RDD
from pyspark.sql import Row

# Method 1 - plain (unnamed) Row objects distributed as an RDD
data = [Row("Matei Zaharia", "CA"), Row("Reynold Xin", "CA")]
rdd = spark.sparkContext.parallelize(data)
print(rdd.collect())
# Print the first field of every collected Row
for author_row in rdd.collect():
    print(author_row[0])

# Method 2 - using a custom class, so the fields carry names
Person = Row('name', 'country')
data = [
    Person(*fields)
    for fields in (("Matei Zaharia", "CA"), ("Reynold Xin", "CA"))
]
print(data)

[<Row('Matei Zaharia', 'CA')>, <Row('Reynold Xin', 'CA')>]
Matei Zaharia
Reynold Xin
[Row(name='Matei Zaharia', country='CA'), Row(name='Reynold Xin', country='CA')]


In [30]:
# 4. Using the Row class to build DataFrames
from pyspark.sql import Row

# Unnamed arguments: the columns get the default names _1, _2
data = [Row(1, 'Priya'), Row(2, 'Dharshini')]
df = spark.createDataFrame(data)

# Named arguments: the keyword names become the column names
data1 = [Row(id=1, name='Priya'), Row(id=2, name='Dharshini')]
df1 = spark.createDataFrame(data1)

# Unnamed arguments again, with the column names handed to createDataFrame
# (alternative: spark.createDataFrame(rows).toDF('Authors', 'Country'))
rows = [Row("Matei Zaharia", "CA"), Row("Reynold Xin", "CA")]
df2 = spark.createDataFrame(rows, ['Authors', 'Country'])

# Show the three DataFrames, each under its label
for label, frame in (
    ('Un-named argument', df),
    ('Named argument', df1),
    ('explicit column name', df2),
):
    print(label)
    frame.show()


Un-named argument
+---+---------+
| _1|       _2|
+---+---------+
|  1|    Priya|
|  2|Dharshini|
+---+---------+

Named argument
+---+---------+
| id|     name|
+---+---------+
|  1|    Priya|
|  2|Dharshini|
+---+---------+

explicit column name
+-------------+-------+
|      Authors|Country|
+-------------+-------+
|Matei Zaharia|     CA|
|  Reynold Xin|     CA|
+-------------+-------+



In [51]:
# 5. Nested struct built from nested Rows
data = [Row(id=1, props=Row(firstname='Priya', lastname='Dharshini'))]
df = spark.createDataFrame(data)
df.show()
df.printSchema()

# collect() returns Row objects; nested fields can be read by key or by attribute
# (df.first() would return just the first Row)
for item in df.collect():
    props_by_key = item['props']
    print("item['id']-->", item['id'])
    print("item['props']-->", props_by_key)
    print("item['props']['firstname']-->", props_by_key['firstname'])
    print(type(item))
    print("item.props-->", item.props)
    print('item.props.lastname-->', item.props.lastname)

+---+------------------+
| id|             props|
+---+------------------+
|  1|{Priya, Dharshini}|
+---+------------------+

root
 |-- id: long (nullable = true)
 |-- props: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- lastname: string (nullable = true)

item['id']--> 1
item['props']--> Row(firstname='Priya', lastname='Dharshini')
item['props']['firstname']--> Priya
<class 'pyspark.sql.types.Row'>
item.props--> Row(firstname='Priya', lastname='Dharshini')
item.props.lastname--> Dharshini


+-------------+-------+
|      Authors|Country|
+-------------+-------+
|Matei Zaharia|     CA|
|  Reynold Xin|     CA|
+-------------+-------+

