# "Pyspark quickstart"
> "pyspark quickstart"

- toc: true
- branch: master
- badges: true
- comments: true
- categories: [computing]
- tags: [spark, python]

## Creating RDD

In [5]:
# Build an RDD from a Python list, then collect every element back to the driver.
numbers = [1, 2, 3, 4, 5]
parallelRdd = sc.parallelize(numbers)
parallelRdd.collect()

0,1,2
▸,:,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[1, 2, 3, 4, 5]

<IPython.core.display.Javascript object>

In [6]:
# A tuple works as well: each item becomes one element of the RDD.
letter_tuple = ('a', 'b', 'c')
rdd = sc.parallelize(letter_tuple)
rdd.collect()

0,1,2
▸,:,


<IPython.core.display.Javascript object>

['a', 'b', 'c']

<IPython.core.display.Javascript object>

In [7]:
# Another list example, this time holding strings.
letter_list = ['a', 'b', 'c', 'd']
rdd = sc.parallelize(letter_list)
rdd.collect()

0,1,2
▸,:,


<IPython.core.display.Javascript object>

['a', 'b', 'c', 'd']

<IPython.core.display.Javascript object>

In [8]:
# Sets are accepted too; a set is unordered, so the collected elements come back in arbitrary order.
letter_set = {'a', 'b', 'c', 'd'}
rdd = sc.parallelize(letter_set)
rdd.collect()

0,1,2
▸,:,


<IPython.core.display.Javascript object>

['c', 'd', 'b', 'a']

<IPython.core.display.Javascript object>

In [10]:
# A dict is iterated like any other collection, i.e. over its keys, so the values are not carried into the RDD.
letter_to_number = {'a': 1, 'b': 2, 'c': 3}
rdd = sc.parallelize(letter_to_number)

rdd.collect()

0,1,2
▸,:,


<IPython.core.display.Javascript object>

['a', 'b', 'c']

<IPython.core.display.Javascript object>

As you can see, only the dictionary's keys are used to form the RDD; the values are dropped.

In [None]:
# read from csv file
# NOTE: '' is a placeholder path (this cell was left unexecuted); supply the path to a real file before running.
rdd = sc.textFile('')
# Peek at the first two elements only instead of collecting the whole file.
rdd.take(2)

In [18]:
# create empty rdd
# emptyRDD() yields an RDD with no elements, so isEmpty() below evaluates to True.
rdd = sc.emptyRDD()
rdd.isEmpty()

0,1,2
▸,:,


True

In [25]:
# Build an RDD whose elements are namedtuple records.
from collections import namedtuple

Person = namedtuple("Person", ["id", "firstName", "lastName"])
jon = Person(id=1, firstName="Jon", lastName="Doe")
jane = Person(id=2, firstName="Jane", lastName="Doe")
people = [jon, jane]
rdd = sc.parallelize(people)
rdd.collect()

0,1,2
▸,:,


<IPython.core.display.Javascript object>

[Person(id=1, firstName='Jon', lastName='Doe'),
 Person(id=2, firstName='Jane', lastName='Doe')]

<IPython.core.display.Javascript object>

### Test RDD Functions

In [20]:
# Histogram
rdd = sc.parallelize([1,2,3,4,5])
# An integer argument requests that many evenly spaced buckets between the RDD's
# minimum and maximum; the result is a (bucket_boundaries, counts) pair.
# The bucket count is written as the literal 10 rather than the equivalent but
# confusing `[0,10][1]`.
rdd.histogram(10)

0,1,2
▸,:,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

([1.0,
  1.4,
  1.8,
  2.2,
  2.6,
  3.0,
  3.4000000000000004,
  3.8000000000000003,
  4.2,
  4.6,
  5],
 [1, 0, 1, 0, 0, 1, 0, 1, 0, 1])

<IPython.core.display.Javascript object>

In [21]:
# sum: add up all elements of the RDD (uses the `rdd` defined in the histogram cell)
rdd.sum()

0,1,2
▸,:,


<IPython.core.display.Javascript object>

15

<IPython.core.display.Javascript object>

In [26]:
# Count the distinct elements of `rdd`.
# NOTE(review): the recorded output (2) does not match the five-element RDD defined above;
# judging by the execution counters it was presumably produced while `rdd` still held the
# two Person records — re-run the notebook top to bottom to confirm.
rdd.distinct().count()

0,1,2
▸,:,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

2

<IPython.core.display.Javascript object>

In [27]:
# Turn a plain RDD into a pair RDD: key each number by its remainder modulo 3.
numbers_rdd = sc.parallelize([1, 2, 3, 4, 5, 6])
numbers_rdd.map(lambda n: (n % 3, n)).collect()

0,1,2
▸,:,


<IPython.core.display.Javascript object>

[(1, 1), (2, 2), (0, 3), (1, 4), (2, 5), (0, 6)]

<IPython.core.display.Javascript object>

In [29]:
# first: return only the first element of the RDD
sc.parallelize([1,2,3,4,5,6]).first()

0,1,2
▸,:,


<IPython.core.display.Javascript object>

1

<IPython.core.display.Javascript object>

# DataFrames

### Create a DataFrame by reading a CSV file

In [None]:
# NOTE: `path` is an empty placeholder (this cell was left unexecuted); point it at a real CSV file before running.
csvData = spark.read.csv( path='',
                         sep=',',
                         encoding='UTF-8',
                         comment=None,
                         header=True,  # first line of the file supplies the column names
                         inferSchema=True)  # let Spark work out column types from the data

# Print the first 5 rows without truncating long cell values.
csvData.show(n=5, truncate=False)

### Create a DataFrame with the `createDataFrame` function

In [76]:
# Build a small two-column DataFrame from in-memory rows.
# Despite its name, `rdd` holds a DataFrame from this point on.
rows = [(1, 'foo'), (2, 'bar')]
columns = ['id', 'txt']  # column names; the column types are inferred from the rows
rdd = spark.createDataFrame(rows, columns)

rdd.show(2)

0,1,2
▸,:,


<IPython.core.display.Javascript object>

+---+---+
| id|txt|
+---+---+
|  1|foo|
|  2|bar|
+---+---+



<IPython.core.display.Javascript object>

## Change column names

#### Option-1

In [66]:
from pyspark.sql.functions import col  # note: `col` is not used by the selectExpr call below
# selectExpr accepts SQL expression strings; "<column> as <alias>" renames the column in the result.
rdd.selectExpr("id as emp_id", "txt as name").printSchema()


0,1,2
▸,:,


root
 |-- emp_id: long (nullable = true)
 |-- name: string (nullable = true)



#### Option-2

In [75]:
from pyspark.sql.functions import col

# Rename by selecting each column under an alias.
renamed = rdd.select(
    col("id").alias("emp_id"),
    col("txt").alias("name"),
)
renamed.printSchema()

0,1,2
▸,:,


root
 |-- emp_id: long (nullable = true)
 |-- name: string (nullable = true)



#### Option-3

In [78]:
# Expose the DataFrame to Spark SQL under the view name "rdd", then rename the
# columns with plain SQL aliases.
# createOrReplaceTempView supersedes the deprecated registerTempTable and can be
# re-run safely because it replaces any existing view of the same name.
rdd.createOrReplaceTempView("rdd")
output = spark.sql("SELECT id AS emp_id, txt as Name from rdd")
output.printSchema()

0,1,2
▸,:,


root
 |-- emp_id: long (nullable = true)
 |-- Name: string (nullable = true)



## Group by and Aggregate

In [38]:
# Sample salary data used by the group-by / order-by examples: one row per employee.
salary_rows = [
    ('GOOG', 1, 200000),
    ('GOOG', 2, 150000),
    ('AAPL', 3, 175000),
    ('AAPL', 4, 180000),
]
salary_columns = ['company', 'emp_id', 'salary']  # column names, in row order
rdd = spark.createDataFrame(salary_rows, salary_columns)

rdd.show()

0,1,2
▸,:,


<IPython.core.display.Javascript object>

+-------+------+------+
|company|emp_id|salary|
+-------+------+------+
|   GOOG|     1|200000|
|   GOOG|     2|150000|
|   AAPL|     3|175000|
|   AAPL|     4|180000|
+-------+------+------+



<IPython.core.display.Javascript object>

In [42]:
# groupBy on its own only returns a GroupedData handle; an aggregation (max, agg, count, ...)
# has to be applied to it to get a result that can be shown.
rdd.groupBy('company')

0,1,2
▸,:,


<pyspark.sql.group.GroupedData at 0x7f351e56d278>

In [44]:
rdd.groupBy('company').max().show()

# With no column argument, max() reports the per-company maximum of both emp_id and salary.

0,1,2
▸,:,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

+-------+-----------+-----------+
|company|max(emp_id)|max(salary)|
+-------+-----------+-----------+
|   AAPL|          4|     180000|
|   GOOG|          2|     200000|
+-------+-----------+-----------+



<IPython.core.display.Javascript object>

In [45]:
# Passing a column name restricts the aggregate to that column: maximum salary per company.
rdd.groupBy('company').max('salary').show()

0,1,2
▸,:,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

+-------+-----------+
|company|max(salary)|
+-------+-----------+
|   AAPL|     180000|
|   GOOG|     200000|
+-------+-----------+



<IPython.core.display.Javascript object>

In [53]:
# Average salary per company via agg()

grouped_rdd = rdd.groupBy("company")
# The dict maps a column name to the aggregate function to apply to it.
grouped_rdd.agg({'salary':'avg'}).show()

0,1,2
▸,:,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

+-------+-----------+
|company|avg(salary)|
+-------+-----------+
|   AAPL|   177500.0|
|   GOOG|   175000.0|
+-------+-----------+



<IPython.core.display.Javascript object>

In [55]:
# Number of rows per company, largest groups first (show at most 5 groups).
(
    rdd.groupBy("company")
    .count()
    .orderBy("count", ascending=False)
    .show(5)
)

0,1,2
▸,:,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

+-------+-----+
|company|count|
+-------+-----+
|   AAPL|    2|
|   GOOG|    2|
+-------+-----+



<IPython.core.display.Javascript object>

## Order by

In [39]:
# orderBy with just a column name sorts in ascending order.
rdd.orderBy('salary').show()

0,1,2
▸,:,


<IPython.core.display.Javascript object>

+-------+------+------+
|company|emp_id|salary|
+-------+------+------+
|   GOOG|     2|150000|
|   AAPL|     3|175000|
|   AAPL|     4|180000|
|   GOOG|     1|200000|
+-------+------+------+



<IPython.core.display.Javascript object>

In [41]:
# Sort from highest to lowest salary by ordering on a descending column expression.
salary_desc = rdd["salary"].desc()
rdd.orderBy(salary_desc).show()

0,1,2
▸,:,


<IPython.core.display.Javascript object>

+-------+------+------+
|company|emp_id|salary|
+-------+------+------+
|   GOOG|     1|200000|
|   AAPL|     4|180000|
|   AAPL|     3|175000|
|   GOOG|     2|150000|
+-------+------+------+



<IPython.core.display.Javascript object>

## Adding & Dropping Columns

In [60]:
from pyspark.sql.types import DoubleType

# Add a new column holding the salary cast to double; the original columns are kept.
salary_as_double = rdd['salary'].cast(DoubleType())
addCol = rdd.withColumn("doubleSalary", salary_as_double)

addCol.printSchema()

0,1,2
▸,:,


root
 |-- company: string (nullable = true)
 |-- emp_id: long (nullable = true)
 |-- salary: long (nullable = true)
 |-- doubleSalary: double (nullable = true)



In [61]:
# drop removes the named column from the result, leaving the original three columns in the schema.
addCol.drop("doubleSalary").printSchema()

0,1,2
▸,:,


root
 |-- company: string (nullable = true)
 |-- emp_id: long (nullable = true)
 |-- salary: long (nullable = true)



## Joins