# Spark

## How to install

    pip install pyspark

## Spark Manager

    localhost:4040

## How to run

    pyspark

## Spark The Definitive Guide

https://github.com/databricks/Spark-The-Definitive-Guide


## Databrics Sertification

In [8]:
###### from IPython.display import IFrame
IFrame("databricks_sertification.pdf", width=1100, height=700)

## Create Spark Session

In [4]:
from pyspark.sql import SparkSession
spark = (SparkSession.builder
         .master("local")
         .appName("Spark session")
         .getOrCreate())

## Read CSV

In [9]:
flight_data =  (spark.read
                .option("inferShema", "true")
                .option("header", "true")
                .csv("data/2010-summary.csv"))

## Basic Operations

In [13]:
# show
flight_data.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United States|        Afghanistan|    2|
|Saint Vincent and..

In [14]:
# show 10 rows
flight_data.take(10)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count='1'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count='264'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count='69'),
 Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', count='24'),
 Row(DEST_COUNTRY_NAME='Equatorial Guinea', ORIGIN_COUNTRY_NAME='United States', count='1'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count='25'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Grenada', count='54'),
 Row(DEST_COUNTRY_NAME='Costa Rica', ORIGIN_COUNTRY_NAME='United States', count='477'),
 Row(DEST_COUNTRY_NAME='Senegal', ORIGIN_COUNTRY_NAME='United States', count='29'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Marshall Islands', count='44')]

In [11]:
# explain
flight_data.explain()

== Physical Plan ==
*(1) FileScan csv [DEST_COUNTRY_NAME#23,ORIGIN_COUNTRY_NAME#24,count#25] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/esn/repos/awesomepy-notebooks/spark/data/2010-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:string>


In [16]:
# sort
flight_data.sort("count").take(10)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Algeria', count='1'),
 Row(DEST_COUNTRY_NAME='Malaysia', ORIGIN_COUNTRY_NAME='United States', count='1'),
 Row(DEST_COUNTRY_NAME='Azerbaijan', ORIGIN_COUNTRY_NAME='United States', count='1'),
 Row(DEST_COUNTRY_NAME='Equatorial Guinea', ORIGIN_COUNTRY_NAME='United States', count='1'),
 Row(DEST_COUNTRY_NAME='Liberia', ORIGIN_COUNTRY_NAME='United States', count='1'),
 Row(DEST_COUNTRY_NAME='Saint Vincent and the Grenadines', ORIGIN_COUNTRY_NAME='United States', count='1'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Vietnam', count='1'),
 Row(DEST_COUNTRY_NAME='Slovakia', ORIGIN_COUNTRY_NAME='United States', count='1'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Bosnia and Herzegovina', count='1'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Serbia', count='1')]

In [17]:
# convert df to table to use the normal sql
flight_data.createOrReplaceTempView("flight_data_table")

In [36]:
# how to make select operation
spark.sql("""
SELECT DEST_COUNTRY_NAME, count(1) as count
FROM flight_data_table
GROUP BY DEST_COUNTRY_NAME
ORDER BY count DESC
""").show()

+--------------------+-----+
|   DEST_COUNTRY_NAME|count|
+--------------------+-----+
|       United States|  131|
|British Virgin Is...|    1|
|              Russia|    1|
|            Paraguay|    1|
|             Senegal|    1|
|              Sweden|    1|
|            Kiribati|    1|
|              Guyana|    1|
|         Philippines|    1|
|            Malaysia|    1|
|           Singapore|    1|
|                Fiji|    1|
|              Turkey|    1|
|             Germany|    1|
|         Afghanistan|    1|
|              Jordan|    1|
|               Palau|    1|
|Turks and Caicos ...|    1|
|              France|    1|
|              Greece|    1|
+--------------------+-----+
only showing top 20 rows



In [37]:
# the same but using df
from pyspark.sql.functions import desc
flight_data.groupBy("DEST_COUNTRY_NAME").count().sort(desc("count")).show()

+--------------------+-----+
|   DEST_COUNTRY_NAME|count|
+--------------------+-----+
|       United States|  131|
|British Virgin Is...|    1|
|              Russia|    1|
|            Paraguay|    1|
|             Senegal|    1|
|              Sweden|    1|
|            Kiribati|    1|
|              Guyana|    1|
|         Philippines|    1|
|            Malaysia|    1|
|           Singapore|    1|
|                Fiji|    1|
|              Turkey|    1|
|             Germany|    1|
|         Afghanistan|    1|
|              Jordan|    1|
|               Palau|    1|
|Turks and Caicos ...|    1|
|              France|    1|
|              Greece|    1|
+--------------------+-----+
only showing top 20 rows



In [40]:
# SELECT col1, func(col1) as new_col
# FROM table1
# WHERE expr
# JOIN table2
# ON table1.col1 = table2.col1
# GROUP BY col1
# ORDER BY new_col
# HAVING expr

In [71]:
values1 = [(1, 'sport'),(2, 'casual'),(3, 'urban'),(4, 'modern'), (5, 'modern')]
df1 = spark.createDataFrame(values1,['id', 'style'])
 
values2 = [(2, 'jeans'),(3, 't-shirt'),(4, 'shoues'),(5, 'glasses')]
df2 = spark.createDataFrame(values2,['id', 'assortment'])

In [54]:
inner_join = df1.join(df2, df1.id == df2.id)
inner_join.select([df1.id, 'style', 'assortment']).show()

+---+------+----------+
| id| style|assortment|
+---+------+----------+
|  3| urban|     jeans|
|  4|modern|   t-shirt|
+---+------+----------+



In [64]:
(df1
 .join(df2, df1.id == df2.id)
 .select([df1.id, 'style', 'assortment'])
 .where(df1.style != 'urban')
 .show())

+---+------+----------+
| id| style|assortment|
+---+------+----------+
|  5|modern|   glasses|
|  2|casual|     jeans|
|  4|modern|    shoues|
+---+------+----------+



In [98]:
from pyspark.sql.functions import *

# SELECT 
# df1.style as sel_style,
# count(df1.style) as count
# FROM df1
# JOIN df2
# ON df1.id == df2.id 
# GROUP BY style
# ORDER BY style DESC
# WHERE df1.style <> 'urban'
# HANVING count > 1


(df1                                   
 .join(df2, df1.id == df2.id)          
 .groupBy('style')                     
 .agg(count('style').alias('count'))  # same as .count()
 .filter(column('count') > 1)  
 .withColumnRenamed('style', 'sel_style')
 .where(df1.style != 'urban')
 .sort(desc('count'))
 # .explain()
 .show())

+---------+-----+
|sel_style|count|
+---------+-----+
|   modern|    2|
+---------+-----+

