In [25]:
import pyspark
import collections
from operator import add
from pyspark.sql import (Row, SparkSession)
from pyspark.sql.functions import (
    avg,
    col,
    round as rnd,
    asc,
    desc,
)

In [None]:
sc = pyspark.SparkContext('local[*]')

RDD : immutable distributed collection of objects

In [3]:
rdd = sc.parallelize(range(1000))
rdd.takeSample(False, 5)

                                                                                

[816, 823, 678, 238, 491]

In [8]:
test_file = "data/raw/word.txt"
text_file = sc.textFile(test_file)
counts = text_file.flatMap(lambda line : line.split(" ")) \
                .map(lambda word : (word, 1)) \
                .reduceByKey(lambda a, b: a + b)
print(counts.collect())  

[('world', 6), ('hello', 6)]


In [9]:
test_file = "data/raw/grade.txt"
text_file = sc.textFile(test_file)
grade = text_file.map(lambda line: line.split(" ")[1])

# Return the count of each unique value in this RDD as a dictionary of (value, count) pairs.
grade_count = grade.countByValue()
    
for grade, count in sorted(grade_count.items(), key= lambda item: item[1], reverse=True):
    print(f"{grade}: {count}")

90: 2
70: 1
80: 1
100: 1


In [11]:
# creating Key / Value RDD
# reduceByKey(): Merge the values for each key using an associative and commutative reduce function.
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
sorted(rdd.reduceByKey(add).collect())

[('a', 2), ('b', 1)]

In [12]:
# groupByKey(): Group the values for each key in the RDD into a single sequence. Hash-partitions the resulting RDD with numPartitions partitions.
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
sorted(rdd.groupByKey().mapValues(len).collect())

[('a', 2), ('b', 1)]

In [13]:
sorted(rdd.groupByKey().mapValues(list).collect())

[('a', [1, 1]), ('b', [1])]

In [14]:
# sortByKey(): Sorts this RDD, which is assumed to consist of (key, value) pairs.
tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]
sc.parallelize(tmp).sortByKey().first()

('1', 3)

In [15]:
# keys(), values(): Create a RDD of keys or just values
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
rdd.keys()

PythonRDD[52] at RDD at PythonRDD.scala:53

In [16]:
# join, rightOuterJoin, leftOuterJoin, cogroup, subtractByKey
x = sc.parallelize([("a", 1), ("b", 4)])
y = sc.parallelize([("a", 2), ("a", 3)])
sorted(x.join(y).collect())

[('a', (1, 2)), ('a', (1, 3))]

Efficiency is the key for performance!!!

if you only need values, use mapValues() or flatMapValues()

In [18]:
# filter
# Return a new RDD containing only the elements that satisfy a predicate.
test_file = "data/raw/temperature.csv"

def get_data(line, header):
    if line != header:
        col = line.split(',')
        city = col[6].strip("\"")
        avg_temp_fahr = col[4]
        yield (city, avg_temp_fahr)
    
lines = sc.textFile(test_file)

# get header string
header = lines.first()

parsed_line = lines.flatMap(lambda line: get_data(line, header))

# filter NA values
filtered_line = parsed_line.filter(lambda x: "NA" not in x[1])

# finding min temperature
min_temp = filtered_line.reduceByKey(lambda x, y: min(float(x), float(y)))
final_list = min_temp.collect();
for city, temperature in final_list:
    print(f"{city}: {temperature}")

Auckland: 49.856
NA: 12.4682
Johannesburg: 42.1772
Marseille: 39.3908
Odesa: 14.8838
Tottori: 34.2518
Warsaw: 6.8
BrasÃ­lia: 62.9744
Canoas: 50.009
Cape Town: 49.9946
Hamilton: 44.564
Kherson: 7.0952
Kiev: 2.85619999999999
Lvov: 7.1726
Paris: 25.0232
Stockholm: 13.3988
Tokyo: 29.156
Uppsala: 6.0494
Wroclaw: 9.167


In [19]:
test_file = "data/raw/house_price.csv"

def parse_line(line: str):
    city, price, count = line.split(',')
    return (int(price), int(count))

lines = sc.textFile(test_file)
price_count = lines.map(parse_line)
# [(10000, 3), (10000, 5), (40000, 7), (5000, 7), (4000, 2), (9000, 4), (5000, 7), (4000, 2), (8000, 9)]

sum_of_count = price_count.mapValues(lambda count: (count, 1))\
                .reduceByKey(lambda a, b: (int(a[0]) + int(b[0]), int(a[1]) + int(b[1]))) 

# ('10000', (3, 1)), ('10000', (5, 1)) ...
# [('10000', (8, 2)), ('4000', (4, 2)), ('9000', ('4', 1)), ('8000', ('9', 1)), ('40000', ('7', 1)), ('5000', (14, 2))]

avg_by_count = sum_of_count.mapValues(lambda total_count: int(total_count[0]) / total_count[1])
results = avg_by_count.collect()
print(results)

[(10000, 4.0), (40000, 7.0), (5000, 7.0), (4000, 2.0), (9000, 4.0), (8000, 9.0)]


In [21]:
# map vs. flatMap

# map transformation applies a function to each row in a DataFrame/Dataset and returns the new transformed Dataset.
# 1 => 1
# flatMap transformation flattens the DataFrame/Dataset after applying the function on every element and returns a new transformed Dataset. The returned Dataset will return more rows than the current DataFrame. It is also referred to as a one-to-many transformation function
# 1 => Many
# One of the use cases of flatMap() is to flatten column which contains arrays, list, or any nested collection

rdd = sc.parallelize([("name", "joe,sarah,tom"), ("car", "hyundai")])
result = rdd.map(lambda x: x[1].split(","))
print(result.collect())

[['joe', 'sarah', 'tom'], ['hyundai']]


In [22]:
rdd = sc.parallelize([("name", "joe,sarah,tom"), ("car", "hyundai")])
result = rdd.flatMap(lambda x: x[1].split(","))
print(result.collect())

['joe', 'sarah', 'tom', 'hyundai']


In [24]:
def parse_line(line: str):
    fields = line.split('|') # |
    return Row(
        name=str(fields[0]),
        country=str(fields[1]),
        email=str(fields[2]),
        compensation=int(fields[3]))


spark = SparkSession.builder.appName("SparkSQL").getOrCreate()
lines = spark.sparkContext.textFile("data/raw/income.txt")
income_data = lines.map(parse_line)

# Creates a DataFrame from an RDD, a list or a pandas.DataFrame.
# SparkSession.createDataFrame(data, schema=None, samplingRatio=None, verifySchema=True)[source]
schema_income = spark.createDataFrame(data=income_data).cache()

# Creates or replaces a local temporary view with this DataFrame.
schema_income.createOrReplaceTempView("income")

# returns the dataframe
medium_income_df = spark.sql(
    "SELECT * FROM income WHERE compensation >= 70000 AND compensation <= 100000")
medium_income_df.show()

                                                                                

+------------------+--------------------+--------------------+------------+
|              name|             country|               email|compensation|
+------------------+--------------------+--------------------+------------+
|  Willian Cummings|             Senegal|    areus@test.canon|       77369|
|      Clarita Gill|             Ecuador| tomaslau@test.games|       86986|
| Walter Washington|          Kazakhstan|mbilderbach@examp...|       91072|
|       Lexie Banks|                Mali|unterdreht@test.date|       97933|
|        Luise Hunt|               Kenya|adellecharles@tes...|       96175|
|     Sebrina Walsh|         Puerto Rico|andrewcohen@examp...|       99276|
|      Josiah Lyons|              Malawi|nandini_m@test.ry...|       91768|
|      Temeka Grant|              Israel|terryxlife@test.g...|       71642|
|  Narcisa Saunders|Palestinian Terri...|raquelwilson@exam...|       77287|
|      Lisbeth Lane|          Azerbaijan|coreyweb@test.coffee|       82473|
|       Evan

In [27]:
spark = SparkSession.builder.appName("sql_import_csv").getOrCreate()
csv_file_path = "data/raw/age.csv"

# header option: either csv has header or not (default: header = false)
# inferSchema: either all columns are str or not

data = spark.read.option("header", "true").option("inferSchema", "true").csv(csv_file_path)

# show schema
data.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- country: string (nullable = true)



In [28]:
# show column name with data
data.select("name", "age").show()

+-----------------+---+
|             name|age|
+-----------------+---+
|    Neville Hardy| 56|
|      Dacia Cohen| 74|
|    Kathey Daniel| 10|
|     Mallie Welch| 12|
|     Katia Bryant| 14|
|Laurice Robertson| 53|
|     Minh Barrett| 27|
|   Latashia Perez| 52|
|      Elvina Ross| 68|
|  Augustus Snyder| 20|
|        Elois Cox| 65|
|    Jolanda Dixon| 14|
|      Rutha Young| 10|
| Waltraud Holland| 10|
|   Colton Flowers| 77|
|     Meri Hawkins| 43|
|     Theola Mason| 71|
|  Antonia Pearson| 25|
|   Delicia Murray| 41|
|    Cicely Harvey| 37|
+-----------------+---+
only showing top 20 rows



In [29]:
# filter the data for age of 20 above
data.filter(data.age > 20).show()

+-----------------+---+--------------------+
|             name|age|             country|
+-----------------+---+--------------------+
|    Neville Hardy| 56|                Niue|
|      Dacia Cohen| 74|Falkland Islands ...|
|Laurice Robertson| 53|        Saudi Arabia|
|     Minh Barrett| 27|French Southern T...|
|   Latashia Perez| 52|             Finland|
|      Elvina Ross| 68|         New Zealand|
|        Elois Cox| 65|            Paraguay|
|   Colton Flowers| 77|Saint Vincent and...|
|     Meri Hawkins| 43|             Jamaica|
|     Theola Mason| 71|              Gambia|
|  Antonia Pearson| 25|             Namibia|
|   Delicia Murray| 41|         El Salvador|
|    Cicely Harvey| 37|              Belize|
|    Berry Russell| 49|       New Caledonia|
|   Lauryn Hubbard| 80|           Mauritius|
|    Judson Willis| 34|              Sweden|
|     Junita Meyer| 49|             Moldova|
|        Apryl Fox| 48|            Maldives|
|   Dorsey Wheeler| 56|Sao Tome and Prin...|
|       Ja

In [30]:
# group by age and aggregates for count
data.groupBy("age").count().show()

+---+-----+
|age|count|
+---+-----+
| 31|   16|
| 65|   16|
| 53|   14|
| 78|   16|
| 34|   15|
| 28|   11|
| 76|   14|
| 27|   10|
| 26|   15|
| 44|   14|
| 12|   15|
| 22|   17|
| 47|   17|
| 52|   16|
| 13|   14|
| 16|   18|
| 20|   12|
| 40|   22|
| 57|    9|
| 54|   17|
+---+-----+
only showing top 20 rows



In [31]:
# custom arithmetic
data.select(data.name, data.age, data.age - 10).show()

+-----------------+---+----------+
|             name|age|(age - 10)|
+-----------------+---+----------+
|    Neville Hardy| 56|        46|
|      Dacia Cohen| 74|        64|
|    Kathey Daniel| 10|         0|
|     Mallie Welch| 12|         2|
|     Katia Bryant| 14|         4|
|Laurice Robertson| 53|        43|
|     Minh Barrett| 27|        17|
|   Latashia Perez| 52|        42|
|      Elvina Ross| 68|        58|
|  Augustus Snyder| 20|        10|
|        Elois Cox| 65|        55|
|    Jolanda Dixon| 14|         4|
|      Rutha Young| 10|         0|
| Waltraud Holland| 10|         0|
|   Colton Flowers| 77|        67|
|     Meri Hawkins| 43|        33|
|     Theola Mason| 71|        61|
|  Antonia Pearson| 25|        15|
|   Delicia Murray| 41|        31|
|    Cicely Harvey| 37|        27|
+-----------------+---+----------+
only showing top 20 rows



In [32]:
# column alias
data.select(data.name, col("age").alias("age1")).show()

+-----------------+----+
|             name|age1|
+-----------------+----+
|    Neville Hardy|  56|
|      Dacia Cohen|  74|
|    Kathey Daniel|  10|
|     Mallie Welch|  12|
|     Katia Bryant|  14|
|Laurice Robertson|  53|
|     Minh Barrett|  27|
|   Latashia Perez|  52|
|      Elvina Ross|  68|
|  Augustus Snyder|  20|
|        Elois Cox|  65|
|    Jolanda Dixon|  14|
|      Rutha Young|  10|
| Waltraud Holland|  10|
|   Colton Flowers|  77|
|     Meri Hawkins|  43|
|     Theola Mason|  71|
|  Antonia Pearson|  25|
|   Delicia Murray|  41|
|    Cicely Harvey|  37|
+-----------------+----+
only showing top 20 rows



In [33]:
# average
data.select(data.name, data.age, data.country).groupBy("country").avg("age").show()

+--------------------+------------------+
|             country|          avg(age)|
+--------------------+------------------+
|                Chad|             36.25|
|            Paraguay| 47.77777777777778|
|            Anguilla|              72.0|
|               Macao|              72.0|
|Heard Island and ...|              30.0|
|             Senegal|              53.0|
|              Sweden|45.333333333333336|
|             Tokelau|34.166666666666664|
|French Southern T...|50.666666666666664|
|            Kiribati|48.666666666666664|
|   Republic of Korea|58.166666666666664|
|              Guyana|              39.0|
|             Eritrea|             39.75|
|              Jersey|              58.8|
|         Philippines|48.333333333333336|
|            Djibouti|              38.6|
|               Tonga|              49.0|
|      Norfolk Island|35.333333333333336|
|            Malaysia|60.666666666666664|
|           Singapore|              40.0|
+--------------------+------------

In [34]:
# average & Sort
data.select(data.name, data.age, data.country).groupBy("country").avg("age").sort("avg(age)").show()

+--------------------+------------------+
|             country|          avg(age)|
+--------------------+------------------+
|             Tunisia|              10.0|
|                Iran|              14.0|
|           Greenland|              14.5|
|                Cuba|              15.0|
|              Zambia|              16.0|
|          Costa Rica|              17.0|
|          Guadeloupe|              21.0|
|             Ireland|              21.0|
|            Suriname|23.333333333333332|
|    Saint Barthelemy|              24.0|
|              Taiwan|24.666666666666668|
|             Namibia|              25.0|
|             Moldova|             25.75|
|       Faroe Islands|              26.0|
|      Western Sahara|26.666666666666668|
|Bouvet Island (Bo...|26.666666666666668|
|Saint Kitts and N...|              27.0|
|             Vietnam|             27.25|
|   Equatorial Guinea|              27.5|
|           Gibraltar|27.666666666666668|
+--------------------+------------

In [35]:
# average & round
data.select(data.name, data.age, data.country).groupBy("country").agg(rnd(avg("age"), 2).alias("avg_age")).show()

+--------------------+-------+
|             country|avg_age|
+--------------------+-------+
|                Chad|  36.25|
|            Paraguay|  47.78|
|            Anguilla|   72.0|
|               Macao|   72.0|
|Heard Island and ...|   30.0|
|             Senegal|   53.0|
|              Sweden|  45.33|
|             Tokelau|  34.17|
|French Southern T...|  50.67|
|            Kiribati|  48.67|
|   Republic of Korea|  58.17|
|              Guyana|   39.0|
|             Eritrea|  39.75|
|              Jersey|   58.8|
|         Philippines|  48.33|
|            Djibouti|   38.6|
|               Tonga|   49.0|
|      Norfolk Island|  35.33|
|            Malaysia|  60.67|
|           Singapore|   40.0|
+--------------------+-------+
only showing top 20 rows

