# Section 2

In [20]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
import collections


In [None]:
# Spark session & context
conf = SparkConf().setMaster("local").setAppName("Section2")
sc = SparkContext(conf = conf)

In [None]:
DATA_ROOT = "../spark_course_resources"

## Temperature

In [6]:
# min-temperatures.py
def parseLine(line):
    fields = line.split(',')
    stationID = fields[0]
    entryType = fields[2]
    temperature = float(fields[3]) * 0.1 * (9.0 / 5.0) + 32.0
    return (stationID, entryType, temperature)

lines = sc.textFile(f"{DATA_ROOT}/1800.csv")
parsedLines = lines.map(parseLine)
minTemps = parsedLines.filter(lambda x: "TMIN" in x[1])
stationTemps = minTemps.map(lambda x: (x[0], x[2]))
minTemps = stationTemps.reduceByKey(lambda x, y: min(x,y))
results = minTemps.collect();

for result in results:
    print(result[0] + "\t{:.2f}F".format(result[1]))


ITE00100554	5.36F
EZE00100082	7.70F


In [7]:
# max-temperatures.py
lines = sc.textFile(f"{DATA_ROOT}/1800.csv")
parsedLines = lines.map(parseLine)
minTemps = parsedLines.filter(lambda x: "TMAX" in x[1])
stationTemps = minTemps.map(lambda x: (x[0], x[2]))
minTemps = stationTemps.reduceByKey(lambda x, y: max(x,y))
results = minTemps.collect();

for result in results:
    print(result[0] + "\t{:.2f}F".format(result[1]))


ITE00100554	90.14F
EZE00100082	90.14F


## Word count

In [11]:
input = sc.textFile(f"{DATA_ROOT}/book.txt")
words = input.flatMap(lambda x: x.split())
wordCounts = words.countByValue()

for word, count in wordCounts.items():
    cleanWord = word.encode('ascii', 'ignore')
    if (cleanWord):
        pass # print(cleanWord.decode() + " " + str(count))


In [14]:
import re

def normalizeWords(text):
    return re.compile(r'\W+', re.UNICODE).split(text.lower())

words = input.flatMap(normalizeWords)
wordCounts = words.countByValue()

for word, count in wordCounts.items():
    cleanWord = word.encode('ascii', 'ignore')
    if (cleanWord):
        pass  # print(cleanWord.decode() + " " + str(count))

In [16]:
# Sort Results
wordCounts = words.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
wordCountsSorted = wordCounts.map(lambda x: (x[1], x[0])).sortByKey()
results = wordCountsSorted.collect()

for result in results:
    count = str(result[0])
    word = result[1].encode('ascii', 'ignore')
    if (word):
        pass  # print(word.decode() + ":\t\t" + count)



In [22]:
spark = SparkSession.builder.appName("WordCount").getOrCreate()

# Read each line of my book into a dataframe
inputDF = spark.read.text(f"{DATA_ROOT}/book.txt")

# Split using a regular expression that extracts words
words = inputDF.select(func.explode(func.split(inputDF.value, "\\W+")).alias("word"))
wordsWithoutEmptyString = words.filter(words.word != "")

# Normalize everything to lowercase
lowercaseWords = wordsWithoutEmptyString.select(func.lower(wordsWithoutEmptyString.word).alias("word"))

# Count up the occurrences of each word
wordCounts = lowercaseWords.groupBy("word").count()

# Sort by counts
wordCountsSorted = wordCounts.sort("count")

# Show the results.
# wordCountsSorted.show(wordCountsSorted.count())


## Customer Orders

In [38]:
def extract_customer_price_pairs(line):
    fields = line.split(',')
    return (int(fields[0]), float(fields[2]))

lines = sc.textFile(f"{DATA_ROOT}/customer-orders.csv")
mappedInput = lines.map(extract_customer_price_pairs)
total_by_costumer = mappedInput.reduceByKey(lambda x, y: x + y)

flipped = total_by_costumer.map(lambda x: (x[1], x[0]))

totalByCustomerSorted = flipped.sortByKey()

results = total_by_costumer.collect();

for result in results:
    print(result)

(44, 4756.8899999999985)
(35, 5155.419999999999)
(2, 5994.59)
(47, 4316.299999999999)
(29, 5032.529999999999)
(91, 4642.259999999999)
(70, 5368.249999999999)
(85, 5503.43)
(53, 4945.299999999999)
(14, 4735.030000000001)
(51, 4975.22)
(42, 5696.840000000003)
(79, 3790.570000000001)
(50, 4517.27)
(20, 4836.859999999999)
(15, 5413.510000000001)
(5, 4561.069999999999)
(48, 4384.33)
(31, 4765.05)
(4, 4815.050000000002)
(36, 4278.049999999997)
(57, 4628.4)
(12, 4664.589999999998)
(22, 5019.449999999999)
(54, 6065.389999999999)
(0, 5524.949999999998)
(88, 4830.549999999999)
(86, 4908.81)
(13, 4367.62)
(40, 5186.429999999999)
(98, 4297.260000000001)
(55, 5298.090000000002)
(95, 4876.840000000002)
(61, 5497.479999999998)
(27, 4915.889999999999)
(78, 4524.509999999999)
(83, 4635.799999999997)
(6, 5397.879999999998)
(26, 5250.4)
(75, 4178.500000000001)
(25, 5057.610000000001)
(71, 5995.660000000003)
(39, 6193.109999999999)
(60, 5040.709999999999)
(97, 5977.189999999995)
(7, 4755.070000000001)
(21

Py4JJavaError: An error occurred while calling o528.partitions.
: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: file:/sparkcourse/customer-orders.csv
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:208)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:291)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:291)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:287)
	at org.apache.spark.api.java.JavaRDDLike.partitions(JavaRDDLike.scala:61)
	at org.apache.spark.api.java.JavaRDDLike.partitions$(JavaRDDLike.scala:61)
	at org.apache.spark.api.java.AbstractJavaRDDLike.partitions(JavaRDDLike.scala:45)
	at jdk.internal.reflect.GeneratedMethodAccessor61.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.io.IOException: Input path does not exist: file:/sparkcourse/customer-orders.csv
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
	... 24 more
