<a href="https://colab.research.google.com/github/roitraining/SparkforDataEngineers/blob/Development/Ch01_IntroToSpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Create the Spark context to start a session and connect to the cluster.

In [2]:
import sys

# Root folder of the course material; the datasets live in a sub-folder of it.
rootpath = '/home/student/ROI/Spark/'
datapath = rootpath + 'datasets/'

# The root folder has to be on sys.path *before* the wildcard import below,
# presumably because pyspark_helpers.py is stored there -- confirm.
sys.path.append(rootpath)
from pyspark_helpers import *

# initspark() is provided by pyspark_helpers. `sc` is used as the SparkContext
# by the cells that follow; `spark` and `conf` are presumably the SparkSession
# and the Spark configuration -- verify against the helper module.
sc, spark, conf = initspark()


initializing pyspark
pyspark initialized


### Read a text file from the local file system.

In [3]:
# datapath already ends with '/', so the relative part must not start with
# another one (otherwise the path contains '.../datasets//text/...').
shake = sc.textFile(f'{datapath}text/shakespeare.txt')
print(shake.count())   # number of lines in the file
print(shake.take(10))  # first ten lines, returned to the driver as a list

124797
['King Joey is really Shakespeare', '\ufeffThe Project Gutenberg EBook of The Complete Works of William Shakespeare, by ', 'William Shakespeare', '', 'This eBook is for the use of anyone anywhere at no cost and with', 'almost no restrictions whatsoever.  You may copy it, give it away or', 're-use it under the terms of the Project Gutenberg License included', 'with this eBook or online at www.gutenberg.org', '', '** This is a COPYRIGHTED Project Gutenberg eBook, Details Below **']


### Use the map method to apply a function to each element.

In [4]:
# Upper-case every line; each input element produces exactly one output element.
shake2 = shake.map(lambda line: line.upper())
shake2.take(10)

['KING JOEY IS REALLY SHAKESPEARE',
 '\ufeffTHE PROJECT GUTENBERG EBOOK OF THE COMPLETE WORKS OF WILLIAM SHAKESPEARE, BY ',
 'WILLIAM SHAKESPEARE',
 '',
 'THIS EBOOK IS FOR THE USE OF ANYONE ANYWHERE AT NO COST AND WITH',
 'ALMOST NO RESTRICTIONS WHATSOEVER.  YOU MAY COPY IT, GIVE IT AWAY OR',
 'RE-USE IT UNDER THE TERMS OF THE PROJECT GUTENBERG LICENSE INCLUDED',
 'WITH THIS EBOOK OR ONLINE AT WWW.GUTENBERG.ORG',
 '',
 '** THIS IS A COPYRIGHTED PROJECT GUTENBERG EBOOK, DETAILS BELOW **']

### Using the split method you get a list of lists.

In [9]:
# map() keeps one output element per input line, so splitting each line on a
# single space gives one list of words per line (an RDD of lists).
shake3 = shake.map(lambda line: line.split(' '))
print(shake3.take(10))
print(shake3.count())  # still one element per line of the file

[['King', 'Joey', 'is', 'really', 'Shakespeare'], ['\ufeffThe', 'Project', 'Gutenberg', 'EBook', 'of', 'The', 'Complete', 'Works', 'of', 'William', 'Shakespeare,', 'by', ''], ['William', 'Shakespeare'], [''], ['This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with'], ['almost', 'no', 'restrictions', 'whatsoever.', '', 'You', 'may', 'copy', 'it,', 'give', 'it', 'away', 'or'], ['re-use', 'it', 'under', 'the', 'terms', 'of', 'the', 'Project', 'Gutenberg', 'License', 'included'], ['with', 'this', 'eBook', 'or', 'online', 'at', 'www.gutenberg.org'], [''], ['**', 'This', 'is', 'a', 'COPYRIGHTED', 'Project', 'Gutenberg', 'eBook,', 'Details', 'Below', '**']]
124797


### The flatMap method flattens the inner lists and returns one big list of strings instead of a list of lists.

In [11]:
# flatMap() merges the per-line word lists into a single RDD of words.
shake4 = shake.flatMap(lambda line: line.split(' '))
print(shake4.take(20))
print(shake4.count())  # counts individual tokens now (empty strings included)

['King', 'Joey', 'is', 'really', 'Shakespeare', '\ufeffThe', 'Project', 'Gutenberg', 'EBook', 'of', 'The', 'Complete', 'Works', 'of', 'William', 'Shakespeare,', 'by', '', 'William', 'Shakespeare']
1410764


### Parallelize loads manually created data into the Spark cluster as an RDD.

In [5]:
# parallelize() turns a local Python sequence (here the integers 1..10) into an RDD.
r = sc.parallelize(range(1,11))
print(r.collect())  # collect() brings every element back to the driver
print(r.take(5))    # take(5) returns only the first five elements

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[1, 2, 3, 4, 5]


### Load a folder stored on HDFS.

In [13]:
# textFile() is pointed at the HDFS folder /categories here; each line read
# from it becomes one string element of the RDD.
cat = sc.textFile('hdfs://localhost:9000/categories')
cat.collect()  # the data set is tiny, so pulling all of it to the driver is fine

['4,Dairy Products,Cheeses',
 '1,Beverages,Soft drinks coffees teas beers and ales',
 '2,Condiments,Sweet and savory sauces relishes spreads and seasonings',
 '3,Confections,Desserts candies and sweet breads',
 '5,Grains/Cereals,Breads crackers pasta and cereal',
 '6,Meat/Poultry,Prepared meats',
 '7,Produce,Dried fruit and bean curd',
 '8,Seafood,Seaweed and fish']

### Other useful actions.

In [18]:
# Four ways of pulling a handful of elements back to the driver.
print('1', cat.sortBy(lambda line: line).take(5))   # sort everything, keep the first five
print(cat.takeOrdered(5))                             # five smallest, ascending
print(cat.top(5))                                     # five largest, descending
print(cat.takeSample(withReplacement=True, num=5))    # random five; elements may repeat



1 ['1,Beverages,Soft drinks coffees teas beers and ales', '2,Condiments,Sweet and savory sauces relishes spreads and seasonings', '3,Confections,Desserts candies and sweet breads', '4,Dairy Products,Cheeses', '5,Grains/Cereals,Breads crackers pasta and cereal']
['1,Beverages,Soft drinks coffees teas beers and ales', '2,Condiments,Sweet and savory sauces relishes spreads and seasonings', '3,Confections,Desserts candies and sweet breads', '4,Dairy Products,Cheeses', '5,Grains/Cereals,Breads crackers pasta and cereal']
['8,Seafood,Seaweed and fish', '7,Produce,Dried fruit and bean curd', '6,Meat/Poultry,Prepared meats', '5,Grains/Cereals,Breads crackers pasta and cereal', '4,Dairy Products,Cheeses']
['5,Grains/Cereals,Breads crackers pasta and cereal', '4,Dairy Products,Cheeses', '1,Beverages,Soft drinks coffees teas beers and ales', '3,Confections,Desserts candies and sweet breads', '4,Dairy Products,Cheeses']


In [20]:
# Upper-case every category line and print the collected result.
print(cat.map(lambda line: line.upper()).collect())

def email(x):
    """Print a one-line notice saying an email is being sent to ``x``.

    Nothing is actually sent: the function only writes the message to
    standard output and returns None.
    """
    message = f'I am sending an email to {x}'
    print(message)
    
# foreach() calls email() once per element purely for its side effect.
# NOTE(review): no "I am sending an email" lines appear in this cell's saved
# output -- presumably the prints happen in the Spark worker processes rather
# than in the notebook; confirm on your setup.
cat.foreach(email)

['4,DAIRY PRODUCTS,CHEESES', '1,BEVERAGES,SOFT DRINKS COFFEES TEAS BEERS AND ALES', '2,CONDIMENTS,SWEET AND SAVORY SAUCES RELISHES SPREADS AND SEASONINGS', '3,CONFECTIONS,DESSERTS CANDIES AND SWEET BREADS', '5,GRAINS/CEREALS,BREADS CRACKERS PASTA AND CEREAL', '6,MEAT/POULTRY,PREPARED MEATS', '7,PRODUCE,DRIED FRUIT AND BEAN CURD', '8,SEAFOOD,SEAWEED AND FISH']


### Save the results in an RDD to disk. Note how it makes a folder and fills it with one file per partition of the RDD (i.e. as many files as there are tasks solving the problem). Also, you must make sure that the folder does not already exist, or it throws an exception.

In [22]:
! rm -r /home/student/file1.txt
cat.saveAsTextFile('/home/student/file1.txt')

Py4JJavaError: An error occurred while calling o492.saveAsTextFile.
: org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory file:/home/student/file1.txt already exists
	at org.apache.hadoop.mapred.FileOutputFormat.checkOutputSpecs(FileOutputFormat.java:131)
	at org.apache.spark.internal.io.HadoopMapRedWriteConfigUtil.assertConf(SparkHadoopWriter.scala:287)
	at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:71)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1096)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1094)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:1067)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1032)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:958)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:958)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:958)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:957)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1499)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1478)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1478)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1478)
	at org.apache.spark.api.java.JavaRDDLike$class.saveAsTextFile(JavaRDDLike.scala:550)
	at org.apache.spark.api.java.AbstractJavaRDDLike.saveAsTextFile(JavaRDDLike.scala:45)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [None]:
# Same upper-casing demo as the earlier cell, repeated here.
print(cat.map(lambda line: line.upper()).collect())

### Parse the string into a tuple to resemble a record structure.

In [24]:
# Split each CSV line into its fields, then cast the first field to int so
# every record becomes an (id, name, description) tuple. The two map() calls
# are chained in a single expression; the surrounding parentheses let the
# chain span several lines without backslash continuations.
cat1 = (
    cat.map(lambda line: tuple(line.split(',')))
       .map(lambda fields: (int(fields[0]), fields[1], fields[2]))
)

# Keep the action as the last expression so the notebook displays its result.
cat1.take(10)


[(4, 'Dairy Products', 'Cheeses'),
 (1, 'Beverages', 'Soft drinks coffees teas beers and ales'),
 (2, 'Condiments', 'Sweet and savory sauces relishes spreads and seasonings'),
 (3, 'Confections', 'Desserts candies and sweet breads'),
 (5, 'Grains/Cereals', 'Breads crackers pasta and cereal'),
 (6, 'Meat/Poultry', 'Prepared meats'),
 (7, 'Produce', 'Dried fruit and bean curd'),
 (8, 'Seafood', 'Seaweed and fish')]

### ***LAB:*** Put the regions folder found in /home/student/ROI/Spark/datasets/northwind/CSV/regions into HDFS. Read it into an RDD and convert it into a tuple shape.

In [34]:
# Step 1 (shell, run once): copy the local regions folder into HDFS.
#! hadoop fs -rm -r /regions
#! hadoop fs -put /home/student/ROI/Spark/datasets/northwind/CSV/regions /regions

# Step 2: read the folder and turn each "id,name" line into an (int, str) tuple.
# Equivalent two-statement form:
#regions = sc.textFile('hdfs://localhost:9000/regions')
#regions = regions.map(lambda x : x.split(',')).map(lambda x : (int(x[0]), x[1]))
regions = (
    sc.textFile('hdfs://localhost:9000/regions')
      .map(lambda line: line.split(','))
      .map(lambda fields: (int(fields[0]), fields[1]))
)

print(regions.collect())
# Optional: write the parsed records back to HDFS.
#regions.saveAsTextFile('hdfs://localhost:9000/regions2')

# Negating the numeric id sorts on it in descending order ...
print(regions.sortBy(lambda region: -region[0]).collect())
# ... while sorting on the second field orders the records by region name.
print(regions.sortBy(lambda region: region[1]).collect())


[(5, 'Artic'), (6, 'Desert'), (1, 'Eastern'), (2, 'Western'), (3, 'Northern'), (4, 'Southern')]
[(6, 'Desert'), (5, 'Artic'), (4, 'Southern'), (3, 'Northern'), (2, 'Western'), (1, 'Eastern')]
[(5, 'Artic'), (6, 'Desert'), (1, 'Eastern'), (3, 'Northern'), (4, 'Southern'), (2, 'Western')]


In [42]:
# cat1 holds flat (id, name, description) tuples.
print(cat1.collect())

# Key/value operations work on 2-tuples, so reshape each record to
# (id, (name, description)) before sorting by key.
cat2 = cat1.map(lambda rec: (rec[0], (rec[1], rec[2])))
print(cat2.collect())
print(cat2.sortByKey().collect())

# Pair every id with a 1 and add the 1s per key to count records per id.
# No bare collect() on cat3: its result would be discarded and it would only
# trigger an extra Spark job.
cat3 = cat1.map(lambda rec: (rec[0], 1))
print('*', cat3.reduceByKey(lambda left, right: left + right).collect())



[(4, 'Dairy Products', 'Cheeses'), (1, 'Beverages', 'Soft drinks coffees teas beers and ales'), (2, 'Condiments', 'Sweet and savory sauces relishes spreads and seasonings'), (3, 'Confections', 'Desserts candies and sweet breads'), (5, 'Grains/Cereals', 'Breads crackers pasta and cereal'), (6, 'Meat/Poultry', 'Prepared meats'), (7, 'Produce', 'Dried fruit and bean curd'), (8, 'Seafood', 'Seaweed and fish')]
[(4, ('Dairy Products', 'Cheeses')), (1, ('Beverages', 'Soft drinks coffees teas beers and ales')), (2, ('Condiments', 'Sweet and savory sauces relishes spreads and seasonings')), (3, ('Confections', 'Desserts candies and sweet breads')), (5, ('Grains/Cereals', 'Breads crackers pasta and cereal')), (6, ('Meat/Poultry', 'Prepared meats')), (7, ('Produce', 'Dried fruit and bean curd')), (8, ('Seafood', 'Seaweed and fish'))]
[(1, ('Beverages', 'Soft drinks coffees teas beers and ales')), (2, ('Condiments', 'Sweet and savory sauces relishes spreads and seasonings')), (3, ('Confections', 

### The filter method takes a lambda that returns a True or False.

In [None]:
# Keep only the records whose id (first field) is at most 5.
cat1.filter(lambda rec: rec[0] <= 5).collect()


### The filter expressions can be more complicated.

In [None]:
# Both conditions must hold: an even id and a lowercase 'e' somewhere in the name.
cat1.filter(lambda rec: rec[0] % 2 == 0 and 'e' in rec[1]).collect()

### The sortBy method takes a lambda that returns the value used to sort the data.

In [None]:
# Order the records by their third field (the description).
cat1.sortBy(lambda rec: rec[2]).collect()

### sortBy has an optional ascending parameter to sort in reverse order.

In [None]:
# Order the records by id, largest first.
cat1.sortBy(lambda rec: rec[0], ascending=False).collect()