<a href="https://colab.research.google.com/github/roitraining/SparkforDataEngineers/blob/Development/Ch01_IntroToSpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Create the Spark context to start a session and connect to the cluster.

In [1]:
import sys

# Base folder of the course material; datapath is its 'datasets/' subfolder.
# Both strings end with '/', so anything appended should not start with one.
rootpath = '/home/student/ROI/Spark/'
datapath = f'{rootpath}datasets/'
# Add rootpath to the module search path before importing pyspark_helpers
# (presumably that module lives under rootpath — confirm).
sys.path.append(rootpath)
# NOTE(review): the wildcard import hides which names come from pyspark_helpers;
# initspark is the only one this cell uses.
from pyspark_helpers import *
# initspark() returns three objects, bound here as sc, spark and conf.
sc, spark, conf = initspark()


initializing pyspark
pyspark initialized


### Read a text file from the local file system.

In [2]:
# datapath already ends with '/', so the relative part must not start with one
# (the previous form produced '.../datasets//text/shakespeare.txt').
shake = sc.textFile(f'{datapath}text/shakespeare.txt')
print(shake.count())    # number of elements (lines) in the RDD
print(shake.take(10))   # first ten elements, returned as a Python list

124797
['King Joey is really Shakespeare', '\ufeffThe Project Gutenberg EBook of The Complete Works of William Shakespeare, by ', 'William Shakespeare', '', 'This eBook is for the use of anyone anywhere at no cost and with', 'almost no restrictions whatsoever.  You may copy it, give it away or', 're-use it under the terms of the Project Gutenberg License included', 'with this eBook or online at www.gutenberg.org', '', '** This is a COPYRIGHTED Project Gutenberg eBook, Details Below **']


### Use the map method to apply a function to each element.

In [3]:
# Apply str.upper to every element of shake; the result is a new RDD.
shake2 = shake.map(str.upper)
shake2.take(10)

# The same idea as a one-line chain: read, upper-case, keep the lines containing 'KING', take ten.
sc.textFile(f'{datapath}text/shakespeare.txt').map(str.upper).filter(lambda line : 'KING' in line).take(10)

# Parentheses replace the backslash continuations: the last chain used to end
# with a dangling '\', which joins the statement to whatever follows it.
# The lambda parameter is named `line` so it no longer shadows the variable x.

# x is a Python list here because the chain ends with the take action
x = (
    sc.textFile(f'{datapath}text/shakespeare.txt')
      .map(str.upper)
      .filter(lambda line : 'KING' in line)
      .take(10)
)

# x is an RDD here because the chain contains only transformations
x = (
    sc.textFile(f'{datapath}text/shakespeare.txt')
      .map(str.upper)
      .filter(lambda line : 'KING' in line)
)


['KING JOEY IS REALLY SHAKESPEARE',
 '\ufeffTHE PROJECT GUTENBERG EBOOK OF THE COMPLETE WORKS OF WILLIAM SHAKESPEARE, BY ',
 'WILLIAM SHAKESPEARE',
 '',
 'THIS EBOOK IS FOR THE USE OF ANYONE ANYWHERE AT NO COST AND WITH',
 'ALMOST NO RESTRICTIONS WHATSOEVER.  YOU MAY COPY IT, GIVE IT AWAY OR',
 'RE-USE IT UNDER THE TERMS OF THE PROJECT GUTENBERG LICENSE INCLUDED',
 'WITH THIS EBOOK OR ONLINE AT WWW.GUTENBERG.ORG',
 '',
 '** THIS IS A COPYRIGHTED PROJECT GUTENBERG EBOOK, DETAILS BELOW **']

In [5]:
# Read the 'categories' path on HDFS as text, then show the element count
# followed by the full contents, one result per output line.
categories = sc.textFile('hdfs://localhost:9000/categories')
print(categories.count(), categories.collect(), sep='\n')

8
['5,Grains/Cereals,Breads crackers pasta and cereal', '1,Beverages,Soft drinks coffees teas beers and ales', '2,Condiments,Sweet and savory sauces relishes spreads and seasonings', '3,Confections,Desserts candies and sweet breads', '4,Dairy Products,Cheeses', '6,Meat/Poultry,Prepared meats', '7,Produce,Dried fruit and bean curd', '8,Seafood,Seaweed and fish']


### Using the split method inside map, you get a list of lists (one list of words per line).

In [14]:
# map keeps one output element per input element, so every line becomes
# its own list of space-separated pieces.
shake3 = shake.map(lambda line : line.split(' '))
print(shake3.take(10), shake3.count(), sep='\n')

[['Joeyz'], ['The', 'Project', 'Gutenberg', 'EBook', 'of', 'The', 'Complete', 'Works', 'of', 'William', 'Shakespeare,', 'by', ''], ['William', 'Shakespeare'], [''], ['This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with'], ['almost', 'no', 'restrictions', 'whatsoever.', '', 'You', 'may', 'copy', 'it,', 'give', 'it', 'away', 'or'], ['re-use', 'it', 'under', 'the', 'terms', 'of', 'the', 'Project', 'Gutenberg', 'License', 'included'], ['with', 'this', 'eBook', 'or', 'online', 'at', 'www.gutenberg.org'], [''], ['**', 'This', 'is', 'a', 'COPYRIGHTED', 'Project', 'Gutenberg', 'eBook,', 'Details', 'Below', '**']]
124797


### The flatMap method flattens the inner lists, returning one big list of strings instead of a list of lists.

In [15]:
# flatMap concatenates the per-line lists, so the elements are the individual
# space-separated pieces rather than one list per line.
shake4 = shake.flatMap(lambda line : line.split(' '))
print(shake4.take(20), shake4.count(), sep='\n')

['Joeyz', 'The', 'Project', 'Gutenberg', 'EBook', 'of', 'The', 'Complete', 'Works', 'of', 'William', 'Shakespeare,', 'by', '', 'William', 'Shakespeare', '', 'This', 'eBook', 'is']
1410760


### Parallelize loads manually created data into the Spark cluster as an RDD.

In [None]:
# range(1, 11) supplies the integers 1 through 10 as the RDD's elements.
r = sc.parallelize(range(1, 11))
print(r.collect(), r.take(5), sep='\n')

### Load a folder stored on HDFS.

In [6]:
# Read the 'categories' path from HDFS (name node at localhost:9000) as text.
# NOTE(review): this is the same path an earlier cell loads into `categories`.
cat = sc.textFile('hdfs://localhost:9000/categories')
# As the cell's last expression, the collected list is displayed as the cell output.
cat.collect()

['5,Grains/Cereals,Breads crackers pasta and cereal',
 '1,Beverages,Soft drinks coffees teas beers and ales',
 '2,Condiments,Sweet and savory sauces relishes spreads and seasonings',
 '3,Confections,Desserts candies and sweet breads',
 '4,Dairy Products,Cheeses',
 '6,Meat/Poultry,Prepared meats',
 '7,Produce,Dried fruit and bean curd',
 '8,Seafood,Seaweed and fish']

### Other useful actions.

In [10]:
# Sort by everything after the id field. Cutting at the first comma (instead of
# slicing off two characters with x[2:]) still works once ids have more than
# one digit; for the single-digit ids in this data the sort key is identical.
print('1', cat.sortBy(lambda row : row.partition(',')[2]).take(5))
# First five elements in ascending order of the raw strings.
print(cat.takeOrdered(5))
# First five elements in descending order of the raw strings.
print(cat.top(5))
# Five randomly chosen elements; the first argument (withReplacement) is False.
# No seed is passed, so the result differs from run to run.
print(cat.takeSample(False,5))


1 ['1,Beverages,Soft drinks coffees teas beers and ales', '2,Condiments,Sweet and savory sauces relishes spreads and seasonings', '3,Confections,Desserts candies and sweet breads', '4,Dairy Products,Cheeses', '5,Grains/Cereals,Breads crackers pasta and cereal']
['1,Beverages,Soft drinks coffees teas beers and ales', '2,Condiments,Sweet and savory sauces relishes spreads and seasonings', '3,Confections,Desserts candies and sweet breads', '4,Dairy Products,Cheeses', '5,Grains/Cereals,Breads crackers pasta and cereal']
['8,Seafood,Seaweed and fish', '7,Produce,Dried fruit and bean curd', '6,Meat/Poultry,Prepared meats', '5,Grains/Cereals,Breads crackers pasta and cereal', '4,Dairy Products,Cheeses']
['7,Produce,Dried fruit and bean curd', '5,Grains/Cereals,Breads crackers pasta and cereal', '2,Condiments,Sweet and savory sauces relishes spreads and seasonings', '1,Beverages,Soft drinks coffees teas beers and ales', '4,Dairy Products,Cheeses']


In [13]:
#print(cat.map(str.upper).collect())

def email(x):
    """Print a one-line notice saying an email is being sent to ``x``."""
    notice = f'I am sending an email to {x}'
    print(notice)

    
# Alternative 1 (commented out): collect() the elements into a list first,
# then call email on each one in a plain Python loop.
# for x in cat.collect():
#     email(x)

# Alternative 2 (commented out): same loop, but iterating over toLocalIterator()
# instead of collect().
# for x in cat.toLocalIterator():
#     email(x)

# Hand the function to foreach so it is called for every element of cat.
# NOTE(review): foreach is executed by Spark rather than by a loop in this cell,
# so the printed lines may appear in the worker/console log instead of under
# this cell — confirm in your environment.
cat.foreach(email)

### Save the results in an RDD to disk. Note how it makes a folder and fills it with part files, one for each partition of the RDD (that is, one per task that worked on the problem). Also, you must make sure that the folder does not exist or it throws an exception.

In [None]:
! rm -r /home/student/file1.txt
cat.saveAsTextFile('/home/student/file1.txt')

In [None]:
# Upper-case every element of cat with str.upper, collect the results and print them.
print(cat.map(str.upper).collect())

### Parse the string into a tuple to resemble a record structure.

In [19]:
# Two chained maps: first split each line on commas into a tuple of strings,
# then rebuild the tuple with the first field converted to int.
cat1 = (
    cat.map(lambda line : tuple(line.split(',')))
       .map(lambda fields : (int(fields[0]), fields[1], fields[2]))
)
cat1.take(10)

# Order the records by their third field and return all of them.
cat1.sortBy(lambda rec : rec[2]).collect()

[(5, 'Grains/Cereals', 'Breads crackers pasta and cereal'),
 (4, 'Dairy Products', 'Cheeses'),
 (3, 'Confections', 'Desserts candies and sweet breads'),
 (7, 'Produce', 'Dried fruit and bean curd'),
 (6, 'Meat/Poultry', 'Prepared meats'),
 (8, 'Seafood', 'Seaweed and fish'),
 (1, 'Beverages', 'Soft drinks coffees teas beers and ales'),
 (2, 'Condiments', 'Sweet and savory sauces relishes spreads and seasonings')]

### ***LAB:*** Put the regions folder found in /home/student/ROI/Spark/datasets/northwind/CSV/regions into HDFS. Read it into an RDD and convert it into a tuple shape.

In [None]:
# Shell steps that copy the regions folder into HDFS (commented out here):
#! hadoop fs -rm -r /regions
#! hadoop fs -put /home/student/ROI/Spark/datasets/northwind/CSV/regions /regions

# Read the lines, split each one on commas, and keep the first field as an int
# together with the second field.
regions = (
    sc.textFile('hdfs://localhost:9000/regions')
      .map(lambda line : line.split(','))
      .map(lambda fields : (int(fields[0]), fields[1]))
)

print(regions.collect())
#regions.saveAsTextFile('hdfs://localhost:9000/regions2')
# Negating the numeric first field sorts it from largest to smallest.
print(regions.sortBy(lambda region : - region[0]).collect())
# Sort by the second field.
print(regions.sortBy(lambda region : region[1]).collect())


In [None]:
print(cat1.collect())

# Reshape each 3-tuple into a (key, value) pair: the first field becomes the
# key and the remaining two fields become the value tuple.
cat2 = cat1.map(lambda rec : (rec[0], (rec[1], rec[2])))
print(cat2.collect())
print(cat2.sortByKey().collect())

# Pair every key with the constant 1 so that reduceByKey can add the ones up per key.
cat3 = cat1.map(lambda rec : (rec[0], 1))
cat3.collect()
print('*', cat3.reduceByKey(lambda left, right : left + right).collect())


### The filter method takes a function (here a lambda) that returns True or False.

In [20]:
# Keep only the records whose first field is 5 or less.
cat1.filter(lambda rec : rec[0] <= 5).collect()


[(5, 'Grains/Cereals', 'Breads crackers pasta and cereal'),
 (1, 'Beverages', 'Soft drinks coffees teas beers and ales'),
 (2, 'Condiments', 'Sweet and savory sauces relishes spreads and seasonings'),
 (3, 'Confections', 'Desserts candies and sweet breads'),
 (4, 'Dairy Products', 'Cheeses')]

### The filter expressions can be more complicated.

In [None]:
# Keep the records whose first field is even and whose second field contains an 'e'.
cat1.filter(lambda rec : rec[0] % 2 == 0 and 'e' in rec[1]).collect()

### The sortBy method takes a function that returns the value used to sort the data.

In [None]:
# Sort the records by their third field.
cat1.sortBy(lambda rec : rec[2]).collect()

### sortBy has an optional `ascending` parameter to sort in reverse order.

In [None]:
# Sort by the first field, largest first.
cat1.sortBy(lambda rec : rec[0], ascending = False).collect()

In [None]:
# Build an RDD from two hand-written dictionaries, one per student.
students = sc.parallelize([
    {'ID':1, 'FirstName':'Joey'},
    {'ID':2, 'FirstName':'Mary'},
])
