<a href="https://colab.research.google.com/github/roitraining/jpmc_hadoop/blob/master/notebooks/Ch04_IntroToSpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Create the Spark context to start a session and connect to the cluster.

In [0]:
import sys

# Root folder for the class; datapath is built from it below.
rootpath = '/class/'
datapath = f'{rootpath}datasets/'  # note: this already ends with a trailing '/'
# Add rootpath to the module search path before importing the helper module
# (presumably pyspark_helpers.py lives directly under rootpath — confirm).
sys.path.append(rootpath)
from pyspark_helpers import *
# initspark is not defined in this notebook; presumably it comes from the star
# import above. Its three results are bound to sc, spark and conf, which the
# cells below rely on (sc is used for every RDD operation).
sc, spark, conf = initspark()


### Read a text file from the local file system.

In [0]:
# Load the text file as an RDD with one element per line.
# datapath already ends with '/', so no extra separator is added before 'text'
# (the original produced '.../datasets//text/shakespeare.txt').
shake = sc.textFile(f'{datapath}text/shakespeare.txt')
print(shake.count())   # number of lines in the file
print(shake.take(10))  # first ten lines, returned to the driver as a list

### Use the map method to apply a function call on each element.

In [0]:
# map applies the function to every element and yields a new RDD;
# here each line is converted to upper case.
shake2 = shake.map(lambda line: line.upper())
# Last expression in the cell, so the first ten results are displayed.
shake2.take(10)

### Using the split method you get a list of lists.

In [0]:
# Splitting inside map keeps one element per line, so each element
# becomes a list of the space-separated pieces of that line.
shake3 = shake.map(lambda line: line.split(' '))
print(shake3.take(10))
# Still one element per original line.
print(shake3.count())

In [0]:
### The flatMap method flattens the inner lists, returning one big list of strings instead of a list of lists.

In [0]:
# flatMap runs the same split but concatenates the per-line lists,
# so the resulting RDD has one element per space-separated piece.
shake4 = shake.flatMap(lambda line: line.split(' '))
print(shake4.take(20))
# Counts pieces rather than lines.
print(shake4.count())

### Parallelize loads manually created data into the Spark cluster as an RDD.

In [0]:
# Distribute the integers 1 through 10 as an RDD.
r = sc.parallelize(range(1, 11))
# collect brings every element back to the driver ...
print(r.collect())
# ... while take returns only the first five.
print(r.take(5))

### Load a folder stored on HDFS.

In [0]:
# Pointing textFile at the /categories path on HDFS (a folder, per the heading
# above) gives one RDD element per line of text.
cat = sc.textFile(
    'hdfs://localhost:9000/categories'
)
# Last expression in the cell: display every line.
cat.collect()

### Other useful actions.

In [0]:
# Each action below returns a plain Python list to the driver.
# Every line is labelled so the four outputs can be told apart
# (the original labelled only the first one, with a stray '1').

# Sort the whole RDD, then keep the first five elements.
print('sortBy + take:', cat.sortBy(lambda x: x).take(5))
# The five smallest elements in ascending order, without a full sort.
print('takeOrdered:', cat.takeOrdered(5))
# The five largest elements in descending order.
print('top:', cat.top(5))
# withReplacement=True means the same element may be drawn more than once.
# A fixed seed makes the sample repeatable across Restart & Run All.
print('takeSample:', cat.takeSample(True, 5, seed=42))


In [0]:
# Upper-case every line and bring the results back to the driver.
print(cat.map(lambda line: line.upper()).collect())


def email(x):
    """Print a message saying that an email is being sent to ``x``."""
    print(f'I am sending an email to {x}')


# foreach calls email once per element purely for its side effect.
# NOTE(review): the function runs wherever the partitions are processed, so the
# printed lines may show up in the worker/console output rather than in this
# cell — confirm in the class environment.
cat.foreach(email)

### Save the results in an RDD to disk. Note how it makes a folder and fills it with one part file per partition of the RDD (not necessarily one per node). Also, you must make sure that the folder does not already exist, or saveAsTextFile throws an exception.

In [0]:
! rm -r /home/student/file1.txt
cat.saveAsTextFile('/home/student/file1.txt')

In [0]:
# Upper-case every line of cat and print the collected list.
print(cat.map(lambda line: line.upper()).collect())

### Parse the string into a tuple to resemble a record structure.

In [0]:
# Step by step: split each line on commas into a tuple of strings,
# then rebuild the tuple with the first field converted to int.
cat1 = cat.map(lambda x : tuple(x.split(',')))
cat1 = cat1.map(lambda x : (int(x[0]), x[1], x[2]))

# The same pipeline written as a single chained expression ...
cat1 = cat.map(lambda x : tuple(x.split(','))).map(lambda x : (int(x[0]), x[1], x[2]))

# ... and once more, using a line continuation for readability.
cat1 = cat.map(lambda x : tuple(x.split(','))) \
          .map(lambda x : (int(x[0]), x[1], x[2]))

# Only the last expression of a cell is displayed, so the preview goes at the
# end (in the original it sat mid-cell and its result was silently discarded).
cat1.take(10)


### ***LAB:*** Put the regions folder found in /class/datasets/northwind/CSV/regions into HDFS. Read it into an RDD and convert it into a tuple shape.

In [0]:
# ! hadoop fs -rm -r /regions
# ! hadoop fs -put /class/datasets/northwind/CSV/regions /regions
############################################################################################## regions = sc.textFile('hdfs://localhost:9000/regions')
############################################################################################## regions = regions.map(lambda x : x.split(',')).map(lambda x : (int(x[0]), x[1]))
############################################################################################## print(regions.collect())

### The filter method takes a lambda that returns a True or False.

In [0]:
# Keep only the records whose first field is at most 5, then collect them.
cat1.filter(lambda rec: rec[0] <= 5).collect()


### The filter expressions can be more complicated.

In [0]:
def _even_and_has_e(rec):
    """Return True when the first field is even and the second contains 'e'."""
    return rec[0] % 2 == 0 and 'e' in rec[1]


# Any callable returning True/False works as the predicate, not just a lambda.
cat1.filter(_even_and_has_e).collect()

### The sortBy method takes a lambda that returns the value used to sort the data.

In [0]:
# Order the records by their third field (ascending is the default).
cat1.sortBy(lambda rec: rec[2]).collect()

### sortBy has an ascending parameter option to sort in reverse order.

In [0]:
# Order the records by their first field, largest first.
cat1.sortBy(lambda rec: rec[0], ascending=False).collect()