<a href="https://colab.research.google.com/github/ralsouza/apache_spark_real_time_analytics/blob/master/02_pyspark_transformation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install & Setup

In [1]:
# Install the dependencies
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [2]:
# Environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"
 
# tornar o pyspark "importável"
import findspark
findspark.init('spark-2.4.4-bin-hadoop2.7')

In [3]:
# Libraries and Context Setup
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

# create the session
conf = SparkConf().set("spark.ui.port", "4050")

# create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [None]:
# Check context
print(sc)

<SparkContext master=local[*] appName=pyspark-shell>


# Transformation

In [None]:
# Python list
lst1 = [124,940,652,102,397]

In [None]:
# Check data type
type(lst1)

list

In [None]:
# Load data into a collection
lst1_rdd = sc.parallelize(lst1)

In [None]:
# Check data type
type(lst1_rdd)

pyspark.rdd.RDD

In [None]:
# Print collection
lst1_rdd.collect()

[124, 940, 652, 102, 397]

In [None]:
# Counting elements
lst1_rdd.count()

5

In [None]:
# Load RDD from a text file
file = '/content/drive/My Drive/Colab Notebooks/08-apache-spark/data/carros.csv'
auto_data_rdd = sc.textFile(file)

In [None]:
# Check data type
type(auto_data_rdd)

pyspark.rdd.RDD

In [None]:
# An action operation - Returning the first row from dataset, the header
auto_data_rdd.first()

'MAKE,FUELTYPE,ASPIRE,DOORS,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE'

In [None]:
# Returning the 5 firsts rows and the header
auto_data_rdd.take(5)

['MAKE,FUELTYPE,ASPIRE,DOORS,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE',
 'subaru,gas,std,two,hatchback,fwd,four,69,4900,31,36,5118',
 'chevrolet,gas,std,two,hatchback,fwd,three,48,5100,47,53,5151',
 'mazda,gas,std,two,hatchback,fwd,four,68,5000,30,31,5195',
 'toyota,gas,std,two,hatchback,fwd,four,62,4800,35,39,5348']

In [None]:
# Each partition generates a new computation process
# But we can persist the data into a cache to be used for other actions
# without necessity of a new computation
auto_data_rdd.cache()

/content/drive/My Drive/Colab Notebooks/08-apache-spark/data/carros.csv MapPartitionsRDD[4] at textFile at NativeMethodAccessorImpl.java:0

In [None]:
# List all rows
for row in auto_data_rdd.collect():
  print(row)

MAKE,FUELTYPE,ASPIRE,DOORS,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE
subaru,gas,std,two,hatchback,fwd,four,69,4900,31,36,5118
chevrolet,gas,std,two,hatchback,fwd,three,48,5100,47,53,5151
mazda,gas,std,two,hatchback,fwd,four,68,5000,30,31,5195
toyota,gas,std,two,hatchback,fwd,four,62,4800,35,39,5348
mitsubishi,gas,std,two,hatchback,fwd,four,68,5500,37,41,5389
honda,gas,std,two,hatchback,fwd,four,60,5500,38,42,5399
nissan,gas,std,two,sedan,fwd,four,69,5200,31,37,5499
dodge,gas,std,two,hatchback,fwd,four,68,5500,37,41,5572
plymouth,gas,std,two,hatchback,fwd,four,68,5500,37,41,5572
mazda,gas,std,two,hatchback,fwd,four,68,5000,31,38,6095
mitsubishi,gas,std,two,hatchback,fwd,four,68,5500,31,38,6189
dodge,gas,std,four,hatchback,fwd,four,68,5500,31,38,6229
plymouth,gas,std,four,hatchback,fwd,four,68,5500,31,38,6229
chevrolet,gas,std,two,hatchback,fwd,four,70,5400,38,43,6295
toyota,gas,std,two,hatchback,fwd,four,62,4800,31,38,6338
dodge,gas,std,two,hatchback,fwd,four,68,5500,31,38,6377

In [None]:
# map() to create a new rdd - Transformation (Lazy Evaluation)
tsv_data = auto_data_rdd.map(lambda x:x.replace(',','\t'))
tsv_data.take(5)

['MAKE\tFUELTYPE\tASPIRE\tDOORS\tBODY\tDRIVE\tCYLINDERS\tHP\tRPM\tMPG-CITY\tMPG-HWY\tPRICE',
 'subaru\tgas\tstd\ttwo\thatchback\tfwd\tfour\t69\t4900\t31\t36\t5118',
 'chevrolet\tgas\tstd\ttwo\thatchback\tfwd\tthree\t48\t5100\t47\t53\t5151',
 'mazda\tgas\tstd\ttwo\thatchback\tfwd\tfour\t68\t5000\t30\t31\t5195',
 'toyota\tgas\tstd\ttwo\thatchback\tfwd\tfour\t62\t4800\t35\t39\t5348']

In [None]:
# Filter by Toyota word
toyota_data = auto_data_rdd.filter(lambda x: 'toyota' in x)

In [None]:
# Action
toyota_data.count()

32

In [None]:
# See results
toyota_data.take(20)

['toyota,gas,std,two,hatchback,fwd,four,62,4800,35,39,5348',
 'toyota,gas,std,two,hatchback,fwd,four,62,4800,31,38,6338',
 'toyota,gas,std,four,hatchback,fwd,four,62,4800,31,38,6488',
 'toyota,gas,std,four,wagon,fwd,four,62,4800,31,37,6918',
 'toyota,gas,std,four,sedan,fwd,four,70,4800,30,37,6938',
 'toyota,gas,std,four,hatchback,fwd,four,70,4800,30,37,7198',
 'toyota,gas,std,four,sedan,fwd,four,70,4800,38,47,7738',
 'toyota,diesel,std,four,hatchback,fwd,four,56,4500,38,47,7788',
 'toyota,gas,std,four,wagon,4wd,four,62,4800,27,32,7898',
 'toyota,diesel,std,four,sedan,fwd,four,56,4500,34,36,7898',
 'toyota,gas,std,two,sedan,rwd,four,70,4800,29,34,8058',
 'toyota,gas,std,two,hatchback,rwd,four,70,4800,29,34,8238',
 'toyota,gas,std,four,hatchback,fwd,four,70,4800,28,34,8358',
 'toyota,gas,std,two,hardtop,rwd,four,116,4800,24,30,8449',
 'toyota,gas,std,four,wagon,4wd,four,62,4800,27,32,8778',
 'toyota,gas,std,four,sedan,fwd,four,92,4200,29,34,8948',
 'toyota,gas,std,four,sedan,fwd,four,70,

In [None]:
# Write results
save_rdd = open('/content/drive/My Drive/Colab Notebooks/08-apache-spark/data/toyota_data.csv','w')
save_rdd.write('\n'.join(auto_data_rdd.collect()))
save_rdd.close()

## Set, Outer Join and Distinct Operations

In [None]:
# Set operations
words1 = sc.parallelize(['Big Data','Data Science','Analytics','Visualization'])
words2 = sc.parallelize(['Big Data','R','Python','Scala'])

### Union

In [None]:
# Union
for unions in words1.union(words2).distinct().collect():
  print(unions)

Big Data
Python
Data Science
Analytics
R
Visualization
Scala


In [None]:
rdd01 = sc.parallelize(range(1,10))
rdd02 = sc.parallelize(range(10,21))

rdd01.union(rdd02).collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

### Intersection

In [None]:
# Intersection
for intersection in words1.intersection(words2).distinct().collect():
  print(intersection)

Big Data


In [None]:
rdd03 = sc.parallelize(range(1,10))
rdd04 = sc.parallelize(range(5,15))

rdd03.intersection(rdd04).collect()

[8, 5, 9, 6, 7]

### Left/Rigth Outer Join

In [None]:
# Join
names1 = sc.parallelize(('banana','grape','orange')).map(lambda a: (a,1))
names2 = sc.parallelize(('orange','pineapple','strawberry')).map(lambda a: (a,1))

names1.join(names2).collect()

[('orange', (1, 1))]

In [None]:
# Left
names1.leftOuterJoin(names2).collect()

[('orange', (1, 1)), ('grape', (1, None)), ('banana', (1, None))]

In [None]:
# Right
names1.rightOuterJoin(names2).collect()

[('orange', (1, 1)), ('pineapple', (None, 1)), ('strawberry', (None, 1))]

### Distinct

In [None]:
list1 = [124,901,652,102,397,124,397,652]

In [None]:
list_rdd = sc.parallelize(list1)

In [None]:
for num_data in list_rdd.distinct().collect():
  print(num_data)

124
652
102
901
397


## Cleaning

In [None]:
# Load RDD from a text file
file = '/content/drive/My Drive/Colab Notebooks/08-apache-spark/data/carros.csv'
auto_data_rdd = sc.textFile(file)

In [None]:
auto_data_rdd.collect()

['MAKE,FUELTYPE,ASPIRE,DOORS,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE',
 'subaru,gas,std,two,hatchback,fwd,four,69,4900,31,36,5118',
 'chevrolet,gas,std,two,hatchback,fwd,three,48,5100,47,53,5151',
 'mazda,gas,std,two,hatchback,fwd,four,68,5000,30,31,5195',
 'toyota,gas,std,two,hatchback,fwd,four,62,4800,35,39,5348',
 'mitsubishi,gas,std,two,hatchback,fwd,four,68,5500,37,41,5389',
 'honda,gas,std,two,hatchback,fwd,four,60,5500,38,42,5399',
 'nissan,gas,std,two,sedan,fwd,four,69,5200,31,37,5499',
 'dodge,gas,std,two,hatchback,fwd,four,68,5500,37,41,5572',
 'plymouth,gas,std,two,hatchback,fwd,four,68,5500,37,41,5572',
 'mazda,gas,std,two,hatchback,fwd,four,68,5000,31,38,6095',
 'mitsubishi,gas,std,two,hatchback,fwd,four,68,5500,31,38,6189',
 'dodge,gas,std,four,hatchback,fwd,four,68,5500,31,38,6229',
 'plymouth,gas,std,four,hatchback,fwd,four,68,5500,31,38,6229',
 'chevrolet,gas,std,two,hatchback,fwd,four,70,5400,38,43,6295',
 'toyota,gas,std,two,hatchback,fwd,four,62,4800,31,3

In [None]:
# Cleaning funtion
def clean_rdd(auto_str):
  ''' Function to clean dataset auto '''

  # Check indexing
  if isinstance(auto_str,int):
    return auto_str

  # Split each index by comma
  att_list = auto_str.split(',')

  # Converts the door number to a number
  if att_list[3] == 'two':
    att_list[3] = '2'
  elif att_list[3] == 'four':
    att_list[3] = '4'

  # Converts the car model to uppercase
  att_list[5] = att_list[4].upper()
  return ','.join(att_list)

In [None]:
# Transformation - Put the job in the queue 
clean_rdd = auto_data_rdd.map(clean_rdd)

In [None]:
print(clean_rdd)

PythonRDD[83] at RDD at PythonRDD.scala:53


In [None]:
# Process the transformation
clean_rdd.collect()

['MAKE,FUELTYPE,ASPIRE,DOORS,BODY,BODY,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE',
 'subaru,gas,std,2,hatchback,HATCHBACK,four,69,4900,31,36,5118',
 'chevrolet,gas,std,2,hatchback,HATCHBACK,three,48,5100,47,53,5151',
 'mazda,gas,std,2,hatchback,HATCHBACK,four,68,5000,30,31,5195',
 'toyota,gas,std,2,hatchback,HATCHBACK,four,62,4800,35,39,5348',
 'mitsubishi,gas,std,2,hatchback,HATCHBACK,four,68,5500,37,41,5389',
 'honda,gas,std,2,hatchback,HATCHBACK,four,60,5500,38,42,5399',
 'nissan,gas,std,2,sedan,SEDAN,four,69,5200,31,37,5499',
 'dodge,gas,std,2,hatchback,HATCHBACK,four,68,5500,37,41,5572',
 'plymouth,gas,std,2,hatchback,HATCHBACK,four,68,5500,37,41,5572',
 'mazda,gas,std,2,hatchback,HATCHBACK,four,68,5000,31,38,6095',
 'mitsubishi,gas,std,2,hatchback,HATCHBACK,four,68,5500,31,38,6189',
 'dodge,gas,std,4,hatchback,HATCHBACK,four,68,5500,31,38,6229',
 'plymouth,gas,std,4,hatchback,HATCHBACK,four,68,5500,31,38,6229',
 'chevrolet,gas,std,2,hatchback,HATCHBACK,four,70,5400,38,43,6295',
 't

# Actions

In [4]:
# Reduce - Sum two values
lst2 = [144,901,652,102,397,124,901,652]
lst_rdd = sc.parallelize(lst2)

In [5]:
lst_rdd.collect()

[144, 901, 652, 102, 397, 124, 901, 652]

In [6]:
lst_rdd.reduce(lambda x,y: x + y)

3873

In [7]:
# Load RDD from a text file
file = '/content/drive/My Drive/Colab Notebooks/08-apache-spark/data/carros.csv'
auto_data_rdd = sc.textFile(file)

In [8]:
auto_data_rdd.reduce(lambda x,y: x if len(x) < len(y) else y)

'bmw,gas,std,two,sedan,rwd,six,182,5400,16,22,41315'

In [9]:
# Making a reduction function
def get_mpg(auto_str):
  if isinstance(auto_str,int):
    return auto_str
  
  att_list = auto_str.split(',')

  if att_list[9].isdigit():
    return int(att_list[9])
  else:
    return 0

In [11]:
# Finding the mean
mean_mpg = round(auto_data_rdd.reduce(lambda x,y: get_mpg(x) + get_mpg(y)) / (auto_data_rdd.count() -1), 2)
print(mean_mpg)

25.15


In [15]:
# Sampling data
teams = sc.parallelize(['flamengo','vasco','palmeiras','botafogo','bahia'])
teams.takeSample(True,3)

['flamengo', 'palmeiras', 'palmeiras']

In [17]:
# Count by key
teams = sc.parallelize(['flamengo','vasco','palmeiras','botafogo','bahia','bahia','palmeiras'])
teams.map(lambda k: (k,1)).countByKey().items()

dict_items([('flamengo', 1), ('vasco', 1), ('palmeiras', 2), ('botafogo', 1), ('bahia', 2)])

In [18]:
# Saving results
auto_data_rdd.saveAsTextFile('/content/drive/My Drive/Colab Notebooks/08-apache-spark/data/teams.txt')