<a href="https://colab.research.google.com/github/ralsouza/apache_spark_real_time_analytics/blob/master/notebooks/02_pyspark_transformation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install & Setup

In [1]:
# Install the dependencies
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [2]:
# Environment variables pointing at the Java and Spark installations
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"

# Make pyspark importable.
# Pass the same absolute path stored in SPARK_HOME so the Spark location
# does not depend on the current working directory of the kernel.
import findspark
findspark.init(os.environ["SPARK_HOME"])

In [3]:
# Libraries and Context Setup
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

# Spark configuration: serve the Spark UI on port 4050
conf = SparkConf().set("spark.ui.port", "4050")

# Create the context, or reuse the one that is already running.
# getOrCreate() keeps this cell re-runnable: calling the SparkContext
# constructor a second time in the same kernel raises a ValueError.
sc = SparkContext.getOrCreate(conf)

# Create (or reuse) the session
spark = SparkSession.builder.getOrCreate()

In [4]:
# Sanity check: print the SparkContext to confirm it was created
# (the printed text shows its master and appName)
print(sc)

<SparkContext master=local[*] appName=pyspark-shell>


# Transformation

In [5]:
# A plain Python list of integers, used below as the source of an RDD
lst1 = [124, 940, 652, 102, 397]

In [6]:
# Confirm that lst1 is a plain Python list (not yet a Spark object)
type(lst1)

list

In [9]:
# Turn the local Python list into an RDD via the SparkContext
lst1_rdd = sc.parallelize(lst1)

In [11]:
# Confirm that parallelize() returned an RDD
type(lst1_rdd)

pyspark.rdd.RDD

In [13]:
# Action: collect() brings the RDD's elements back as a Python list
lst1_rdd.collect()

[124, 940, 652, 102, 397]

In [14]:
# Action: count the number of elements in the RDD
lst1_rdd.count()

5

In [15]:
# Load an RDD from a text file; each line of the file becomes one element
# NOTE(review): the path is under /content/drive, but no cell shown here
# mounts Google Drive - confirm the drive is mounted before running this cell
file = '/content/drive/My Drive/Colab Notebooks/08-apache-spark/data/carros.csv'
auto_data_rdd = sc.textFile(file)

In [16]:
# Confirm that textFile() returned an RDD
type(auto_data_rdd)

pyspark.rdd.RDD

In [17]:
# Action: first() returns the first element of the RDD,
# which for this file is the CSV header line
auto_data_rdd.first()

'MAKE,FUELTYPE,ASPIRE,DOORS,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE'

In [18]:
# Action: take(5) returns the first 5 elements,
# i.e. the header line plus the first 4 data rows
auto_data_rdd.take(5)

['MAKE,FUELTYPE,ASPIRE,DOORS,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE',
 'subaru,gas,std,two,hatchback,fwd,four,69,4900,31,36,5118',
 'chevrolet,gas,std,two,hatchback,fwd,three,48,5100,47,53,5151',
 'mazda,gas,std,two,hatchback,fwd,four,68,5000,30,31,5195',
 'toyota,gas,std,two,hatchback,fwd,four,62,4800,35,39,5348']

In [19]:
# Without caching, the RDD is computed again for every action.
# cache() persists the data so that later actions can reuse it
# instead of triggering a new computation.
auto_data_rdd.cache()

/content/drive/My Drive/Colab Notebooks/08-apache-spark/data/carros.csv MapPartitionsRDD[4] at textFile at NativeMethodAccessorImpl.java:0

In [50]:
# List all rows
# collect() returns every element of the RDD, so the whole file is printed
for row in auto_data_rdd.collect():
  print(row)

MAKE,FUELTYPE,ASPIRE,DOORS,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE
subaru,gas,std,two,hatchback,fwd,four,69,4900,31,36,5118
chevrolet,gas,std,two,hatchback,fwd,three,48,5100,47,53,5151
mazda,gas,std,two,hatchback,fwd,four,68,5000,30,31,5195
toyota,gas,std,two,hatchback,fwd,four,62,4800,35,39,5348
mitsubishi,gas,std,two,hatchback,fwd,four,68,5500,37,41,5389
honda,gas,std,two,hatchback,fwd,four,60,5500,38,42,5399
nissan,gas,std,two,sedan,fwd,four,69,5200,31,37,5499
dodge,gas,std,two,hatchback,fwd,four,68,5500,37,41,5572
plymouth,gas,std,two,hatchback,fwd,four,68,5500,37,41,5572
mazda,gas,std,two,hatchback,fwd,four,68,5000,31,38,6095
mitsubishi,gas,std,two,hatchback,fwd,four,68,5500,31,38,6189
dodge,gas,std,four,hatchback,fwd,four,68,5500,31,38,6229
plymouth,gas,std,four,hatchback,fwd,four,68,5500,31,38,6229
chevrolet,gas,std,two,hatchback,fwd,four,70,5400,38,43,6295
toyota,gas,std,two,hatchback,fwd,four,62,4800,31,38,6338
dodge,gas,std,two,hatchback,fwd,four,68,5500,31,38,6377

In [52]:
# Transformation (lazily evaluated): map() builds a new RDD in which
# every comma of each line is replaced by a tab
tsv_data = auto_data_rdd.map(lambda line: line.replace(',', '\t'))
# Action: take(5) returns the first 5 converted lines
tsv_data.take(5)

['MAKE\tFUELTYPE\tASPIRE\tDOORS\tBODY\tDRIVE\tCYLINDERS\tHP\tRPM\tMPG-CITY\tMPG-HWY\tPRICE',
 'subaru\tgas\tstd\ttwo\thatchback\tfwd\tfour\t69\t4900\t31\t36\t5118',
 'chevrolet\tgas\tstd\ttwo\thatchback\tfwd\tthree\t48\t5100\t47\t53\t5151',
 'mazda\tgas\tstd\ttwo\thatchback\tfwd\tfour\t68\t5000\t30\t31\t5195',
 'toyota\tgas\tstd\ttwo\thatchback\tfwd\tfour\t62\t4800\t35\t39\t5348']

In [53]:
# Transformation: keep only the lines that contain the substring 'toyota'
# (the match is against the whole line, not a specific column)
toyota_data = auto_data_rdd.filter(lambda line: 'toyota' in line)

In [54]:
# Action: count() returns the number of rows that passed the filter
toyota_data.count()

32

In [55]:
# Inspect the result: show up to the first 20 matching rows
toyota_data.take(20)

['toyota,gas,std,two,hatchback,fwd,four,62,4800,35,39,5348',
 'toyota,gas,std,two,hatchback,fwd,four,62,4800,31,38,6338',
 'toyota,gas,std,four,hatchback,fwd,four,62,4800,31,38,6488',
 'toyota,gas,std,four,wagon,fwd,four,62,4800,31,37,6918',
 'toyota,gas,std,four,sedan,fwd,four,70,4800,30,37,6938',
 'toyota,gas,std,four,hatchback,fwd,four,70,4800,30,37,7198',
 'toyota,gas,std,four,sedan,fwd,four,70,4800,38,47,7738',
 'toyota,diesel,std,four,hatchback,fwd,four,56,4500,38,47,7788',
 'toyota,gas,std,four,wagon,4wd,four,62,4800,27,32,7898',
 'toyota,diesel,std,four,sedan,fwd,four,56,4500,34,36,7898',
 'toyota,gas,std,two,sedan,rwd,four,70,4800,29,34,8058',
 'toyota,gas,std,two,hatchback,rwd,four,70,4800,29,34,8238',
 'toyota,gas,std,four,hatchback,fwd,four,70,4800,28,34,8358',
 'toyota,gas,std,two,hardtop,rwd,four,116,4800,24,30,8449',
 'toyota,gas,std,four,wagon,4wd,four,62,4800,27,32,8778',
 'toyota,gas,std,four,sedan,fwd,four,92,4200,29,34,8948',
 'toyota,gas,std,four,sedan,fwd,four,70,

In [56]:
# Write results
# Save the filtered Toyota rows (toyota_data) so the content matches the
# output file name; the CSV header is not included because the filter dropped it.
# The with-block closes the file even if collect() or write() raises.
with open('/content/drive/My Drive/Colab Notebooks/08-apache-spark/data/toyota_data.csv','w') as save_rdd:
  save_rdd.write('\n'.join(toyota_data.collect()))