# **Prática 01: Configuração do Ambiente**

In [1]:
# Install the headless OpenJDK 8 package with apt-get.
# -qq keeps apt-get quiet and stdout is redirected to /dev/null.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [2]:
# Download the Spark 3.2.2 binary tarball (Hadoop 3.2 build) from the Apache
# archive into the current directory; -q silences wget's progress output.
!wget -q https://archive.apache.org/dist/spark/spark-3.2.2/spark-3.2.2-bin-hadoop3.2.tgz

In [3]:
# Extract the downloaded Spark tarball into the current directory.
!tar xf spark-3.2.2-bin-hadoop3.2.tgz

In [4]:
!pip install -q findspark

In [5]:
!pip install -q pyspark

In [6]:
import os

# Tell Spark where to find Java and the Spark distribution.
# JAVA_HOME is the OpenJDK 8 directory; SPARK_HOME is the extracted Spark
# 3.2.2 directory.
# NOTE(review): the SPARK_HOME path assumes the tarball was extracted under
# /content (e.g. Google Colab's working directory) - confirm elsewhere.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.2.2-bin-hadoop3.2",
    }
)

# **Prática 02: Começando a trabalhar com o Spark**

**Testar se o pacote findspark foi instalado corretamente**

**Importação de dados**

In [7]:
# Importing findspark doubles as a check that the package was installed.
import findspark
# Initialise findspark so that pyspark can be imported in later cells.
# NOTE(review): presumably this relies on the SPARK_HOME variable set in the
# environment-configuration cell - confirm against the findspark docs.
findspark.init()

In [8]:
from pyspark.sql import SparkSession

# Build (or reuse) a SparkSession whose master is "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [9]:
# Read the California housing test CSV into a Spark DataFrame.
# header=True takes column names from the first row; inferSchema=True lets
# Spark infer each column's type from the data.
dataset = spark.read.csv(
    '/content/sample_data/california_housing_test.csv',
    header=True,
    inferSchema=True,
)

In [10]:
# Print the DataFrame's column names, types and nullability as a tree.
dataset.printSchema()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)



In [11]:
# Display the first row of the DataFrame (returned as a single Row object).
dataset.head()

Row(longitude=-122.05, latitude=37.37, housing_median_age=27.0, total_rooms=3885.0, total_bedrooms=661.0, population=1537.0, households=606.0, median_income=6.6085, median_house_value=344700.0)

In [12]:
# Stop the SparkSession now that this practice is finished; later sections
# create their own SparkContext.
spark.stop()

# **Prática 01: Práticas com MapReduce**

**Exemplo 01**

In [13]:
import numpy as np

In [14]:
from pyspark import SparkContext

# Obtain a SparkContext for the RDD examples below.
# getOrCreate() returns the already-active context when there is one and
# creates a new one otherwise, so re-running this cell does not fail with
# "Cannot run multiple SparkContexts at once" the way a bare SparkContext()
# call would.
spark_contexto = SparkContext.getOrCreate()

In [15]:
# Input vector for the map example: the integers 10, 20, 30, 40, 50.
vetor = np.arange(10, 51, 10)

In [16]:
# Distribute the NumPy vector as an RDD so it can be processed in parallel.
paralelo = spark_contexto.parallelize(vetor)

In [17]:
# Map step: transform every element v into v**2 + v.
# map() is lazy - nothing is computed until an action such as collect().
mapa = paralelo.map(lambda valor: valor**2 + valor)

In [18]:
# Action: run the map and bring all results back to the driver as a list.
mapa.collect()

[110, 420, 930, 1640, 2550]

**Exemplo 02**

In [19]:
# Build an RDD of words (with repetitions) for the word-count example.
paralelo = spark_contexto.parallelize(
    [
        "distribuida",
        "distribuida",
        "spark",
        "rdd",
        "spark",
        "spark",
    ]
)

In [20]:
# Pair each input with the count 1, producing an (item, 1) tuple.
funcao_lambda = lambda x: (x, 1)

In [21]:
from operator import add

# Word count: turn every word into a (word, 1) pair, add up the values of
# pairs that share the same key, then collect the result on the driver.
mapa = (
    paralelo
    .map(funcao_lambda)
    .reduceByKey(add)
    .collect()
)

In [22]:
# Print one "word: count" line for every (word, count) pair collected above.
for palavra, contagem in mapa:
    print(f"{palavra}: {contagem}")

distribuida: 2
spark: 3
rdd: 1


In [23]:
# Stop the SparkContext used by the examples above; the next section obtains
# its own context.
spark_contexto.stop()

# **Prática 02: Práticas com MapReduce**

In [24]:
from pyspark import SparkContext

# Obtain a SparkContext for this practice.
# getOrCreate() reuses the active context if one exists instead of raising
# "Cannot run multiple SparkContexts at once", which makes the cell safe to
# re-run; when no context is active it creates a fresh one.
spark_contexto = SparkContext.getOrCreate()

In [25]:
# Sample data: six integers, with the value 3 appearing twice.
lista = [1, 2, 3, 4, 5, 3]
# Distribute the Python list as an RDD.
lista_rdd = spark_contexto.parallelize(lista)

In [26]:
# Action: number of elements in the RDD (duplicates are counted).
lista_rdd.count()

6

In [27]:
# Build the ordered pair (n, n * 10) for a given number n.
par_ordenado = lambda numero: (numero, numero * 10)

In [28]:
# flatMap applies the function to every element and flattens the returned
# tuples, so the collected result is one flat list of numbers rather than a
# list of pairs.
lista_rdd.flatMap(par_ordenado).collect()

[1, 10, 2, 20, 3, 30, 4, 40, 5, 50, 3, 30]

In [29]:
# map keeps exactly one output per input element, so each (n, n * 10) tuple
# is preserved as a pair in the collected list.
lista_rdd.map(par_ordenado).collect()

[(1, 10), (2, 20), (3, 30), (4, 40), (5, 50), (3, 30)]

In [30]:
# Stop the SparkContext and release its resources at the end of the practice.
spark_contexto.stop()