<a href="https://colab.research.google.com/github/ralsouza/apache_spark_real_time_analytics/blob/master/notebooks/01_pyspark_introduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Install

In [11]:
# instalar as dependências
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [12]:
# Configure the environment variables Spark needs
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"

# Make pyspark importable. Pass the absolute SPARK_HOME set above instead of a
# relative directory name, so the cell does not depend on the current working
# directory and the two settings cannot drift apart.
import findspark
findspark.init(os.environ["SPARK_HOME"])

In [None]:
# Start a local session and import the Airbnb data
# from pyspark.sql import SparkSession
# sc = SparkSession.builder.master('local[*]').getOrCreate()

# Download over HTTP to a local file
# !wget --quiet --show-progress http://data.insideairbnb.com/brazil/rj/rio-de-janeiro/2019-07-15/visualisations/listings.csv

# Load the Airbnb data
# df_spark = sc.read.csv("./listings.csv", inferSchema=True, header=True)

# Show the inferred data type of each column
# df_spark.printSchema()

# 2. PySpark Introduction

In [19]:
from pyspark import SparkConf, SparkContext

# Create the SparkContext for a local master using all available cores.
# getOrCreate() returns the already-running context when this cell is executed
# again, instead of raising "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local[*]").setAppName("My First App")
)


In [18]:
# Stopping the context (uncomment to run)
# sc.stop()

In [20]:
# Print the version of the Python interpreter running this kernel
import sys
print(sys.version)

3.6.9 (default, Apr 18 2020, 01:56:04) 
[GCC 8.4.0]


In [21]:
# Print the active SparkContext (its repr shows the master and the app name)
print(sc)

<SparkContext master=local[*] appName=My First App>


In [22]:
# Print the Spark version reported by the context
print(sc.version)

2.4.4


In [23]:
# Smoke-test Spark by building an RDD.
# A plain Python list cannot be used on a Spark cluster directly; it has to be
# converted into an RDD first (here split into 10 slices).
lst = [25, 90, 81, 37, 776, 3320]
test_data = sc.parallelize(lst, numSlices=10)

In [24]:
# What does sc.parallelize do?
# help() is plain Python (unlike IPython's `?sc.parallelize`) and prints the
# documentation into the cell output.
help(sc.parallelize)

# Reference copy of the IPython `?sc.parallelize` output:
# Signature: sc.parallelize(c, numSlices=None)
# Docstring:
# Distribute a local Python collection to form an RDD (Resilient Distributed
# Dataset).
# Using xrange
# is recommended if the input represents a range for performance.

# >>> sc.parallelize([0, 2, 3, 4, 6], 5).glom().collect()
# [[0], [2], [3], [4], [6]]
# >>> sc.parallelize(xrange(0, 6, 2), 5).glom().collect()
# [[], [0], [], [2], [4]]
# File:      /content/spark-2.4.4-bin-hadoop2.7/python/pyspark/context.py
# Type:      method

In [25]:
# Check the type of the object returned by sc.parallelize
type(test_data)

pyspark.rdd.RDD

In [26]:
# Count the number of elements in the RDD
test_data.count()

6

In [27]:
# Return all elements of the RDD as a Python list
test_data.collect()

[25, 90, 81, 37, 776, 3320]