In [None]:
import pyspark
from pyspark.sql import SparkSession

# Entry point for all Spark functionality in this notebook.
# getOrCreate() reuses an already-running session if there is one,
# otherwise it starts a new session named "MySparkApp".
spark = (
    SparkSession.builder
    .appName("MySparkApp")
    .getOrCreate()
)

In [None]:
# One ten-row DataFrame (ids 0-9) viewed two ways:
# explain() prints the query plan, show() prints the rows.
ten_rows = spark.range(10)
ten_rows.explain()
ten_rows.show()

In [None]:
# Load the trip data CSV.
#   header=True      -> the first line supplies the column names
#   inferSchema=True -> Spark detects column types itself; without it every
#                       column is read as a string
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("201508_trip_data.csv")
)

df.show(5)

In [None]:
# Report a few session settings (one value per line), then the Spark version.
for conf_key in (
    "spark.app.name",
    "spark.sql.shuffle.partitions",
    "spark.sql.files.maxPartitionBytes",
):
    print(spark.conf.get(conf_key))

print(spark.version)

In [None]:
# Build a single-column DataFrame holding the integers 0..999.
# spark.range names its column "id" by default; toDF renames it to "number".
myRange = (
    spark.range(0, 1000)
    .toDF("number")
)
myRange.show(5)

# Data structures are immutable

This is an example of a simple transformation.
There are two types of transformations:

- narrow -> each input partition contributes to only one output partition
- wide -> input partitions contribute to many output partitions


In [None]:
# Keep only the even numbers. filter() is the same operation as where();
# this is a narrow transformation and is evaluated lazily.
divisBy2 = myRange.filter("number % 2 = 0")

divisBy2.show(5)   # action: print the first 5 matching rows
divisBy2.count()   # action: row count, displayed as the cell output


# Actions

There are 3 types of actions

- to view data in console
- to collect data to native objects in their respective language
- to write to output data sources


# Spark UI

Monitor the progress of the job at:
http://localhost:4040

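Or run `spark.sparkContext.uiWebUrl` to get the actual URL of the running session (the port can differ from 4040 if that port is already in use).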


In [None]:
# Display the Spark UI address for this session (the value is shown as the cell output).
spark.sparkContext.uiWebUrl

In [None]:
# Stop the Spark session once the notebook is finished; this is good practice
# for freeing resources, especially on low-RAM machines.
# NOTE(review): after this runs, presumably the session-creation cell must be
# re-run before any cell that uses `spark` will work again.
spark.stop()