# Big Data Fundamentals with PySpark

## Chapter 1: Introduction to Big Data analysis with Spark

In [21]:
import pyspark
from pyspark.context import SparkContext

In [22]:
# Obtain a SparkSession through the builder API, using a local master
# with one worker thread ("local[1]") and a fixed application name
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)
# Show the session's underlying SparkContext and its application name
print(spark.sparkContext)
print("Spark App Name : " + spark.sparkContext.appName)

<SparkContext master=local[1] appName=SparkByExamples.com>
Spark App Name : SparkByExamples.com


In [23]:
# Keep a short handle to the session's SparkContext; the cells below use it as `sc`
sc = spark.sparkContext

In [24]:
# Report the Spark version the context is running
print(f"The version of Spark Context in the PySpark shell is {sc.version}")

# Report the Python version the context was started with
print(f"The Python version of Spark Context in the PySpark shell is {sc.pythonVer}")

# Report the master URL the context is connected to
print(f"The master of Spark Context in the PySpark shell is {sc.master}")

The version of Spark Context in the PySpark shell is 3.5.0
The Python version of Spark Context in the PySpark shell is 3.10
The master of Spark Context in the PySpark shell is local[1]


In [25]:
# Create the sequence of numbers from 1 to 100
# (range() excludes its stop value, so the stop must be 101 for 100 to be included)
numb = range(1, 101)

# Load the numbers into PySpark through the SparkContext
spark_data = sc.parallelize(numb)
# Printing the result shows an RDD descriptor (see the output below), not the elements themselves
print(spark_data)

PythonRDD[14] at RDD at PythonRDD.scala:53


In [26]:
# Load a local file into PySpark shell
# NOTE(review): the path is relative — presumably resolved against the notebook's
# working directory; confirm README.md exists there before running
file_path = "README.md"
# textFile hands back an RDD for the file (printed below as a MapPartitionsRDD)
lines = sc.textFile(file_path)
print(lines)

README.md MapPartitionsRDD[16] at textFile at NativeMethodAccessorImpl.java:0


In [27]:
# Input values: the integers 1 through 10
my_list = [*range(1, 11)]

# Show the input values
print("Input list is", my_list)

# Apply a squaring lambda to every element with map() and collect the results
squared_list_lambda = [*map(lambda n: n ** 2, my_list)]

# Show the squares produced by map()
print("The squared numbers are", squared_list_lambda)

Input list is [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
The squared numbers are [1, 4, 9, 16, 25, 36, 49, 64, 81, 100]


In [28]:
my_list2 = [10, 21, 31, 40, 51, 60, 72, 80, 93, 101]

# Show the input values
print("Input list is:", my_list2)

# Keep only the values that leave no remainder when divided by 10,
# using filter() with a lambda predicate
filtered_list = [*filter(lambda n: not n % 10, my_list2)]

# Show the values that passed the filter
print("Numbers divisible by 10 are:", filtered_list)

Input list is: [10, 21, 31, 40, 51, 60, 72, 80, 93, 101]
Numbers divisible by 10 are: [10, 40, 60, 80]
