<a href="https://colab.research.google.com/github/pstorniolo/Master2021/blob/main/How_to_Install_Spark_3_2_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install Java 11
!apt-get -q install openjdk-11-jdk-headless

Reading package lists...
Building dependency tree...
Reading state information...
openjdk-11-jdk-headless is already the newest version (11.0.11+9-0ubuntu2~18.04).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.


In [2]:
#Install spark (change the version number if needed)
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz

In [3]:
#Unzip the spark file to the current folder
!tar xf spark-3.2.0-bin-hadoop3.2.tgz

In [4]:
!rm -f spark-3.2.0-bin-hadoop3.2.tgz ; ls -la

total 20
drwxr-xr-x  1 root root 4096 Oct 24 07:18 .
drwxr-xr-x  1 root root 4096 Oct 24 07:13 ..
drwxr-xr-x  4 root root 4096 Oct  8 13:44 .config
drwxr-xr-x  1 root root 4096 Oct  8 13:45 sample_data
drwxr-xr-x 13 1000 1000 4096 Oct  6 13:18 spark-3.2.0-bin-hadoop3.2


In [5]:
!ls -la sample_data/

total 55512
drwxr-xr-x 1 root root     4096 Oct  8 13:45 .
drwxr-xr-x 1 root root     4096 Oct 24 07:18 ..
-rwxr-xr-x 1 root root     1697 Jan  1  2000 anscombe.json
-rw-r--r-- 1 root root   301141 Oct  8 13:45 california_housing_test.csv
-rw-r--r-- 1 root root  1706430 Oct  8 13:45 california_housing_train.csv
-rw-r--r-- 1 root root 18289443 Oct  8 13:45 mnist_test.csv
-rw-r--r-- 1 root root 36523880 Oct  8 13:45 mnist_train_small.csv
-rwxr-xr-x 1 root root      930 Jan  1  2000 README.md


In [6]:
# Export the Java and Spark install locations as environment variables
# (JAVA_HOME and SPARK_HOME) for this process and anything it launches.
import os

os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.2.0-bin-hadoop3.2",
    }
)

In [7]:
#Install findspark using pip to make pyspark importable as regular library
!pip -q install findspark
import findspark
findspark.init()

#Spark for Python (pyspark)
#!pip -q install pyspark==3.2.0

In [8]:
#importing pyspark
import pyspark

#importing sparksession
from pyspark.sql import SparkSession

In [9]:
# Build a SparkSession with application name "local"; getOrCreate() hands
# back an already-running session if there is one instead of starting another.
spark = (
    SparkSession.builder
    .appName("local")
    .getOrCreate()
)

In [10]:
# Switch on the Arrow-related PySpark SQL options (both are set to "true")
for arrow_option in (
    "spark.sql.execution.arrow.pyspark.enabled",
    "spark.sql.execution.arrow.pyspark.selfDestruct.enabled",
):
    spark.conf.set(arrow_option, "true")

# Report which Spark version the session is running
print("Apache Spark version: ", spark.version)

Apache Spark version:  3.2.0


---

In [11]:
# Create a list of 5,000,000 random integers between 10 and 1000 (both inclusive).
from random import randint, seed

# Fixed seed: Restart & Run All now regenerates the same list, so the filter
# counts printed by the cells below are reproducible from run to run.
SEED = 42
seed(SEED)
my_large_list = [randint(10, 1000) for _ in range(5000000)]

In [12]:
# Distribute the list as an RDD held in a single partition
unfiltered_single = spark.sparkContext.parallelize(my_large_list, numSlices=1)

# Confirm how many partitions the RDD has
print(unfiltered_single.getNumPartitions())

# Keep only the numbers greater than or equal to 200
my_large_list_one_partition = unfiltered_single.filter(lambda number: 200 <= number)

# Count the elements that passed the filter
print(my_large_list_one_partition.count())

1
4040494


In [13]:
# Distribute the same list as an RDD split into 10 partitions
unfiltered_partitioned = spark.sparkContext.parallelize(my_large_list, numSlices=10)

# Confirm how many partitions the RDD has
print(unfiltered_partitioned.getNumPartitions())

# Keep only the numbers greater than or equal to 200
my_large_list_with_partition = unfiltered_partitioned.filter(lambda number: 200 <= number)

# Count the elements that passed the filter
print(my_large_list_with_partition.count())

10
4040494


In [14]:
# Sample data: the integers 1 through 9,999,999
my_list = list(range(1, 10000000))

# Distribute the list as an RDD, asking for 3 slices
rdd_0 = spark.sparkContext.parallelize(my_list, numSlices=3)

# Print the RDD object itself, then show its first 10 elements
print(rdd_0)
rdd_0.take(10)

ParallelCollectionRDD[4] at readRDDFromFile at PythonRDD.scala:274


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [15]:
# Derive a new RDD in which every number is increased by 4
rdd_1 = rdd_0.map(lambda number: number + 4)

# Print the RDD object itself, then show its first 10 elements
print(rdd_1)
rdd_1.take(10)

PythonRDD[6] at RDD at PythonRDD.scala:53


[5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

In [16]:
# Derive a further RDD in which every number is increased by another 20
rdd_2 = rdd_1.map(lambda number: number + 20)

# Print the RDD object itself, then show its first 10 elements
print(rdd_2)
rdd_2.take(10)

PythonRDD[8] at RDD at PythonRDD.scala:53


[25, 26, 27, 28, 29, 30, 31, 32, 33, 34]