# Mounting Drive in Google Colab

In [None]:
from google.colab import drive

# Mount Google Drive at a named location so later cells can read files from it.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Installing Apache Spark

In [None]:
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# List the working directory to check that the Spark archive was downloaded and unpacked.
!ls

drive				 spark-3.2.1-bin-hadoop3.2.tgz.2
sample_data			 spark-3.2.1-bin-hadoop3.2.tgz.3
spark-3.2.1-bin-hadoop3.2	 spark-3.2.1-bin-hadoop3.2.tgz.4
spark-3.2.1-bin-hadoop3.2.tgz	 spark-3.2.1-bin-hadoop3.2.tgz.5
spark-3.2.1-bin-hadoop3.2.tgz.1


In [None]:
import os

# Point the environment at the installed JDK and the unpacked Spark distribution.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.2.1-bin-hadoop3.2",
    }
)

import findspark

findspark.init()
# Last expression of the cell: shows the Spark home that findspark located.
findspark.find()

'/content/spark-3.2.1-bin-hadoop3.2'

# Creating an Apache Spark session

In [None]:
from pyspark.sql import SparkSession

# Build a session against a local master, or reuse one that already exists.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Learning_Spark")
    .getOrCreate()
)

In [None]:
# Bare expression: the notebook displays the SparkSession object created earlier.
spark

In [None]:
# Print the current working directory of the notebook's shell.
!pwd

/content


# Creating my first RDDs

In [None]:
# Folder on the mounted Drive that holds the walkthrough input files.
file_path = "/content/drive/MyDrive/Seminar_MUIA/walkthrough_examples/"

# The SparkContext is the entry point for the RDD API.
sc = spark.sparkContext

# Load the text file as an RDD; the record count is the cell's displayed result.
input_rdds = sc.textFile(f"{file_path}inputfile.txt")
input_rdds.count()

5

# WordCount

In [None]:
# Split every line into words, pair each word with 1, then sum the 1s per word.
# Reads `input_rdds`, the RDD built in the "Creating my first RDDs" cell; the
# previous spelling `input_rdd` was never defined and raised NameError on a
# fresh kernel.
wordCounts = (
    input_rdds
    .flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)

# Bring the (word, count) pairs back to the driver and print one per line.
output = wordCounts.collect()
for word, count in output:
    print("%s: %i" % (word, count))

this: 1
is: 1
flatmap: 1
learning: 1
Enjoy: 1
part: 1
for: 1
beginners: 1
to: 1
learn: 1
spark: 1
and: 2
the: 1
difference: 1
between: 1
map: 1
happy: 1


# Spark Pi

In [10]:
from random import random
from operator import add
# Number of partitions the sample range is split into when parallelized.
partitions = 10
# Total number of random samples: 100,000 per partition.
n = 100000 * partitions

def f(_: int) -> int:
    """Draw one random point in the square [-1, 1) x [-1, 1) and test it.

    The argument is ignored; it only exists so the function can be mapped
    over a sequence of sample indices.

    Returns:
        1 if the point lies inside or on the unit circle, otherwise 0.
    """
    # random() is in [0, 1); scale and shift it to [-1, 1).
    x = random() * 2 - 1
    y = random() * 2 - 1
    return 1 if x ** 2 + y ** 2 <= 1 else 0

# Distribute the sample indices, score each one with f, and add up the hits.
samples = sc.parallelize(range(1, n + 1), partitions)
count = samples.map(f).reduce(add)

# hits / samples approximates pi / 4, hence the factor of 4.
print("Pi is roughly %f" % (4.0 * count / n))

    

Pi is roughly 3.134880


# Lab 1