In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!tar xf spark-3.1.1-bin-hadoop2.7.tgz

In [3]:
import os
# Export the Java and Spark install locations in a single step.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop2.7",
    }
)

In [4]:
!pip install -q findspark
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 64kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 45.2MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=94b38fdf25f7796e6b8a739a4b067dc16fa41253ca1da491dc6d0b158458e4d9
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


In [5]:
from google.colab import drive
# Mount Google Drive at /content/gdrive. Later cells use paths that start with
# "./gdrive/My Drive/...", so they rely on the working directory being /content.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [6]:
import shutil

from pyspark import SparkConf, SparkContext

In [7]:
# Name given to the Spark application when the context is created.
APP_NAME = "IntroToSpark"
# Master URL handed to Spark; "local[*]" is Spark's local-mode master string.
SPARK_URL = "local[*]"
# Reuse the Spark install location exported to the environment in an earlier cell.
SPARK_HOME = os.environ["SPARK_HOME"]

In [8]:
# Collect the same three settings (app name, master URL, Spark home) in a
# SparkConf so they can be handed to getOrCreate().
spark_conf = (
    SparkConf()
    .setAppName(APP_NAME)
    .setMaster(SPARK_URL)
    .setSparkHome(SPARK_HOME)
)
# Constructing SparkContext(...) a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once". getOrCreate() returns the
# running context if there is one, so this cell can be re-run safely.
sc = SparkContext.getOrCreate(spark_conf)

In [9]:
# Display the SparkContext created above to confirm it exists.
sc

In [10]:
def multiply_by_2(x):
  """Lazily yield every element of the iterable ``x`` multiplied by 2.

  Because this is a generator function, calling it does no work up front;
  values are produced one at a time as the result is iterated.
  """
  yield from (value * 2 for value in x)

In [12]:
# Sample input reused by the next few cells.
l = [1,2,3,4]
# Calling a generator function only builds the generator object (see the
# output below); no element is doubled until the generator is iterated.
multiply_by_2(l)

<generator object multiply_by_2 at 0x7f63414aa8d0>

In [13]:
# Wrapping the generator in list() iterates it fully and shows every doubled value.
list(multiply_by_2(l))

[2, 4, 6, 8]

In [14]:
# A generator can also be consumed directly by a for loop, one value per iteration.
for i in multiply_by_2(l):
  print(i)

2
4
6
8


In [15]:
# Turn a small Python list into an RDD; the following cells run RDD operations on it.
sample_list = sc.parallelize([1,2,3,4])

In [16]:
# Displaying the RDD shows only its description, not its elements (see output below).
sample_list

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274

In [17]:
# collect() returns the RDD's elements as a regular Python list.
sample_list.collect()

[1, 2, 3, 4]

In [18]:
# map() on its own just returns another RDD; the output below is that RDD's
# description rather than the doubled values.
sample_list.map(lambda value: value * 2)

PythonRDD[1] at RDD at PythonRDD.scala:53

In [19]:
# Adding collect() to the same map() returns the doubled values as a list.
sample_list.map(lambda value: value * 2).collect()

[2, 4, 6, 8]

In [20]:
def c_to_f(number):
  """Convert a temperature from degrees Celsius to degrees Fahrenheit."""
  scaled = number * 9 / 5
  return scaled + 32

In [21]:
# A named function can be passed to map() just like a lambda.
sample_list.map(c_to_f).collect()

[33.8, 35.6, 37.4, 39.2]

In [None]:
# Keep only the even elements, then bring them back as a list.
sample_list.filter(lambda number: number % 2 == 0).collect()

[2, 4]

In [None]:
# take(2) returns the first two elements as a list.
sample_list.take(2)

[1, 2]

In [None]:
# Number of elements in the RDD.
sample_list.count()

4

In [None]:
# Sum of all elements in the RDD.
sample_list.sum()

10

In [None]:
# Largest element in the RDD.
sample_list.max()

4

In [None]:
# reduce() folds the elements pairwise with the given function; with addition
# it gives the same total as sum().
sample_list.reduce(lambda left, right: left + right)

10

In [None]:
# saveAsTextFile() fails when the target directory already exists, so remove
# the output of any previous run first; this keeps the cell re-runnable.
# ignore_errors=True makes the cleanup a no-op on the very first run.
list_output_dir = "./gdrive/My Drive/Drexel Big Data/Data/list"
shutil.rmtree(list_output_dir, ignore_errors=True)
sample_list.saveAsTextFile(list_output_dir)

In [None]:
# Load a text file from the mounted Drive folder as an RDD of lines (the take(1)
# cell below shows one line per element). The path is relative, so it depends on
# the current working directory.
war_peace = sc.textFile("./gdrive/My Drive/Drexel Big Data/Data/2600-0.txt")

In [None]:
# Peek at the first line to confirm the file loaded.
war_peace.take(1)

['BOOK ONE: 1805']

In [None]:
# saveAsTextFile() fails when the target directory already exists, so remove
# the output of any previous run first; this keeps the cell re-runnable.
war_peace_output_dir = './gdrive/My Drive/Drexel Big Data/Data/war_peace'
shutil.rmtree(war_peace_output_dir, ignore_errors=True)
# repartition(1) collapses the RDD to a single partition before writing.
war_peace.repartition(1).saveAsTextFile(war_peace_output_dir)

In [None]:
# toDebugString() describes the RDD and its lineage; as the output below shows,
# PySpark returns it as bytes.
sample_list.toDebugString()

b'(2) ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274 []'

In [None]:
# After a filter() the debug string lists the derived RDD first and the
# original parallelized collection beneath it.
sample_list.filter(lambda number: number % 2 == 0).toDebugString()

b'(2) PythonRDD[18] at RDD at PythonRDD.scala:53 []\n |  ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274 []'

In [None]:
# Words of a short phrase, with repeats, used as input for the word count below.
text = sc.parallelize(
    [
        'the', 'wheels', 'on', 'the', 'bus',
        'go', 'round', 'and', 'round',
        'all', 'through', 'the', 'town',
    ]
)

In [None]:
# Word count in two steps: pair every word with a 1 ...
pairs = text.map(lambda word: (word, 1))
# ... then add up the 1s that share the same word.
counts = pairs.reduceByKey(lambda running_total, increment: running_total + increment)

In [None]:
# Bring the (word, count) pairs back as a list; as the output shows, they are
# not in any sorted order.
counts.collect()

[('bus', 1),
 ('go', 1),
 ('round', 2),
 ('the', 3),
 ('wheels', 1),
 ('on', 1),
 ('and', 1),
 ('all', 1),
 ('through', 1),
 ('town', 1)]

In [None]:
# Turn each number into a (number, 1) key/value pair.
list_kv = sample_list.map(lambda key: (key, 1))

In [None]:
# groupByKey() gathers the values for each key. The grouped values come back as
# ResultIterable objects (see output), so their contents are not shown here;
# adding .mapValues(list) before collect() should display them.
list_kv.groupByKey().collect()

[(2, <pyspark.resultiterable.ResultIterable at 0x7fde5a6dac10>),
 (4, <pyspark.resultiterable.ResultIterable at 0x7fde5a6da610>),
 (1, <pyspark.resultiterable.ResultIterable at 0x7fde5a6da710>),
 (3, <pyspark.resultiterable.ResultIterable at 0x7fde5a6da7d0>)]

In [None]:
# Sort the (word, count) pairs by their key, i.e. alphabetically by word.
counts.sortByKey().collect()

[('all', 1),
 ('and', 1),
 ('bus', 1),
 ('go', 1),
 ('on', 1),
 ('round', 2),
 ('the', 3),
 ('through', 1),
 ('town', 1),
 ('wheels', 1)]

In [None]:
# Wrap a small list in a broadcast variable.
broadcastVar = sc.broadcast([23, 37, 59])

In [None]:
# Displaying the Broadcast object shows only the wrapper, not the list it holds.
broadcastVar

<pyspark.broadcast.Broadcast at 0x7fde5a64b4d0>

In [None]:
# Create an accumulator that starts at 0.
accum = sc.accumulator(0)

In [None]:
# Show the accumulator before anything has been added to it.
accum

Accumulator<id=0, value=0>

In [None]:
# Add every element of a fresh RDD into the accumulator. Re-running this cell
# adds the same values again, so the total keeps growing.
sc.parallelize([1, 2, 3, 4]).foreach(lambda element: accum.add(element))

In [None]:
# Show the accumulator after the foreach() above has added 1 + 2 + 3 + 4.
accum

Accumulator<id=0, value=10>