In [1]:
import os
# Record the kernel's current working directory and display it as the cell result.
# NOTE(review): the relative paths used later in this notebook (e.g. "./sample.txt")
# are presumably resolved against this directory - confirm for non-local Spark setups.
cwd = os.getcwd()
cwd

'/home/jovyan'

In [7]:
# 1. Compute value of PI using Monte Carlo Approach
# Source: http://spark.apache.org/examples.html

import pyspark
import random

# Reuse the active SparkContext if there is one, otherwise create it.
# getOrCreate() also covers the case where a context is already running but
# is not bound to the name `sc` in this namespace; calling SparkContext()
# a second time in that situation raises an error.
sc = pyspark.SparkContext.getOrCreate()

# Number of random points to draw for the estimate.
NUM_SAMPLES = 1000000

def inside(p):
    """Draw one random point in the unit square and test it against the unit circle.

    ``p`` is the element handed over by the caller (the sample index when used
    with ``filter``); it is not used, every call simply draws a fresh point.

    Returns True when the point lies strictly inside the circle of radius 1
    centred on the origin, False otherwise.
    """
    x = random.random()
    y = random.random()
    squared_distance = x * x + y * y
    return squared_distance < 1

# parallelize: http://spark.apache.org/docs/latest/api/python/pyspark.html?highlight=parallelize
# filter: http://spark.apache.org/docs/latest/api/python/pyspark.html
# Distribute the sample indices as a single RDD split into 10 partitions,
# keep the draws for which inside() returns True, and count them.
count = (
    sc.parallelize(range(0, NUM_SAMPLES), numSlices=10)
    .filter(inside)
    .count()
)

# The fraction of hits approximates pi/4, hence the factor 4.
print("Pi is roughly %f" % (4.0 * count / NUM_SAMPLES))

Pi is roughly 3.144476


In [15]:
# 2. Word Count using Spark

import shutil

OUTPUT_DIR = "./count_output1"  # can be a hdfs://... directory/path also

text_file = sc.textFile("./sample.txt")  # Can be a hdfs://.. path also

# Split every line on single spaces, emit (word, 1) per token and sum the
# ones per distinct word. Splitting on " " yields an empty-string token for
# an empty line.
counts = (
    text_file.flatMap(lambda line: line.split(" "))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)

# saveAsTextFile() refuses to write into a directory that already exists, so
# re-running this cell used to fail. Remove the previous local output first.
# NOTE(review): rmtree only handles local paths; for an hdfs:// target it is
# a no-op and the directory must be removed with the Hadoop tooling instead.
shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
counts.saveAsTextFile(OUTPUT_DIR)

# $ cat count_output1/*
# $ ls -lrt count_output1 : Multiple part files

In [17]:
# Shell escape: print the contents of every file under count_output1/,
# the directory the word-count cell saved its result to.
! cat count_output1/*

('India', 2)
('Updates:', 1)
('Abdullah,', 1)
('tested', 1)
('positive', 1)
('Covid-19,', 1)
('admitted', 1)
('hospital,', 1)
('says', 1)
('son', 1)
('trade', 1)
('current', 1)
('Pakistan', 1)
('Coronavirus', 1)
('Live', 1)
('Farooq', 1)
('who', 1)
('for', 1)
('to', 1)
('Omar.', 1)
('No', 1)
('with', 1)
('under', 1)
('circumstances:', 1)
('PM.', 1)


In [16]:
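# Note: saveAsTextFile raises a "file already exists" error when the output directory is already present (e.g. when the word-count cell is re-run).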


In [26]:
# flatMap vs. map
# flatMap applies the split to every line and flattens the per-line word
# lists into one big RDD of words.
o1 = text_file.flatMap(lambda row: row.split(" "))
o1.collect()

['Hadoop',
 'is',
 'the',
 'Elephant',
 'King!',
 'A',
 'yellow',
 'and',
 'elegant',
 'thing.',
 'He',
 'never',
 'forgets',
 'Useful',
 'data,',
 'or',
 'lets',
 'C',
 'A',
 'wonderful',
 'king',
 'is',
 'Hadoop.',
 'The',
 'elephant',
 'plays',
 'well',
 'with',
 'Sqoop.',
 'But',
 'what',
 'helps',
 'him',
 'to',
 'thrive',
 'Are',
 'Impala,',
 'and',
 'Hive,',
 'And',
 'HDFS',
 'in',
 'the',
 'group.',
 'HadoopFile2.txt:',
 '',
 'Hadoop',
 'is',
 'an',
 'elegant',
 'fellow.',
 'An',
 'elephant',
 'gentle',
 'and',
 'mellow.',
 'He',
 'never',
 'gets',
 'mad,',
 'Or',
 'does',
 'anything',
 'bad,',
 'A',
 'A',
 'A',
 'Because,',
 'at',
 'his',
 'core,',
 'he',
 'is',
 'yellow.',
 'An',
 'extraneous',
 'element',
 'cling!']

In [27]:
# map produces exactly one output element per input line, so every line
# becomes its own list of words (compare with the flattened flatMap result).
o1a = text_file.map(lambda row: row.split(" "))
o1a.collect()

[['Hadoop', 'is', 'the', 'Elephant', 'King!'],
 ['A', 'yellow', 'and', 'elegant', 'thing.'],
 ['He', 'never', 'forgets'],
 ['Useful', 'data,', 'or', 'lets'],
 ['C'],
 ['A', 'wonderful', 'king', 'is', 'Hadoop.'],
 ['The', 'elephant', 'plays', 'well', 'with', 'Sqoop.'],
 ['But', 'what', 'helps', 'him', 'to', 'thrive'],
 ['Are', 'Impala,', 'and', 'Hive,'],
 ['And', 'HDFS', 'in', 'the', 'group.'],
 ['HadoopFile2.txt:'],
 [''],
 ['Hadoop', 'is', 'an', 'elegant', 'fellow.'],
 ['An', 'elephant', 'gentle', 'and', 'mellow.'],
 ['He', 'never', 'gets', 'mad,'],
 ['Or', 'does', 'anything', 'bad,'],
 ['A'],
 ['A'],
 ['A'],
 ['Because,', 'at', 'his', 'core,', 'he', 'is', 'yellow.'],
 ['An', 'extraneous', 'element', 'cling!']]

In [None]:
# flatMap: like map, it returns a new RDD by applying a function to each element of the RDD,
# but the per-element results are flattened into a single sequence instead of being kept as nested lists.

In [28]:
# Pair every word with an initial count of 1.
o2 = o1.map(lambda token: (token, 1))
o2.collect()

[('Hadoop', 1),
 ('is', 1),
 ('the', 1),
 ('Elephant', 1),
 ('King!', 1),
 ('A', 1),
 ('yellow', 1),
 ('and', 1),
 ('elegant', 1),
 ('thing.', 1),
 ('He', 1),
 ('never', 1),
 ('forgets', 1),
 ('Useful', 1),
 ('data,', 1),
 ('or', 1),
 ('lets', 1),
 ('C', 1),
 ('A', 1),
 ('wonderful', 1),
 ('king', 1),
 ('is', 1),
 ('Hadoop.', 1),
 ('The', 1),
 ('elephant', 1),
 ('plays', 1),
 ('well', 1),
 ('with', 1),
 ('Sqoop.', 1),
 ('But', 1),
 ('what', 1),
 ('helps', 1),
 ('him', 1),
 ('to', 1),
 ('thrive', 1),
 ('Are', 1),
 ('Impala,', 1),
 ('and', 1),
 ('Hive,', 1),
 ('And', 1),
 ('HDFS', 1),
 ('in', 1),
 ('the', 1),
 ('group.', 1),
 ('HadoopFile2.txt:', 1),
 ('', 1),
 ('Hadoop', 1),
 ('is', 1),
 ('an', 1),
 ('elegant', 1),
 ('fellow.', 1),
 ('An', 1),
 ('elephant', 1),
 ('gentle', 1),
 ('and', 1),
 ('mellow.', 1),
 ('He', 1),
 ('never', 1),
 ('gets', 1),
 ('mad,', 1),
 ('Or', 1),
 ('does', 1),
 ('anything', 1),
 ('bad,', 1),
 ('A', 1),
 ('A', 1),
 ('A', 1),
 ('Because,', 1),
 ('at', 1),
 ('his'

In [29]:
# Add up the counts of identical words to get the final (word, total) pairs.
o3 = o2.reduceByKey(lambda left, right: left + right)
o3.collect()

[('is', 4),
 ('yellow', 1),
 ('elegant', 2),
 ('thing.', 1),
 ('never', 2),
 ('forgets', 1),
 ('Useful', 1),
 ('C', 1),
 ('wonderful', 1),
 ('king', 1),
 ('Hadoop.', 1),
 ('The', 1),
 ('plays', 1),
 ('But', 1),
 ('Impala,', 1),
 ('Hive,', 1),
 ('And', 1),
 ('HDFS', 1),
 ('in', 1),
 ('group.', 1),
 ('', 1),
 ('an', 1),
 ('fellow.', 1),
 ('gentle', 1),
 ('mellow.', 1),
 ('Or', 1),
 ('anything', 1),
 ('at', 1),
 ('his', 1),
 ('he', 1),
 ('Hadoop', 2),
 ('the', 2),
 ('Elephant', 1),
 ('King!', 1),
 ('A', 5),
 ('and', 3),
 ('He', 2),
 ('data,', 1),
 ('or', 1),
 ('lets', 1),
 ('elephant', 2),
 ('well', 1),
 ('with', 1),
 ('Sqoop.', 1),
 ('what', 1),
 ('helps', 1),
 ('him', 1),
 ('to', 1),
 ('thrive', 1),
 ('Are', 1),
 ('HadoopFile2.txt:', 1),
 ('An', 2),
 ('gets', 1),
 ('mad,', 1),
 ('does', 1),
 ('bad,', 1),
 ('Because,', 1),
 ('core,', 1),
 ('yellow.', 1),
 ('extraneous', 1),
 ('element', 1),
 ('cling!', 1)]

In [35]:
# 3. Spark DataFrame API
# Refer:https://spark.apache.org/docs/latest/sql-getting-started.html

from pyspark.sql import SparkSession

# Reuse the current SparkSession when there is one, otherwise build it.
spark = (
    SparkSession.builder
    .getOrCreate()
)

# Load the JSON file into a DataFrame.
df = spark.read.json(path="./people.json")

# Print the DataFrame contents as a text table on stdout.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [None]:
# Data Sources: https://spark.apache.org/docs/latest/sql-data-sources.html

In [36]:
# Print the column names, types and nullability of the DataFrame as a tree.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [37]:
# Project a single column, referenced by its name as a string.
df.select("name").show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+



In [38]:
# Select columns through Column objects; the second one is an expression
# that adds 1 to age (a null age stays null, as the output shows).
df.select(df['name'], df['age'] + 1).show()


+-------+---------+
|   name|(age + 1)|
+-------+---------+
|Michael|     null|
|   Andy|       31|
| Justin|       20|
+-------+---------+



In [39]:
# Keep only the rows whose age is greater than 21.
df.filter(df['age'] > 21).show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [40]:
# Count the rows per distinct age value; null forms its own group in the output.
df.groupBy("age").count().show()

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+



In [41]:
# Register the DataFrame as a SQL temporary view named "people"
# (replacing a view of that name if one is already registered).
df.createOrReplaceTempView("people")

# Query the view with SQL; spark.sql returns the result as a DataFrame.
sqlDF = spark.sql("SELECT * FROM people")
sqlDF.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [42]:
# Demonstration: querying a view that was never registered ("people1")
# raises AnalysisException. Catch and print it so the error is shown without
# aborting a "Run All" of the notebook.
from pyspark.sql.utils import AnalysisException

try:
    sqlDF = spark.sql("SELECT * FROM people1")
    sqlDF.show()
except AnalysisException as err:
    print("AnalysisException:", err)

AnalysisException: Table or view not found: people1; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [people1]


In [43]:
# Register the DataFrame as a global temporary view named "people".
# createOrReplaceGlobalTempView keeps this cell re-runnable: the plain
# createGlobalTempView raises once a view with that name already exists.
df.createOrReplaceGlobalTempView("people")

# Global temporary view is tied to a system preserved database `global_temp`
spark.sql("SELECT * FROM global_temp.people").show()


# Global temporary view is cross-session
spark.newSession().sql("SELECT * FROM global_temp.people").show()


+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [49]:
#4. Vectors and Correlation Coefficient
# https://spark.apache.org/docs/latest/ml-statistics.html

from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation

# https://spark.apache.org/docs/1.1.0/api/python/pyspark.mllib.linalg.Vectors-class.html
# Sparse vector of size 4 whose only non-zero entries sit at index 0 and index 3.
s1 = Vectors.sparse(4, {0: 1.0, 3: -2.0})
print(s1)
# The same values in dense form; the positions not listed above show up as 0.0.
print(Vectors.dense(s1))
      




(4,[0,3],[1.0,-2.0])
[1.0,0.0,0.0,-2.0]


In [50]:
# Four rows, each a 1-tuple holding one 4-dimensional feature vector
# (a mix of sparse and dense representations).
# NOTE: this rebinds `df`, which earlier in the notebook held the people DataFrame.
data = [
    (Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
    (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
    (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
    (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),),
]
df = spark.createDataFrame(data, schema=["features"])
df.show()

+--------------------+
|            features|
+--------------------+
|(4,[0,3],[1.0,-2.0])|
|   [4.0,5.0,0.0,3.0]|
|   [6.0,7.0,0.0,8.0]|
| (4,[0,3],[9.0,1.0])|
+--------------------+



In [51]:
# Correlation.corr yields a DataFrame; head() fetches its first Row, whose
# first field is the correlation matrix of the "features" column.
# The third feature is 0.0 in every row of `data`, which is why its
# off-diagonal entries come out as nan.
r1 = Correlation.corr(df, "features", method="pearson").head()
print("Pearson correlation matrix:\n" + str(r1[0]))


Pearson correlation matrix:
DenseMatrix([[1.        , 0.05564149,        nan, 0.40047142],
             [0.05564149, 1.        ,        nan, 0.91359586],
             [       nan,        nan, 1.        ,        nan],
             [0.40047142, 0.91359586,        nan, 1.        ]])


In [53]:
# Show the whole Row returned by head(): a single field named
# pearson(features) that holds the DenseMatrix.
print(r1)

Row(pearson(features)=DenseMatrix(4, 4, [1.0, 0.0556, nan, 0.4005, 0.0556, 1.0, nan, 0.9136, nan, nan, 1.0, nan, 0.4005, 0.9136, nan, 1.0], False))


In [52]:
# Same computation with the rank-based Spearman method instead of the
# default Pearson method.
r2 = Correlation.corr(df, "features", method="spearman").head()
print("Spearman correlation matrix:\n" + str(r2[0]))

Spearman correlation matrix:
DenseMatrix([[1.        , 0.10540926,        nan, 0.4       ],
             [0.10540926, 1.        ,        nan, 0.9486833 ],
             [       nan,        nan, 1.        ,        nan],
             [0.4       , 0.9486833 ,        nan, 1.        ]])
