In [1]:
'''
http://spark.apache.org/docs/latest/cluster-overview.html
https://spark.apache.org/docs/latest/index.html
https://spark.apache.org/docs/latest/rdd-programming-guide.html
https://www.edureka.co/blog/interview-questions/top-apache-spark-interview-questions-2016/
https://www.youtube.com/playlist?list=PL9ooVrP1hQOGyFc60sExNX1qBWJyV5IMb
'''
from pyspark.sql import SparkSession

In [2]:
# Build (or fetch the already-running) SparkSession named "Test" against a
# local master, then report which Spark version backs it.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Test")
    .getOrCreate()
)
print(spark.version)

2.4.4


In [3]:
# Load the flights CSV: the first row is the header, column types are
# inferred, and the literal "NA" is read as null.
flights = spark.read.csv(
    "c:/data/flights-larger.csv",
    sep=",",
    header=True,
    inferSchema=True,
    nullValue="NA",
)
print("Total number of records = ", flights.count())
flights.show(5)
# Last expression of the cell: the list of (column name, type) pairs.
flights.dtypes

Total number of records =  275000
+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 10| 10|  1|     OO|  5836|ORD| 157|  8.18|      51|   27|
|  1|  4|  1|     OO|  5866|ORD| 466|  15.5|     102| null|
| 11| 22|  1|     OO|  6016|ORD| 738|  7.17|     127|  -19|
|  2| 14|  5|     B6|   199|JFK|2248| 21.17|     365|   60|
|  5| 25|  3|     WN|  1675|SJC| 386| 12.92|      85|   22|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 5 rows



[('mon', 'int'),
 ('dom', 'int'),
 ('dow', 'int'),
 ('carrier', 'string'),
 ('flight', 'int'),
 ('org', 'string'),
 ('mile', 'int'),
 ('depart', 'double'),
 ('duration', 'int'),
 ('delay', 'int')]

In [4]:
# Three small in-memory RDDs: (key, int) pairs, (key, list) pairs, and the
# integers 0..99.
rdd1 = spark.sparkContext.parallelize([('a', 7), ('a', 2), ('b', 2)])
rdd2 = spark.sparkContext.parallelize([
    ("a", ["x", "y", "z"]),
    ("b", ["p", "r"]),
])
rdd3 = spark.sparkContext.parallelize(range(100))

In [5]:
# Every element of rdd1 is a (key, value) tuple, so `a + b` concatenates the
# tuples into one flat tuple instead of summing the values.
# NOTE(review): RDD.reduce expects a commutative and associative operator;
# tuple concatenation is not commutative, so the element order of the result
# may depend on partitioning. Confirm this is intended -- if a per-key sum was
# wanted, reduceByKey is the usual tool.
rdd1.reduce(lambda a,b: a+b)

('a', 7, 'a', 2, 'b', 2)

In [6]:
# Expand each (key, list) pair into one (key, item) pair per list item.
rdd2.flatMapValues(lambda items: items).collect()

[('a', 'x'), ('a', 'y'), ('a', 'z'), ('b', 'p'), ('b', 'r')]

In [7]:
# Read people.json (located under the Spark 2.4.4 examples directory, per the
# path) into a DataFrame and print its rows.
df = spark.read.json('C:/D/Spark-2.4.4/examples/src/main/resources/people.json')
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [8]:
# Print the DataFrame's columns as a tree with each column's type and
# nullability.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [9]:
# Project only the "name" column and print it.
df.select(df["name"]).show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+



In [10]:
# Show each name next to age + 1; a null age stays null.
df.select("name", df["age"] + 1).show()

+-------+---------+
|   name|(age + 1)|
+-------+---------+
|Michael|     null|
|   Andy|       31|
| Justin|       20|
+-------+---------+



In [11]:
# Keep only rows whose age is greater than 21; the row with a null age does
# not pass the comparison and is dropped as well.
df.filter(df["age"] > 21).show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [12]:
# Count the rows for each distinct age; null forms its own group.
(
    df.groupBy("age")
    .count()
    .show()
)

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+



In [13]:
# Register df as the temporary view "people" so it can be addressed from SQL,
# then read every row back through spark.sql and print the result.
df.createOrReplaceTempView("people")
sqlDf = spark.sql("select * from people")
sqlDf.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [14]:
# Register df as the global temporary view "Gpeople". In SQL the view is
# addressed with the `global_temp.` qualifier, as the query below does.
df.createOrReplaceGlobalTempView("Gpeople")
gSqlDF = spark.sql("select * from global_temp.Gpeople")
gSqlDF.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [15]:
# Run the same query from a separate session created with spark.newSession();
# the global temp view is still resolvable from that session.
gSqlDF1 = spark.newSession().sql("select * from global_temp.Gpeople")
gSqlDF1.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [16]:
from pyspark.sql.types import *
# Read people.txt as an RDD of text lines and split every line on commas.
sc = spark.sparkContext
lines = sc.textFile(
    'C:/D/Spark-2.4.4/examples/src/main/resources/people.txt'
)
parts = lines.map(lambda line: line.split(','))
# Peek at the first split record.
parts.take(1)

[['Michael', ' 29']]

In [17]:
# Turn each split record into a (name, age) tuple, trimming the whitespace
# that the comma split leaves around the age.
people = parts.map(lambda fields: (fields[0], fields[1].strip()))
people.take(1)

[('Michael', '29')]

In [18]:
# Build a schema of nullable string columns from a space-separated list of
# column names, then apply it to the (name, age) RDD.
schemaString = "name age"
fields = [
    StructField(column_name, StringType(), nullable=True)
    for column_name in schemaString.split()
]
schema = StructType(fields)
schemaPeople = spark.createDataFrame(people, schema)
# Peek at the first Row of the resulting DataFrame.
schemaPeople.take(1)

[Row(name='Michael', age='29')]

In [19]:
# Register the schema-applied DataFrame as the temporary view "peopleS" and
# read every row back through SQL.
schemaPeople.createOrReplaceTempView("peopleS")
sSqlDF = spark.sql('select * from peopleS')
sSqlDF.show()

+-------+---+
|   name|age|
+-------+---+
|Michael| 29|
|   Andy| 30|
| Justin| 19|
+-------+---+



In [20]:
# Load users.parquet and show two of its columns.
# NOTE(review): no format is passed to load(), so this relies on the session's
# default data source being parquet -- confirm spark.sql.sources.default has
# not been changed.
parquetFile = spark.read.load("C:/D/Spark-2.4.4/examples/src/main/resources/users.parquet")
parquetFile.select('name', 'favorite_color').show()

+------+--------------+
|  name|favorite_color|
+------+--------------+
|Alyssa|          null|
|   Ben|           red|
+------+--------------+



In [21]:
# Register the parquet-backed DataFrame as the temporary view "users" and
# select every column back through SQL.
parquetFile.createOrReplaceTempView('users')
users = spark.sql('select * from users')
users.show()

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+



In [25]:
import numpy as np
import pandas as pd
# Turn on the Arrow-based path for pandas <-> Spark DataFrame conversion.
spark.conf.set('spark.sql.execution.arrow.enabled', True)
# Seed NumPy's global generator so the random frame -- and the summary
# statistics printed from it -- are identical on every Restart & Run All.
np.random.seed(42)
# 100 rows x 3 columns of uniform [0, 1) samples; columns are labelled 0, 1, 2.
pdf = pd.DataFrame(np.random.rand(100, 3))
print(pdf.describe())

                0           1           2
count  100.000000  100.000000  100.000000
mean     0.490506    0.533859    0.497148
std      0.266359    0.289837    0.301675
min      0.003344    0.004407    0.000645
25%      0.272118    0.289179    0.241219
50%      0.480920    0.553831    0.494208
75%      0.720959    0.783475    0.769777
max      0.977570    0.990983    0.987262


In [26]:
# Pandas DataFrame to Spark DataFrame
# Note: this rebinds `df`, which until now held the people.json DataFrame.
df = spark.createDataFrame(pdf)
# First five rows as a list of Row objects; the pandas column labels 0, 1, 2
# appear as the Row field names.
df.take(5)

[Row(0=0.5504192353513682, 1=0.8125350302700148, 2=0.002391349058955994),
 Row(0=0.3249891069751124, 1=0.8937847968738265, 2=0.39497862158122976),
 Row(0=0.36798210239623264, 1=0.7962814145234695, 2=0.5740302947612959),
 Row(0=0.3049209103331384, 1=0.6498452763116344, 2=0.9486840470970592),
 Row(0=0.6203759002804303, 1=0.7168155105786387, 2=0.28477211141246317)]

In [None]:
# Spark DataFrame to Pandas DataFrame
# toPandas() already converts every column, so no select('*') is needed.
result_pdf = df.toPandas()
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
result_pdf.info()

In [30]:
from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import LongType

def multiply_func(a, b):
    """Return the product ``a * b``.

    Only the ``*`` operator is applied, so the function works for plain
    numbers and, element-wise, for pandas Series.
    """
    product = a * b
    return product

# Exercise multiply_func on a local pandas Series: x * x is element-wise, so
# this prints the squares of 1, 2 and 3.
x = pd.Series([1,2,3])
print(multiply_func(x, x))

0    1
1    4
2    9
dtype: int64


In [None]:
import pandas as pd

from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import LongType

# Declare the function and create the UDF
def multiply_func(a, b):
    """Return the product ``a * b``.

    Only the ``*`` operator is applied, so the function works for plain
    numbers and, element-wise, for pandas Series.

    NOTE(review): an identical ``multiply_func`` is already defined in an
    earlier cell of this notebook; this definition shadows it.
    """
    product = a * b
    return product

# Wrap multiply_func as a pandas UDF whose declared return type is LongType.
multiply = pandas_udf(multiply_func, returnType=LongType())

# The function for a pandas_udf should be able to execute with local Pandas data
# (this rebinds `x`, which an earlier cell also assigns).
x = pd.Series([1, 2, 3])
print(multiply_func(x, x))

# Create a Spark DataFrame, 'spark' is an existing SparkSession
# Note: this rebinds `df` to a one-column ("x") DataFrame built from the Series.
df = spark.createDataFrame(pd.DataFrame(x, columns=["x"]))

# Execute function as a Spark vectorized UDF: column "x" multiplied by itself.
df.select(multiply(col("x"), col("x"))).show()