In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse, if one already exists) a Spark session named "training".
# 'local[*]' runs Spark locally with as many worker threads as logical cores.
spark = SparkSession.builder.master('local[*]').appName("training").getOrCreate()

# Create a spark RDD
(resilient distributed dataset)

In [3]:
# Distribute the squares of 0..19 as an RDD.
rrd = spark.sparkContext.parallelize(
    [n * n for n in range(20)]
)

In [4]:
# Echoing the RDD shows its description, not the elements it holds.
rrd

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:262

In [5]:
# Action: returns the number of elements in the RDD.
rrd.count()

20

In [6]:
# Action: adds up all elements of the RDD and returns the total.
rrd.sum()

2470

In [7]:
# Load the text file as an RDD of strings, one element per line of the file.
# The path is relative to the notebook's working directory.
rrd_hamlet = spark.sparkContext.textFile('./hamlet.txt')
rrd_hamlet

./hamlet.txt MapPartitionsRDD[4] at textFile at NativeMethodAccessorImpl.java:0

In [8]:
# Average number of whitespace-separated words per line of the text.
# Blank lines contribute zero words but still count as lines.
total_lines = rrd_hamlet.count()  # action
total_words = rrd_hamlet.map(lambda text: len(text.split())).sum()  # transformation, then action
words_per_line = total_words / total_lines
words_per_line

10.77878787878788

# Create a spark dataframe

In [9]:
# Read the CSV, taking column names from its first row.
# No schema inference is requested, so every column is loaded as a string.
df = (
    spark.read
    .option('header', True)
    .csv('./Sacramentorealestatetransactions.csv')
)

In [10]:
# Print the first rows as a text table; long cell values are truncated with "...".
df.show()

+--------------------+--------------+-----+-----+----+-----+------+-----------+--------------------+------+---------+-----------+
|              street|          city|  zip|state|beds|baths|sq__ft|       type|           sale_date| price| latitude|  longitude|
+--------------------+--------------+-----+-----+----+-----+------+-----------+--------------------+------+---------+-----------+
|        3526 HIGH ST|    SACRAMENTO|95838|   CA|   2|    1|   836|Residential|Wed May 21 00:00:...| 59222|38.631913|-121.434879|
|         51 OMAHA CT|    SACRAMENTO|95823|   CA|   3|    1|  1167|Residential|Wed May 21 00:00:...| 68212|38.478902|-121.431028|
|      2796 BRANCH ST|    SACRAMENTO|95815|   CA|   2|    1|   796|Residential|Wed May 21 00:00:...| 68880|38.618305|-121.443839|
|    2805 JANETTE WAY|    SACRAMENTO|95815|   CA|   2|    1|   852|Residential|Wed May 21 00:00:...| 69307|38.616835|-121.439146|
|     6001 MCMAHON DR|    SACRAMENTO|95824|   CA|   2|    1|   797|Residential|Wed May 21 

In [11]:
# describe() returns a new DataFrame with a leading `summary` column.
# NOTE: echoing it only shows the column signature; call .show() on the result
# to print the actual summary rows.
df.describe()

DataFrame[summary: string, street: string, city: string, zip: string, state: string, beds: string, baths: string, sq__ft: string, type: string, sale_date: string, price: string, latitude: string, longitude: string]

In [12]:
# Inspect the column types. Every field is StringType because the CSV was read
# without schema inference.
df.schema

StructType(List(StructField(street,StringType,true),StructField(city,StringType,true),StructField(zip,StringType,true),StructField(state,StringType,true),StructField(beds,StringType,true),StructField(baths,StringType,true),StructField(sq__ft,StringType,true),StructField(type,StringType,true),StructField(sale_date,StringType,true),StructField(price,StringType,true),StructField(latitude,StringType,true),StructField(longitude,StringType,true)))

# Register the dataframe as a temporary view for use with Spark SQL

In [13]:
# Expose df to SQL queries under the table name 'Sacramento'
# (replaces any existing temp view with that name).
df.createOrReplaceTempView('Sacramento')

In [16]:
# Per-zip aggregate: number of sales and mean sale price.
# `price` is a string column in the view; MEAN() still yields numeric results.
df2 = spark.sql(
    'SELECT zip, COUNT(*) as count, MEAN(price) AS mean_price '
    'FROM Sacramento GROUP BY zip'
)
df2.show()

+-----+-----+------------------+
|  zip|count|        mean_price|
+-----+-----+------------------+
|95834|   22|248426.27272727274|
|95667|   10|          363863.4|
|95662|   11|279159.54545454547|
|95670|   21| 236060.2857142857|
|95626|    4|          132866.0|
|95690|    1|          380000.0|
|95722|    1|          230000.0|
|95630|   17|414960.17647058825|
|95832|   12|175196.83333333334|
|95826|   18|181119.94444444444|
|95831|   10|          313271.0|
|95742|   11| 350009.0909090909|
|95608|   20|         295684.75|
|95631|    1|          194818.0|
|95841|    7|213806.14285714287|
|95815|   18|          115133.0|
|95824|   12|114467.58333333333|
|95864|    5|          364400.0|
|95682|   10|          268650.0|
|95757|   36| 338334.5833333333|
+-----+-----+------------------+
only showing top 20 rows



# Integration with Pandas

Convert to a Pandas dataframe...

In [17]:
# Materialise the aggregated Spark result as an in-memory pandas DataFrame.
pd_df = df2.toPandas()
pd_df

Unnamed: 0,zip,count,mean_price
0,95834,22,248426.272727
1,95667,10,363863.400000
2,95662,11,279159.545455
3,95670,21,236060.285714
4,95626,4,132866.000000
...,...,...,...
63,95819,4,465750.000000
64,95823,61,175243.049180
65,95828,45,184676.955556
66,95624,34,250743.676471


and back...

In [18]:
# Build a Spark DataFrame from the pandas one and print its first rows.
spark.createDataFrame(pd_df).show()

+-----+-----+------------------+
|  zip|count|        mean_price|
+-----+-----+------------------+
|95834|   22|248426.27272727274|
|95667|   10|          363863.4|
|95662|   11|279159.54545454547|
|95670|   21| 236060.2857142857|
|95626|    4|          132866.0|
|95690|    1|          380000.0|
|95722|    1|          230000.0|
|95630|   17|414960.17647058825|
|95832|   12|175196.83333333334|
|95826|   18|181119.94444444444|
|95831|   10|          313271.0|
|95742|   11| 350009.0909090909|
|95608|   20|         295684.75|
|95631|    1|          194818.0|
|95841|    7|213806.14285714287|
|95815|   18|          115133.0|
|95824|   12|114467.58333333333|
|95864|    5|          364400.0|
|95682|   10|          268650.0|
|95757|   36| 338334.5833333333|
+-----+-----+------------------+
only showing top 20 rows

