## Using Spark in Google Colab

In [0]:
# Download and install Java 8 and Spark 2.4.5, plus the findspark helper.
# -nc (no-clobber) skips the download when the archive is already present, so
# re-running this cell does not leave duplicate "*.tgz.1" copies behind.
! apt-get install openjdk-8-jdk-headless -qq > /dev/null
! wget -q -nc https://archive.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
! tar xf spark-2.4.5-bin-hadoop2.7.tgz
! pip install -q findspark

In [0]:
# Tell downstream tooling where the Java 8 install and the unpacked Spark
# distribution live.
import os

os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.5-bin-hadoop2.7",
})

In [7]:
# Pin PySpark to the same release as the Spark distribution under SPARK_HOME
# (2.4.5) so a later re-run cannot pull a newer, mismatched version.
! python -m pip install pyspark==2.4.5

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/9a/5a/271c416c1c2185b6cb0151b29a91fff6fcaed80173c8584ff6d20e46b465/pyspark-2.4.5.tar.gz (217.8MB)
[K     |████████████████████████████████| 217.8MB 65kB/s 
[?25hCollecting py4j==0.10.7
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K     |████████████████████████████████| 204kB 51.4MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-2.4.5-py2.py3-none-any.whl size=218257927 sha256=68313c482b7fd80d18475a57ddc198080791bdf0f4e54b2793f6c5283c58cf9a
  Stored in directory: /root/.cache/pip/wheels/bf/db/04/61d66a5939364e756eb1c1be4ec5bdce6e04047fc7929a3c3c
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 pyspark-2.4.5


### Use of Spark

In [0]:
from pyspark.sql import SparkSession

In [0]:
# Reuse the active session if one exists, otherwise start a local one
# named "Learning_Spark"; local[*] runs Spark inside this machine.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Learning_Spark")
    .getOrCreate()
)

In [10]:
# Display the version reported by the running session (expected to match the
# 2.4.5 distribution downloaded earlier in the notebook).
spark.version

'2.4.5'

In [0]:
# Load the sample CSV with the reader's default settings spelled out:
# no header handling and no schema inference. As a result the file's header
# line is read as an ordinary data row, columns are named _c0.._c8, and every
# column is typed as string.
df = spark.read.csv(
    'sample_data/california_housing_test.csv',
    header=False,
    inferSchema=False,
)

In [12]:
# Print the column names, types and nullability Spark assigned to the loaded data.
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)



In [15]:
# Action: take() runs the query and returns the first two rows as a list of
# Row objects.
df.select(['_c0', '_c1']).take(2)

[Row(_c0='longitude', _c1='latitude'), Row(_c0='-122.050000', _c1='37.370000')]

In [16]:
# Transformation: select() only describes a new DataFrame; no rows are
# computed here, so the cell displays just the resulting column types.
df.select(['_c0', '_c1'])

DataFrame[_c0: string, _c1: string]

In [18]:
# Print the two selected columns as a text table; 20 rows is show()'s default.
df.select(['_c0', '_c1']).show(20)

+-----------+---------+
|        _c0|      _c1|
+-----------+---------+
|  longitude| latitude|
|-122.050000|37.370000|
|-118.300000|34.260000|
|-117.810000|33.780000|
|-118.360000|33.820000|
|-119.670000|36.330000|
|-119.560000|36.510000|
|-121.430000|38.630000|
|-120.650000|35.480000|
|-122.840000|38.400000|
|-118.020000|34.080000|
|-118.240000|33.980000|
|-119.120000|35.850000|
|-121.930000|37.250000|
|-117.030000|32.970000|
|-117.970000|33.730000|
|-117.990000|33.810000|
|-120.810000|37.530000|
|-121.200000|38.690000|
|-118.880000|34.210000|
+-----------+---------+
only showing top 20 rows



In [19]:
# Print the leading rows of every column as a text table (show() prints the
# top 20 rows by default).
df.show()

+-----------+---------+------------------+-----------+--------------+-----------+----------+-------------+------------------+
|        _c0|      _c1|               _c2|        _c3|           _c4|        _c5|       _c6|          _c7|               _c8|
+-----------+---------+------------------+-----------+--------------+-----------+----------+-------------+------------------+
|  longitude| latitude|housing_median_age|total_rooms|total_bedrooms| population|households|median_income|median_house_value|
|-122.050000|37.370000|         27.000000|3885.000000|    661.000000|1537.000000|606.000000|     6.608500|     344700.000000|
|-118.300000|34.260000|         43.000000|1510.000000|    310.000000| 809.000000|277.000000|     3.599000|     176500.000000|
|-117.810000|33.780000|         27.000000|3589.000000|    507.000000|1484.000000|495.000000|     5.793400|     270500.000000|
|-118.360000|33.820000|         28.000000|  67.000000|     15.000000|  49.000000| 11.000000|     6.135900|     330000.

In [21]:
# Count how many rows share each distinct _c0 value and print the ten most
# frequent values, highest count first.
(
    df.groupBy("_c0")
    .count()
    .orderBy("count", ascending=False)
    .show(10)
)

+-----------+-----+
|        _c0|count|
+-----------+-----+
|-118.210000|   26|
|-118.260000|   26|
|-118.280000|   25|
|-118.290000|   25|
|-118.270000|   25|
|-118.300000|   24|
|-118.140000|   23|
|-118.350000|   22|
|-118.020000|   21|
|-118.330000|   21|
+-----------+-----+
only showing top 10 rows

