In [1]:
import os
import sys
# Point this kernel at the local Spark 2.3.0 install and make its bundled
# Python libraries (py4j + pyspark) importable before pyspark is imported.
spark_home = '/home/cloudera/spark230hadoop26'
py_lib_dir = spark_home + '/python/lib'

os.environ.update({
    'SPARK_HOME': spark_home,
    'PYLIB': py_lib_dir,
    # Ask spark-submit to fetch the MongoDB connector package when the JVM starts.
    'PYSPARK_SUBMIT_ARGS': '--packages org.mongodb.spark:mongo-spark-connector_2.11:2.2.3 pyspark-shell',
    # Python interpreter configured for PySpark.
    'PYSPARK_PYTHON': '/home/cloudera/anaconda3/bin/python',
})

# py4j goes to position 0 and pyspark to position 1 of sys.path.
for position, archive in enumerate(('/py4j-0.10.6-src.zip', '/pyspark.zip')):
    sys.path.insert(position, py_lib_dir + archive)

In [2]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np

In [3]:
# Build (or reuse) a Hive-enabled SparkSession named 'TestHive'.
# The warehouse location has to be supplied through 'spark.sql.warehouse.dir';
# the key used previously ('spark.warehouse.dir') is not a documented Spark
# property, so the intended warehouse path was not being applied.
spark = (
    SparkSession.builder
    .appName('TestHive')
    .config('spark.sql.warehouse.dir', '/apps/hive/warehouse')
    .enableHiveSupport()
    .getOrCreate()
)

In [4]:
# Sanity check: the `spark` session created above should be usable straight away,
# so display the Spark version it reports.
spark.version

'2.3.0'

In [5]:
# Keep a handle on the SparkContext that belongs to the session.
sc = spark.sparkContext

In [6]:
# Set the SparkContext log level to ERROR to keep cell output readable.
sc.setLogLevel('ERROR')

In [7]:
# Small in-memory sample: (name, age) pairs, one of them without an age.
people = spark.createDataFrame(
    [
        ("Bilbo Baggins", 50),
        ("Gandalf", 1000),
        ("Thorin", 195),
        ("Balin", 178),
        ("Kili", 77),
        ("Dwalin", 169),
        ("Oin", 167),
        ("Gloin", 158),
        ("Fili", 82),
        ("Bombur", None),  # no age recorded
    ],
    ["name", "age"],
)

In [8]:
# Preview the first two rows of the sample DataFrame.
people.show(2)

+-------------+----+
|         name| age|
+-------------+----+
|Bilbo Baggins|  50|
|      Gandalf|1000|
+-------------+----+
only showing top 2 rows



In [None]:
# Write the sample rows to the `contacts` collection of the `people` database
# on the local MongoDB. The save mode is "append", so rows are added to
# whatever the collection already holds.
(
    people.write
    .format("com.mongodb.spark.sql.DefaultSource")
    .mode("append")
    .option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017')
    .option("database", "people")
    .option("collection", "contacts")
    .save()
)

In [9]:
# Read the contacts back from MongoDB; the URI names people.contacts.
df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", "mongodb://127.0.0.1/people.contacts")
    .load()
)

In [10]:
# Display the rows that were read back from the contacts collection.
df.show()

+--------------------+------+-------------+
|                 _id|   age|         name|
+--------------------+------+-------------+
|[5b3748d6ffc0ca4a...| 169.0|       Dwalin|
|[5b3748d6ffc0ca4a...| 167.0|          Oin|
|[5b3748d6ffc0ca4a...| 158.0|        Gloin|
|[5b3748d6ffc0ca4a...|  82.0|         Fili|
|[5b3748d6ffc0ca4a...|  null|       Bombur|
|[5b3748d6ffc0ca4a...|  50.0|Bilbo Baggins|
|[5b3748d6ffc0ca4a...|1000.0|      Gandalf|
|[5b3748d6ffc0ca4a...| 195.0|       Thorin|
|[5b3748d6ffc0ca4a...| 178.0|        Balin|
|[5b3748d6ffc0ca4a...|  77.0|         Kili|
|[5b3749dc361d6856...|  27.0|    test_name|
|[5b375b0dffc0ca4e...|  50.0|Bilbo Baggins|
|[5b375b0dffc0ca4e...|1000.0|      Gandalf|
|[5b375b0dffc0ca4e...| 195.0|       Thorin|
|[5b375b0dffc0ca4e...| 178.0|        Balin|
|[5b375b0dffc0ca4e...|  77.0|         Kili|
|[5b375b0dffc0ca4e...| 169.0|       Dwalin|
|[5b375b0dffc0ca4e...| 167.0|          Oin|
|[5b375b0dffc0ca4e...| 158.0|        Gloin|
|[5b375b0dffc0ca4e...|  82.0|   

In [11]:
# Minimal aggregation pipeline against the fruit collection: a single $match
# stage that keeps documents whose type is 'apple' and delivers back the results.
pipeline = "{'$match': {'type': 'apple'}}"
df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("pipeline", pipeline)
    .option('spark.mongodb.input.uri', 'mongodb://localhost:27017')
    .option('database', 'people')
    .option('collection', 'fruit')
    .load()
)
df.show()

+---+---+-----+
|_id|qty| type|
+---+---+-----+
|1.0|5.0|apple|
+---+---+-----+



In [13]:
# Single $group stage for the zipcodes collection: one output document per
# state, with totalPop holding the sum of `pop` for that state.
simple_zip_pipeline = "{ '$group': { '_id' : '$state', totalPop: { $sum: '$pop' } } }"

In [14]:
# Run the per-state population pipeline against people.zipcodes and show the result.
simple_zip_df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("pipeline", simple_zip_pipeline)
    .option('spark.mongodb.input.uri', 'mongodb://localhost:27017')
    .option('database', 'people')
    .option('collection', 'zipcodes')
    .load()
)
simple_zip_df.show()

+---+--------+
|_id|totalPop|
+---+--------+
| CA|29754890|
| MT|  798948|
| MS| 2573216|
| FL|12686644|
| AR| 2350725|
| GA| 6478216|
| WA| 4866692|
| SC| 3486703|
| MN| 4372982|
| NE| 1578139|
| MD| 4781379|
| TN| 4876457|
| DE|  666168|
| DC|  606900|
| AZ| 3665228|
| ME| 1226648|
| OR| 2842321|
| AL| 4040587|
| PA|11881643|
| RI| 1003218|
+---+--------+
only showing top 20 rows



In [15]:
# From the zipcodes collection: first sum the population per state ($group),
# then keep only the states whose total is at least 10 million ($match with $gte).
zip_pipeline = '''[{ '$group': { '_id' : '$state', totalPop: { $sum: '$pop' } } }, 
 { '$match': { totalPop: { $gte: 10000000 } }}]'''

In [16]:
# Run the group-then-filter pipeline against people.zipcodes and show the states it returns.
zip_df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("pipeline", zip_pipeline)
    .option('spark.mongodb.input.uri', 'mongodb://localhost:27017')
    .option('database', 'people')
    .option('collection', 'zipcodes')
    .load()
)
zip_df.show()

+---+--------+
|_id|totalPop|
+---+--------+
| CA|29754890|
| FL|12686644|
| PA|11881643|
| NY|17990402|
| OH|10846517|
| IL|11427576|
| TX|16984601|
+---+--------+



In [62]:
# Three-stage aggregation over the zipcodes collection:
#   1. $group by (state, city) and sum `pop` to get each city's population;
#   2. $group by state, counting the cities (nocities) and averaging their populations (avgCityPop);
#   3. $sort by avgCityPop, highest first.
avg_city_pop_pipeline = '''[{ $group: { _id: { state: "$state", city: "$city" }, pop: { $sum: "$pop" } } },
   { $group: { _id: "$_id.state", nocities: {$sum: 1}, avgCityPop: { $avg: "$pop" } } },
   { $sort: {avgCityPop: -1} }
]'''

In [63]:
# Run the average-city-population pipeline against people.zipcodes and show the result.
avg_city_df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("pipeline", avg_city_pop_pipeline)
    .option('spark.mongodb.input.uri', 'mongodb://localhost:27017')
    .option('database', 'people')
    .option('collection', 'zipcodes')
    .load()
)
avg_city_df.show()

+---+------------------+--------+
|_id|        avgCityPop|nocities|
+---+------------------+--------+
| DC|          303450.0|       2|
| CA| 27756.42723880597|    1072|
| FL|27400.958963282937|     463|
| AZ| 20591.16853932584|     178|
| RI|19292.653846153848|      52|
| NV|18209.590909090908|      66|
| HI|15831.842857142858|      70|
| NJ| 15775.89387755102|     490|
| MA| 14855.37037037037|     405|
| CT|         14674.625|     224|
| DE| 14481.91304347826|      46|
| TX| 13775.02108678021|    1233|
| NY|13131.680291970803|    1370|
| OH|12700.839578454332|     854|
| MD|12615.775725593667|     379|
| WA|12258.670025188916|     397|
| MI|12087.512353706112|     769|
| GA| 11547.62210338681|     561|
| SC|11139.626198083068|     313|
| NC|10622.815705128205|     624|
+---+------------------+--------+
only showing top 20 rows



In [19]:
# Load the zipcodes collection without any pipeline, for use with Spark SQL;
# the URI names people.zipcodes.
df_for_sql = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", "mongodb://127.0.0.1/people.zipcodes")
    .load()
)

In [26]:
# Register the zipcodes DataFrame as temp view 'ziptbl' so the spark.sql queries can refer to it.
df_for_sql.createOrReplaceTempView('ziptbl')

In [27]:
# Peek at five rows of the view to see which columns are available.
spark.sql('select * from ziptbl limit 5').show()

+-----+-----------+--------------------+-----+-----+
|  _id|       city|                 loc|  pop|state|
+-----+-----------+--------------------+-----+-----+
|01001|     AGAWAM|[-72.622739, 42.0...|15338|   MA|
|01002|    CUSHMAN|[-72.51565, 42.37...|36963|   MA|
|01005|      BARRE|[-72.108354, 42.4...| 4546|   MA|
|01007|BELCHERTOWN|[-72.410953, 42.2...|10579|   MA|
|01008|  BLANDFORD|[-72.936114, 42.1...| 1240|   MA|
+-----+-----------+--------------------+-----+-----+



In [43]:
# Population per (state, city): sums `pop` over every row sharing the same state and city.
# NOTE(review): the alias `avgpop` is misleading, since the column holds a sum and not an average.
spark.sql('select state, city, sum(pop) as avgpop from ziptbl group by state, city').show()

+-----+---------------+------+
|state|           city|avgpop|
+-----+---------------+------+
|   MA|   CHESTERFIELD|   177|
|   NH|          GONIC|  4474|
|   ME|           SACO| 16192|
|   VT|UNIV OF VERMONT|     0|
|   NJ|       PATERSON|141382|
|   NJ|  THREE BRIDGES|   378|
|   NY|      ROSENDALE|  2939|
|   NY|      CONSTABLE|  1949|
|   NY|     GEORGETOWN|   611|
|   NY|      CLAYVILLE|   641|
|   NY|          EATON|  1583|
|   NY|    OSWEGATCHIE|   287|
|   NY|   GREAT VALLEY|  2217|
|   NY|       REXVILLE|   539|
|   PA|      LAMBERTON|  4703|
|   PA|     GRAPEVILLE|   683|
|   PA| NEW ENTERPRISE|  1898|
|   PA| ST CLAIRSVILLE|   174|
|   PA|       NEW PARK|  1190|
|   PA| NORTHUMBERLAND|  7326|
+-----+---------------+------+
only showing top 20 rows



In [45]:
# Spark SQL counterpart of avg_city_pop_pipeline: the inner query sums `pop` per
# (state, city); the outer query counts those cities per state, averages their
# populations, and orders by that average, highest first.
# NOTE(review): the inner alias `avgpop` is a per-city sum, not an average.
spark.sql('''select state, count(avgpop) as nocities, avg(avgpop) as avgcitypop from 
            ( select state, city, sum(pop) as avgpop from ziptbl group by state, city) as f
            group by state order by avgcitypop desc''').show()

+-----+--------+------------------+
|state|nocities|        avgcitypop|
+-----+--------+------------------+
|   DC|       2|          303450.0|
|   CA|    1072| 27756.42723880597|
|   FL|     463|27400.958963282937|
|   AZ|     178| 20591.16853932584|
|   RI|      52|19292.653846153848|
|   NV|      66|18209.590909090908|
|   HI|      70|15831.842857142858|
|   NJ|     490| 15775.89387755102|
|   MA|     405| 14855.37037037037|
|   CT|     224|         14674.625|
|   DE|      46| 14481.91304347826|
|   TX|    1233| 13775.02108678021|
|   NY|    1370|13131.680291970803|
|   OH|     854|12700.839578454332|
|   MD|     379|12615.775725593667|
|   WA|     397|12258.670025188916|
|   MI|     769|12087.512353706112|
|   GA|     561| 11547.62210338681|
|   SC|     313|11139.626198083068|
|   NC|     624|10622.815705128205|
+-----+--------+------------------+
only showing top 20 rows



In [66]:
# Largest and smallest city per state, as a four-stage aggregation pipeline:
#   1. $group by (state, city) and sum `pop` to get each city's population;
#   2. $sort by that population in ascending order;
#   3. $group by state, taking $first as the smallest city/pop and $last as the biggest
#      (this relies on the ordering produced by the preceding $sort);
#   4. $project to drop _id, expose it as `state`, and nest name/pop under
#      biggestCity and smallestCity.
lscityzip = '''[
   { $group:
      {
        _id: { state: "$state", city: "$city" },
        pop: { $sum: "$pop" }
      }
   },
   { $sort: { pop: 1 } },
   { $group:
      {
        _id : "$_id.state",
        biggestCity:  { $last: "$_id.city" },
        biggestPop:   { $last: "$pop" },
        smallestCity: { $first: "$_id.city" },
        smallestPop:  { $first: "$pop" }
      }
   },
  { $project:
    { _id: 0,
      state: "$_id",
      biggestCity:  { name: "$biggestCity",  pop: "$biggestPop" },
      smallestCity: { name: "$smallestCity", pop: "$smallestPop" }
    }
  }
]'''

In [67]:
# Run the largest/smallest-city pipeline against people.zipcodes and show the result.
ls_city_df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("pipeline", lscityzip)
    .option('spark.mongodb.input.uri', 'mongodb://localhost:27017')
    .option('database', 'people')
    .option('collection', 'zipcodes')
    .load()
)
ls_city_df.show()

+--------------------+--------------------+-----+
|         biggestCity|        smallestCity|state|
+--------------------+--------------------+-----+
|    [NEWARK, 111674]|       [BETHEL, 108]|   DE|
|   [JACKSON, 204788]|        [CHUNKY, 79]|   MS|
|  [CRANSTON, 176404]|     [CLAYVILLE, 45]|   RI|
|[SAINT LOUIS, 397...|      [BENDAVIS, 44]|   MO|
|     [MIAMI, 825232]|[CECIL FIELD NAS, 0]|   FL|
|[LITTLE ROCK, 192...|         [TOMATO, 0]|   AR|
|   [ATLANTA, 609591]|   [FORT STEWART, 0]|   GA|
| [BURLINGTON, 39127]|[UNIV OF VERMONT, 0]|   VT|
|[ALBUQUERQUE, 449...|       [MONUMENT, 0]|   NM|
|[PHILADELPHIA, 16...|       [HAMILTON, 0]|   PA|
|   [WICHITA, 295115]|         [ARNOLD, 0]|   KS|
|[MINNEAPOLIS, 344...|       [JOHNSON, 12]|   MN|
|[LOS ANGELES, 210...|   [OREGON HOUSE, 0]|   CA|
|   [BILLINGS, 78805]|      [HOMESTEAD, 7]|   MT|
|[BIRMINGHAM, 242606]|          [ALLEN, 0]|   AL|
|  [PORTLAND, 518543]|           [KENT, 0]|   OR|
| [BALTIMORE, 733081]|[ANNAPOLIS JUNCTI...|   MD|
