In [None]:
#from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [None]:
my_spark = SparkSession \
    .builder \
    .appName("Python Spark SQL example") \
    .enableHiveSupport() \
    .getOrCreate()

In [None]:
data = [
    ("2015-05-14 03:53:00", "WARRANT ARREST"),
    ("2015-05-14 03:53:00", "TRAFFIC VIOLATION"),
    ("2015-05-14 03:33:00", "TRAFFIC VIOLATION")
]

df1 = my_spark.createDataFrame(data, ["Dates", "Description"])
df1.show()

+-------------------+-----------------+
|              Dates|      Description|
+-------------------+-----------------+
|2015-05-14 03:53:00|   WARRANT ARREST|
|2015-05-14 03:53:00|TRAFFIC VIOLATION|
|2015-05-14 03:33:00|TRAFFIC VIOLATION|
+-------------------+-----------------+



In [None]:
df_2 = df1.withColumn('wordCount', f.size(f.split(f.col('Description'), ' ')))
df_2.show()

+-------------------+-----------------+---------+
|              Dates|      Description|wordCount|
+-------------------+-----------------+---------+
|2015-05-14 03:53:00|   WARRANT ARREST|        2|
|2015-05-14 03:53:00|TRAFFIC VIOLATION|        2|
|2015-05-14 03:33:00|TRAFFIC VIOLATION|        2|
+-------------------+-----------------+---------+



In [None]:
df_2.show()
df_3 = df_2.withColumn('words',f.explode(f.split(f.col('Description'),' '))) 

+-------------------+-----------------+---------+
|              Dates|      Description|wordCount|
+-------------------+-----------------+---------+
|2015-05-14 03:53:00|   WARRANT ARREST|        2|
|2015-05-14 03:53:00|TRAFFIC VIOLATION|        2|
|2015-05-14 03:33:00|TRAFFIC VIOLATION|        2|
+-------------------+-----------------+---------+

+---------+
|    words|
+---------+
|  WARRANT|
|   ARREST|
|  TRAFFIC|
|VIOLATION|
|  TRAFFIC|
|VIOLATION|
+---------+



In [None]:
df_3.select('words') \
    .groupBy(f.col('words')) \
    .agg(
       f.count(f.lit(1)).alias('WordCount')
  ).show()

+---------+---------+
|    words|WordCount|
+---------+---------+
|  WARRANT|        1|
|   ARREST|        1|
|  TRAFFIC|        2|
|VIOLATION|        2|
+---------+---------+



In [None]:
sc = spark.sparkContext

data = [
    (2, 4),
    (3, 9),
    (4, 16)
]

squaresDF = my_spark.createDataFrame(data, ["Number", "Sqr"])
                                 
squaresDF.show()

+------+---+
|Number|Sqr|
+------+---+
|     2|  4|
|     3|  9|
|     4| 16|
+------+---+



In [None]:
squaresDF.write.parquet("dbfs:/FileStore/shared_uploads/forgcpmak@gmail.com/data/test_table/key=1")


In [None]:
data2 = [
    (2, 8),
    (3, 27),
    (4, 64)
]

cubesDF = my_spark.createDataFrame(data2, ["Number", "Cub"])
cubesDF.show()

+------+---+
|Number|Cub|
+------+---+
|     2|  8|
|     3| 27|
|     4| 64|
+------+---+



In [None]:
cubesDF.write.parquet("dbfs:/FileStore/shared_uploads/forgcpmak@gmail.com/data/test_table/key=2")


In [None]:
# Read the partitioned table
mergedDF = spark.read.option("mergeSchema", "true").parquet("/FileStore/shared_uploads/forgcpmak@gmail.com/data/test_table")
mergedDF.printSchema()

root
 |-- Number: long (nullable = true)
 |-- Sqr: long (nullable = true)
 |-- Cub: long (nullable = true)
 |-- key: integer (nullable = true)



In [None]:
mergedDF.show()

+------+----+----+---+
|Number| Sqr| Cub|key|
+------+----+----+---+
|     2|null|   8|  2|
|     3|null|  27|  2|
|     4|null|  64|  2|
|     2|   4|null|  1|
|     3|   9|null|  1|
|     4|  16|null|  1|
+------+----+----+---+



In [None]:
file_location = "dbfs:/FileStore/shared_uploads/forgcpmak@gmail.com/sales_info.csv"
file_type = "csv"

# CSV options
infer_schema = "True"
first_row_is_header = "True"
delimiter = ","

sales_df = spark.read \
        .format(file_type) \
        .option("inferSchema", infer_schema) \
        .option("header", first_row_is_header) \
        .option("sep", delimiter) \
        .load(file_location)

sales_df.printSchema()
sales_df.show()

root
 |-- Company: string (nullable = true)
 |-- Person: string (nullable = true)
 |-- Sales: double (nullable = true)

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+



In [None]:
sales_df\
     .coalesce(1) \
     .write\
     .mode("overwrite")\
     .save("dbfs:/FileStore/shared_uploads/forgcpmak@gmail.com/sales_info.parquet",format="parquet")



In [None]:
sales_df.schema.jsonValue()

Out[44]: {'type': 'struct',
 'fields': [{'name': 'Company',
   'type': 'string',
   'nullable': True,
   'metadata': {}},
  {'name': 'Person', 'type': 'string', 'nullable': True, 'metadata': {}},
  {'name': 'Sales', 'type': 'double', 'nullable': True, 'metadata': {}}]}

In [None]:
file_location = "dbfs:/FileStore/shared_uploads/forgcpmak@gmail.com/sales_info.parquet"
file_type = "parquet"

# CSV options
infer_schema = "True"
first_row_is_header = "True"
#delimiter = ","

sales_df_pq = spark.read \
        .format(file_type) \
        .option("inferSchema", infer_schema) \
        .option("header", first_row_is_header) \
        .load(file_location)

sales_df_pq.printSchema()
sales_df_pq.show()

root
 |-- Company: string (nullable = true)
 |-- Person: string (nullable = true)
 |-- Sales: double (nullable = true)

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+



In [None]:
sales_df_pq.printSchema()
sales_df_pq.show()

root
 |-- Company: string (nullable = true)
 |-- Person: string (nullable = true)
 |-- Sales: double (nullable = true)

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+



In [None]:
stack_overflow_2016_survey_df = spark.read\
                           .format("csv") \
                           .option("sep",",") \
                           .option("header", "true") \
                           .option("inferSchema","true") \
                           .load("dbfs:/FileStore/shared_uploads/forgcpmak@gmail.com/2016_stack_overflow_survey_responses.csv")



In [None]:
stack_overflow_2016_survey_df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- collector: string (nullable = true)
 |-- country: string (nullable = true)
 |-- un_subregion: string (nullable = true)
 |-- so_region: string (nullable = true)
 |-- age_range: string (nullable = true)
 |-- age_midpoint: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- self_identification: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- occupation_group: string (nullable = true)
 |-- experience_range: string (nullable = true)
 |-- experience_midpoint: double (nullable = true)
 |-- salary_range: string (nullable = true)
 |-- salary_midpoint: double (nullable = true)
 |-- big_mac_index: double (nullable = true)
 |-- tech_do: string (nullable = true)
 |-- tech_want: string (nullable = true)
 |-- aliens: string (nullable = true)
 |-- programming_ability: double (nullable = true)
 |-- employment_status: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- company_size_range: string (null

In [None]:
uk_post_code_df = spark.read\
                           .format("csv") \
                           .option("sep",",") \
                           .option("header", "true") \
                           .load("dbfs:/FileStore/shared_uploads/forgcpmak@gmail.com/uk_postcode.csv")

In [None]:
uk_post_code_df.show()

+--------+--------+---------+-------+--------+--------+--------------------+-------------+---------+----------------+----------+----------+
|Postcode|Latitude|Longitude|Easting|Northing| GridRef|           Town/Area|       Region|Postcodes|Active postcodes|Population|Households|
+--------+--------+---------+-------+--------+--------+--------------------+-------------+---------+----------------+----------+----------+
|     AB1| 57.1269| -2.13644| 391839|  804005|NJ918040|            Aberdeen|     Aberdeen|     2655|               0|      null|      null|
|     AB2| 57.1713| -2.14152| 391541|  808948|NJ915089|            Aberdeen|     Aberdeen|     3070|               0|      null|      null|
|     AB3| 57.0876| -2.59624| 363963|  799780|NO639997|            Aberdeen|     Aberdeen|     2168|               0|      null|      null|
|     AB4| 57.5343| -2.12713| 392487|  849358|NJ924493|Fraserburgh, Pete...|     Aberdeen|     2956|               0|      null|      null|
|     AB5| 57.4652| 

In [None]:
df_nasa_19950801 = spark.read.format("csv").option("delimiter", "\t").option("header", "true").load("dbfs:/FileStore/shared_uploads/forgcpmak@gmail.com/nasa_19950801.tsv")

df2_nasa_19950701 = spark.read.format("csv").option("delimiter", "\t").option("header", "true").load("dbfs:/FileStore/shared_uploads/forgcpmak@gmail.com/nasa_19950701.tsv")

In [None]:
df_nasa_19950801.count()

Out[4]: 9999

In [None]:
df_nasa_19950801.show(20, truncate = False)

+---------------------------+-------+---------+------+---------------------------------------------------+--------+-----+
|host                       |logname|time     |method|url                                                |response|bytes|
+---------------------------+-------+---------+------+---------------------------------------------------+--------+-----+
|in24.inetnebr.com          |-      |807249601|GET   |/shuttle/missions/sts-68/news/sts-68-mcc-05.txt    |200     |1839 |
|uplherc.upl.com            |-      |807249607|GET   |/                                                  |304     |0    |
|uplherc.upl.com            |-      |807249608|GET   |/images/ksclogo-medium.gif                         |304     |0    |
|uplherc.upl.com            |-      |807249608|GET   |/images/MOSAIC-logosmall.gif                       |304     |0    |
|uplherc.upl.com            |-      |807249608|GET   |/images/USA-logosmall.gif                          |304     |0    |
|ix-esc-ca2-07.ix.netcom

In [None]:
df_nasa_19950801.\
   select ('host','bytes')\
   .groupBy('host') \
   .agg(
      sum(col('bytes')).alias('sumofBytes')
      ) \
   .orderBy(col('sumofBytes').desc()) \
   .show(truncate = False)
   

+--------------------------+-----------+
|host                      |sumofBytes |
+--------------------------+-----------+
|www-relay.pa-x.dec.com    |1.1675006E7|
|www.thyssen.com           |4056290.0  |
|130.110.74.81             |3286411.0  |
|ircgate1.rcc-irc.si       |3181377.0  |
|piweba3y.prodigy.com      |2852560.0  |
|seigate.sumiden.co.jp     |2771498.0  |
|pc121102.shef.ac.uk       |2602173.0  |
|uplherc.upl.com           |2540660.0  |
|box.dcs.warwick.ac.uk     |2346191.0  |
|www-c1.proxy.aol.com      |2198197.0  |
|193.246.121.210           |2163688.0  |
|haraway.ucet.ufl.edu      |2146536.0  |
|s150.phxslip4.indirect.com|2067283.0  |
|bettong.client.uq.oz.au   |1936436.0  |
|adam.tower.com.au         |1864419.0  |
|139.137.217.23            |1828056.0  |
|torben.dou.dk             |1706824.0  |
|ncg-72.axionet.com        |1596447.0  |
|ppp3.mtx.net.au           |1501018.0  |
|ccn.cs.dal.ca             |1492908.0  |
+--------------------------+-----------+
only showing top

In [None]:
df_nasa_19950801.\
   select ('host','response')\
   .filter(df_nasa_19950801['response'].cast('int') != 200 ) \
   .groupBy('host') \
   .agg(
      count(col('response')).alias('countofNonSuccess')
      ) \
   .orderBy(col('countofNonSuccess').desc()) \
   .show(truncate = False)

+--------------------------+-----------------+
|host                      |countofNonSuccess|
+--------------------------+-----------------+
|ts8-1.westwood.ts.ucla.edu|37               |
|slmel1p63.ozemail.com.au  |35               |
|mfm-stich4pc.amc.uva.nl   |25               |
|dialup21.brussels.eunet.be|24               |
|ix-pl1-08.ix.netcom.com   |20               |
|gert.tbit.dk              |15               |
|mage.ho.bom.gov.au        |14               |
|ssc24.iscs.nus.sg         |14               |
|funny.dcs.warwick.ac.uk   |14               |
|ts01-ind-21.iquest.net    |13               |
|rpgopher.aist.go.jp       |13               |
|slip167.slip.uleth.ca     |13               |
|bora.dacom.co.kr          |13               |
|163.205.156.16            |12               |
|ottgate2.bnr.ca           |12               |
|nts137.dialup.hawaii.edu  |11               |
|h96-158.ccnet.com         |11               |
|nit1.mains.nitech.ac.jp   |10               |
|133.68.18.18

In [None]:
df_real_estate = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/forgcpmak@gmail.com/RealEstate.csv")

In [None]:
df_real_estate \
   .withColumn('PriceInt',col("Price").cast('double')) \
   .orderBy(col('PriceInt').desc())\
   .show()

+------+----------------+----------+--------+---------+----+-----------+-----------+---------+
|   MLS|        Location|     Price|Bedrooms|Bathrooms|Size|Price SQ Ft|     Status| PriceInt|
+------+----------------+----------+--------+---------+----+-----------+-----------+---------+
|154526|   Arroyo Grande|5499000.00|       4|        5|5060|    1086.76|    Regular|5499000.0|
|154491|         Cambria|2995000.00|       5|        4|3684|     812.98|    Regular|2995000.0|
|154463| San Luis Obispo|2369000.00|       5|        6|4174|     567.56|    Regular|2369000.0|
|154434|         Cambria|2000000.00|       4|        4|3576|     559.28|    Regular|2000000.0|
|152768|     Avila Beach|1999000.00|       4|        5|5307|     376.67| Short Sale|1999000.0|
|150439|   Arroyo Grande|1900000.00|       4|        5|5411|     351.14| Short Sale|1900000.0|
|151419|     Pismo Beach|1799000.00|       4|        4|3609|     498.48|Foreclosure|1799000.0|
|150949|          Nipomo|1700000.00|       3|     

In [None]:
df_real_estate \
   .select('Location') \
   .distinct() \
   .orderBy('Location') \
   .show()

+-------------------+
|           Location|
+-------------------+
|      Arroyo Grande|
|         Atascadero|
|            Bradley|
|            Cambria|
|            Cayucos|
|            Creston|
|       Grover Beach|
|             Lompoc|
|           Los Osos|
|          Morro Bay|
|             Nipomo|
|             Oceano|
|        Out Of Area|
|        Paso Robles|
|        Pismo Beach|
|    San Luis Obispo|
|         San Miguel|
| Santa Maria-Orcutt|
|            Solvang|
|          Templeton|
+-------------------+
only showing top 20 rows



In [None]:
df_real_estate_intprice = df_real_estate \
    .withColumn('PriceInt',col("Price").cast('double')) \
    .withColumnRenamed('Price SQ Ft','PricePerSqFt') \
    .withColumn('RatePerSqFtDouble',col('PricePerSqFt').cast('double')) \
    .drop('Price','PricePerSqFt')

In [None]:
df_real_estate_intprice.printSchema()

root
 |-- MLS: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Bedrooms: string (nullable = true)
 |-- Bathrooms: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- PriceInt: double (nullable = true)
 |-- RatePerSqFtDouble: double (nullable = true)



In [None]:
#Step #1 - Import the Window 
from pyspark.sql.window import Window
#Stpe #2 - Import the rank function
from pyspark.sql.functions import rank

windowLocationPriceSpec  = Window \
                .partitionBy(col('Location'))\
                .orderBy(col('PriceInt').desc())

df_real_estate_intprice \
   .withColumn('rank',rank().over(windowLocationPriceSpec)) \
   .filter(col('rank') == 1) \
   .orderBy(col('PriceInt').desc()) \
   .select('Location','PriceInt') \
   .show()

+----------------+---------+
|        Location| PriceInt|
+----------------+---------+
|   Arroyo Grande|5499000.0|
|         Cambria|2995000.0|
| San Luis Obispo|2369000.0|
|     Avila Beach|1999000.0|
|   Arroyo Grande|1900000.0|
|     Pismo Beach|1799000.0|
|          Nipomo|1700000.0|
|         Bradley|1600000.0|
|         Cayucos|1500000.0|
|       Templeton|1399000.0|
|      Santa Ynez|1395000.0|
|        Los Osos|1350000.0|
|          Oceano|1250000.0|
|        Los Osos|1249000.0|
|          Oceano|1195000.0|
|     Out Of Area|1195000.0|
|       Morro Bay|1100000.0|
|          Nipomo|1065000.0|
|    Grover Beach| 999000.0|
|      Atascadero| 995000.0|
+----------------+---------+
only showing top 20 rows



In [None]:
windowLocationPricePersqFtSpec  = Window \
                .partitionBy(col('Location'))\
                .orderBy(col('RatePerSqFtDouble').desc())

df_real_estate_intprice \
   .withColumn('rank',rank().over(windowLocationPricePersqFtSpec)) \
   .filter(col('rank') == 1) \
   .orderBy(col('RatePerSqFtDouble').desc()) \
   .select('Location','RatePerSqFtDouble') \
   .show()

+----------------+-----------------+
|        Location|RatePerSqFtDouble|
+----------------+-----------------+
|          Oceano|          1144.64|
|   Arroyo Grande|          1086.76|
|     Pismo Beach|            819.4|
|         Cambria|           812.98|
|     Avila Beach|           686.02|
|         Cambria|           680.58|
|     Pismo Beach|           660.95|
|         Bradley|           606.06|
| San Luis Obispo|           567.56|
|      Atascadero|            562.5|
|        Los Osos|           539.57|
|      Santa Ynez|           518.01|
|       Morro Bay|           499.35|
|         Cayucos|           483.65|
|          Oceano|           483.37|
|    Grover Beach|            468.2|
| San Luis Obispo|           461.36|
|       Morro Bay|            456.9|
|     Paso Robles|           439.67|
|          Nipomo|           411.18|
+----------------+-----------------+
only showing top 20 rows

