In [1]:
import pyspark
from pyspark import SparkContext, SQLContext
from pyspark.sql import SparkSession

In [2]:
# Reuse the active SparkContext if one exists instead of constructing a new
# one: a bare SparkContext() raises when a context is already running, which
# makes this cell fail on re-execution within the same kernel.
sc = SparkContext.getOrCreate()
# SQL entry point used by the query cells in this notebook.
sqlcontext = SQLContext(sc)

In [3]:
# Session handle used for reading the CSV files; the application name is 'abc'.
spark = (
    SparkSession.builder
    .appName('abc')
    .getOrCreate()
)

In [4]:
def _read_headed_csv(path):
    """Load ``path`` as a DataFrame, treating the first row as column names."""
    return spark.read.csv(path, header=True)


# One DataFrame per input file; paths are relative to the working directory.
aisles = _read_headed_csv("aisles.csv")
order_products_train = _read_headed_csv("order_products__train.csv")
order_products_prior = _read_headed_csv("order_products__prior.csv")
orders = _read_headed_csv("orders.csv")
products = _read_headed_csv("products.csv")
departments = _read_headed_csv("departments.csv")
sample_submission = _read_headed_csv("sample_submission.csv")

In [5]:
# Expose every loaded DataFrame to SQL under the view name the queries use.
for _view_name, _frame in (
    ('aisles', aisles),
    ('order_products__train', order_products_train),
    ('order_products__prior', order_products_prior),
    ('orders', orders),
    ('products', products),
    ('departments', departments),
    ('sample_submission', sample_submission),
):
    _frame.createOrReplaceTempView(_view_name)

In [6]:
# Ten-row preview DataFrames, one per registered view.
_run_sql = sqlcontext.sql

aisles_sql = _run_sql("Select * from aisles limit 10")
order_products_train_sql = _run_sql("Select * from order_products__train limit 10")
order_products_prior_sql = _run_sql("Select * from order_products__prior limit 10")
orders_sql = _run_sql("Select * from orders limit 10")
products_sql = _run_sql("Select * from products limit 10")
departments_sql = _run_sql("Select * from departments limit 10")
sample_submission_sql = _run_sql("Select * from sample_submission limit 10")

In [7]:
# Print the preview DataFrame built from the sample_submission view.
sample_submission_sql.show()

+--------+-----------+
|order_id|   products|
+--------+-----------+
|      17|39276 29259|
|      34|39276 29259|
|     137|39276 29259|
|     182|39276 29259|
|     257|39276 29259|
|     313|39276 29259|
|     353|39276 29259|
|     386|39276 29259|
|     414|39276 29259|
|     418|39276 29259|
+--------+-----------+



In [8]:
# Number of order_products__train rows per product, most frequent first.
# DataFrame.show() only prints and returns None, so the DataFrame is stored
# first and displayed in a separate call; previously merged_sql_4 was None.
merged_sql_4 = sqlcontext.sql(
    "SELECT COUNT(p.product_id) as counter,p.product_name,p.product_id "
    "from products as p "
    "JOIN order_products__train as t ON t.product_id = p.product_id "
    "group by p.product_id,p.product_name "
    "order by counter desc"
)
merged_sql_4.show()

+-------+--------------------+----------+
|counter|        product_name|product_id|
+-------+--------------------+----------+
|  18726|              Banana|     24852|
|  15480|Bag of Organic Ba...|     13176|
|  10894|Organic Strawberries|     21137|
|   9784|Organic Baby Spinach|     21903|
|   8135|         Large Lemon|     47626|
|   7409|     Organic Avocado|     47766|
|   7293|Organic Hass Avocado|     47209|
|   6494|        Strawberries|     16797|
|   6033|               Limes|     26209|
|   5546| Organic Raspberries|     27966|
|   4966| Organic Blueberries|     39275|
|   4908|  Organic Whole Milk|     27845|
|   4613|    Organic Cucumber|     30391|
|   4589|    Organic Zucchini|     45007|
|   4290|Organic Yellow Onion|     22935|
|   4158|      Organic Garlic|     24964|
|   4059| Seedless Red Grapes|      4920|
|   3868|           Asparagus|     46979|
|   3823|Organic Grape Tom...|     40706|
|   3818|   Organic Red Onion|      8518|
+-------+--------------------+----

In [20]:
# Number of order_products__prior rows per product, most frequent first.
# DataFrame.show() only prints and returns None, so the DataFrame is stored
# first and displayed in a separate call; previously merged_sql_4_prior was None.
merged_sql_4_prior = sqlcontext.sql(
    "SELECT COUNT(p.product_id) as counter,p.product_name,p.product_id as product_id "
    "from products as p "
    "JOIN order_products__prior as r on r.product_id = p.product_id "
    "group by p.product_id,p.product_name "
    "order by counter desc"
)
merged_sql_4_prior.show()

+-------+--------------------+----------+
|counter|        product_name|product_id|
+-------+--------------------+----------+
| 472565|              Banana|     24852|
| 379450|Bag of Organic Ba...|     13176|
| 264683|Organic Strawberries|     21137|
| 241921|Organic Baby Spinach|     21903|
| 213584|Organic Hass Avocado|     47209|
| 176815|     Organic Avocado|     47766|
| 152657|         Large Lemon|     47626|
| 142951|        Strawberries|     16797|
| 140627|               Limes|     26209|
| 137905|  Organic Whole Milk|     27845|
| 137057| Organic Raspberries|     27966|
| 113426|Organic Yellow Onion|     22935|
| 109778|      Organic Garlic|     24964|
| 104823|    Organic Zucchini|     45007|
| 100060| Organic Blueberries|     39275|
|  97315|      Cucumber Kirby|     49683|
|  89632|  Organic Fuji Apple|     28204|
|  87746|       Organic Lemon|      5876|
|  85020|Apple Honeycrisp ...|      8277|
|  84255|Organic Grape Tom...|     40706|
+-------+--------------------+----

In [25]:
# Per-product count of order_products__train rows whose reordered value is
# non-zero, largest count first.
# DataFrame.show() only prints and returns None, so the DataFrame is stored
# first and displayed in a separate call; previously merged_sql_5 was None.
merged_sql_5 = sqlcontext.sql(
    "SELECT p.product_name,count(p.product_id) as count,t.reordered "
    "from products as p "
    "JOIN order_products__train as t ON t.product_id = p.product_id "
    "where t.reordered != 0 "
    "group by p.product_id,p.product_name,t.reordered "
    "order by count desc"
)
merged_sql_5.show()

+--------------------+-----+---------+
|        product_name|count|reordered|
+--------------------+-----+---------+
|              Banana|16557|        1|
|Bag of Organic Ba...|13362|        1|
|Organic Strawberries| 8603|        1|
|Organic Baby Spinach| 8055|        1|
|     Organic Avocado| 6226|        1|
|Organic Hass Avocado| 6042|        1|
|         Large Lemon| 5923|        1|
|        Strawberries| 4786|        1|
| Organic Raspberries| 4279|        1|
|               Limes| 4234|        1|
|  Organic Whole Milk| 4174|        1|
| Organic Blueberries| 3384|        1|
|    Organic Zucchini| 3266|        1|
|Organic Yellow Onion| 3094|        1|
|      Organic Garlic| 3041|        1|
|    Organic Cucumber| 3007|        1|
| Seedless Red Grapes| 2869|        1|
|Organic Grape Tom...| 2727|        1|
|Sparkling Water G...| 2662|        1|
|    Honeycrisp Apple| 2655|        1|
+--------------------+-----+---------+
only showing top 20 rows



In [8]:
# Per-product count of order_products__prior rows whose reordered value is
# non-zero, largest count first.
# DataFrame.show() only prints and returns None, so the DataFrame is stored
# first and displayed in a separate call; previously merged_sql_5_prior was None.
merged_sql_5_prior = sqlcontext.sql(
    "SELECT p.product_name,count(p.product_id) as count,r.reordered "
    "from products as p "
    "JOIN order_products__prior as r on r.product_id = p.product_id "
    "where r.reordered != 0 "
    "group by p.product_id,p.product_name,r.reordered "
    "order by count desc"
)
merged_sql_5_prior.show()

+--------------------+------+---------+
|        product_name| count|reordered|
+--------------------+------+---------+
|              Banana|398609|        1|
|Bag of Organic Ba...|315913|        1|
|Organic Strawberries|205845|        1|
|Organic Baby Spinach|186884|        1|
|Organic Hass Avocado|170131|        1|
|     Organic Avocado|134044|        1|
|  Organic Whole Milk|114510|        1|
|         Large Lemon|106255|        1|
| Organic Raspberries|105409|        1|
|        Strawberries| 99802|        1|
|               Limes| 95768|        1|
|Organic Yellow Onion| 79072|        1|
|      Organic Garlic| 74663|        1|
|    Organic Zucchini| 72165|        1|
|      Cucumber Kirby| 67313|        1|
|  Organic Fuji Apple| 63811|        1|
| Organic Blueberries| 62922|        1|
|Apple Honeycrisp ...| 62510|        1|
|       Organic Lemon| 60536|        1|
| Organic Half & Half| 59672|        1|
+--------------------+------+---------+
only showing top 20 rows



In [17]:
# Products in order_products__train orders placed with order_hour_of_day
# between 6 and 11 inclusive.
# NOTE(review): the CSVs are read without schema inference, so the hour
# comparison presumably relies on Spark's implicit string-to-number cast - confirm.
# DataFrame.show() only prints and returns None, so the DataFrame is stored
# first and displayed in a separate call; previously merged_sql_9 was None.
merged_sql_9 = sqlcontext.sql(
    "SELECT o.order_id,o.order_hour_of_day,p.product_name,p.product_id "
    "from products as p "
    "JOIN order_products__train as t ON t.product_id = p.product_id "
    "JOIN orders o on o.order_id = t.order_id "
    "where o.order_hour_of_day >=6 and o.order_hour_of_day <= 11"
)
merged_sql_9.show()

+--------+-----------------+--------------------+----------+
|order_id|order_hour_of_day|        product_name|product_id|
+--------+-----------------+--------------------+----------+
| 1015263|               10|Bag of Organic Ba...|     13176|
| 1015263|               10|Organic Hass Avocado|     47209|
| 1015263|               10|Organic Whole Mil...|     38544|
| 1015263|               10|  Organic Whole Milk|     27845|
| 1015263|               10|  Organic Fuji Apple|     28204|
| 1015263|               10|Sweet Potato Littles|     27573|
| 1015263|               10|Freeze Dried Mang...|      7854|
| 1015263|               10|Gluten Free Spina...|     48182|
| 1015263|               10|Total 2% All Natu...|     30169|
| 1015263|               10|Hot Kid Toddler M...|     39442|
| 1015263|               10| Multi Grain Waffles|     42404|
| 1015263|               10|         Blueberries|      9076|
| 1015263|               10| Honey Bunny Grahams|     14867|
| 1015263|              

In [None]:
# Products in order_products__prior orders placed with order_hour_of_day
# between 6 and 11 inclusive.
# DataFrame.show() only prints and returns None, so the DataFrame is stored
# first and displayed in a separate call; previously merged_sql_9_prior was None.
merged_sql_9_prior = sqlcontext.sql(
    "SELECT p.product_name,p.product_id "
    "from products as p "
    "JOIN order_products__prior as r ON r.product_id = p.product_id "
    "JOIN orders o on o.order_id = r.order_id "
    "where o.order_hour_of_day >=6 and o.order_hour_of_day <= 11"
)
merged_sql_9_prior.show()