### SQL query with Spark

In [1]:
!pip install -q findspark
!pip install -q pyspark

[K     |████████████████████████████████| 281.4 MB 33 kB/s 
[K     |████████████████████████████████| 198 kB 52.8 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
import os
import sqlite3
import findspark
from pyspark.sql.functions import lit
from pyspark.sql.functions import col
from pyspark.sql.functions import date_format
from pyspark.sql.functions import to_date
from pyspark.sql.types import IntegerType
from pyspark.sql.types import StringType
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import DoubleType
from pyspark.sql.types import LongType
from pyspark.sql import SparkSession


In [3]:
# Let findspark set up the Spark environment before a session is requested.
# NOTE(review): pyspark is already imported in an earlier cell, so this call is
# presumably redundant for a pip-installed pyspark — confirm before removing.
findspark.init()

# Single-JVM local session; the UI port is set explicitly to 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("colab")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)

# Eager evaluation lets a DataFrame that ends a cell render as a table.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [4]:
# Mount Google Drive at /content/gdrive so the SQLite file used below can be
# opened through the local filesystem.
# NOTE(review): google.colab is presumably only importable inside Colab, which is
# why this import lives here rather than in the shared import cell — confirm.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
# Location of the Chinook SQLite database on the mounted Google Drive.
DB_PATH = '/content/gdrive/MyDrive/ColabDataset/chinook.db'

# sqlite3.connect() creates a new, empty database when the file does not exist,
# which would only surface later as confusing "no such table" errors.
# Fail early with a clear message instead.
if not os.path.isfile(DB_PATH):
    raise FileNotFoundError(f"SQLite database not found: {DB_PATH}")

conn = sqlite3.connect(DB_PATH)

In [6]:
# Echo the connection object as a quick check that the handle was created.
conn

<sqlite3.Connection at 0x7ff6dfb23030>

In [7]:
def select_all_tasks(conn, query, params=()):
    """
    Execute a SQL query and return every row it produces.

    Despite its name, this helper is not tied to any particular table: it runs
    whatever statement is passed in ``query``.

    :param conn: an open sqlite3 Connection object
    :param query: the SQL statement to execute
    :param params: optional values bound to the placeholders in ``query``
                   (defaults to no parameters); prefer this over building the
                   SQL string by hand when values come from outside
    :return: list with all result rows (empty list when nothing matches)
    """
    cur = conn.cursor()
    try:
        cur.execute(query, params)
        return cur.fetchall()
    finally:
        # Release the cursor even when execution or fetching raises.
        cur.close()

In [29]:
# Give artistDf every column that exists only in invoiceDf, filled with nulls,
# so that both frames end up with the same set of column names.
invoice_only_columns = [column for column in invoiceDf.columns
                        if column not in artistDf.columns]
for column in invoice_only_columns:
    # Cast the null literal to the source column's type; a bare lit(None)
    # produces an untyped (NullType) column.
    artistDf = artistDf.withColumn(
        column, lit(None).cast(invoiceDf.schema[column].dataType))

In [33]:
# Give invoiceDf every column that exists only in artistDf, filled with nulls,
# so that both frames end up with the same set of column names.
artist_only_columns = [column for column in artistDf.columns
                       if column not in invoiceDf.columns]
for column in artist_only_columns:
    # Cast the null literal to the source column's type; a bare lit(None)
    # produces an untyped (NullType) column.
    invoiceDf = invoiceDf.withColumn(
        column, lit(None).cast(artistDf.schema[column].dataType))

In [76]:
# Flatten customers -> invoices -> invoice items -> tracks (plus album, artist,
# playlist and media type) into one wide result set, ordered by invoice id.
# The SELECT list order must stay in sync with the column list used to build
# the Spark schema.
# NOTE(review): `genres` is joined but none of its columns are selected.
# NOTE(review): joining playlist_track presumably yields one row per
# (invoice item, playlist) pair — confirm this fan-out is intended before
# aggregating Total or Quantity.
query = """
SELECT c.FirstName,
       c.LastName,
       c.Company,
       c.SupportRepId,
       c.City,
       c.State,
       c.Country,
       c.PostalCode,
       i.InvoiceId,
       i.InvoiceDate,
       i.BillingAddress,
       i.BillingCity,
       i.BillingState,
       i.BillingCountry,
       i.BillingPostalCode,
       i.Total,
       t.UnitPrice,
       t.Quantity,
       r.Name,
       r.Composer,
       r.Milliseconds,
       r.Bytes,
       a.Title,
       s.Name AS ArtistName,
       p.Name AS Playlist,
       m.Name AS MediaType
FROM customers AS c
LEFT JOIN invoices AS i ON c.CustomerId = i.CustomerId
LEFT JOIN invoice_items AS t ON i.InvoiceId = t.InvoiceId
LEFT JOIN tracks AS r ON t.TrackId = r.TrackId
LEFT JOIN albums AS a ON a.AlbumId = r.AlbumId
LEFT JOIN artists AS s ON a.ArtistId = s.ArtistId
LEFT JOIN genres AS g ON g.GenreId = r.GenreId
LEFT JOIN playlist_track AS k ON k.TrackId = r.TrackId
LEFT JOIN playlists AS p ON p.PlaylistId = k.PlaylistId
LEFT JOIN media_types AS m ON m.MediaTypeId = r.MediaTypeId
ORDER BY i.InvoiceId ASC;
"""

In [77]:
# (column name, Spark type) pairs describing the result set of `query`.
# The order mirrors the SELECT list of the query one-to-one (26 columns); rows
# are matched to fields by position, so keep both lists in sync.
cols_name = [
              # customers (alias c)
              ('FirstName', StringType()),
              ('LastName', StringType()),
              ('Company', StringType()),
              ('SupportRepId', IntegerType()),
              ('City', StringType()),
              ('State', StringType()),
              ('Country', StringType()),
              ('PostalCode', StringType()),
              # invoices (alias i); InvoiceDate is kept as a string here
              ('InvoiceId', IntegerType()),
              ('InvoiceDate', StringType()),
              ('BillingAddress', StringType()),
              ('BillingCity', StringType()),
              ('BillingState', StringType()),
              ('BillingCountry', StringType()),
              ('BillingPostalCode', StringType()),
              ('Total', DoubleType()),
              # invoice_items (alias t)
              ('UnitPrice', DoubleType()),
              ('Quantity', IntegerType()),
              # tracks (alias r); 'Name' is the track name
              ('Name', StringType()),
              ('Composer', StringType()),
              ('Milliseconds', LongType()),
              ('Bytes', LongType()),
              # albums (alias a)
              ('Title', StringType()),
              # aliased Name columns from artists, playlists and media_types
              ('ArtistName', StringType()),
              ('Playlist', StringType()),
              ('MediaType', StringType()),
              ]

In [78]:
# Turn the (name, type) pairs into an explicit Spark schema; every field is
# declared nullable.
schema = StructType(
    [StructField(field_name, field_type, True)
     for field_name, field_type in cols_name]
)

In [79]:
# Run the join query on the SQLite connection; fetchall() inside the helper
# loads the complete result into memory as a Python list of rows.
customerBehavior = select_all_tasks(conn, query)

In [80]:
# Build a Spark DataFrame from the fetched rows, using the explicit schema for
# the column names and types.
customerBehaviorDf = spark.createDataFrame(customerBehavior, schema)

In [81]:
# Print the first record vertically (one field per line) without truncating
# values, which is easier to read than a 26-column table row.
customerBehaviorDf.show(1, truncate=False, vertical=True)

-RECORD 0-------------------------------------
 FirstName         | Leonie                   
 LastName          | Köhler                   
 Company           | null                     
 SupportRepId      | 5                        
 City              | Stuttgart                
 State             | null                     
 Country           | Germany                  
 PostalCode        | 70174                    
 InvoiceId         | 1                        
 InvoiceDate       | 2009-01-01 00:00:00      
 BillingAddress    | Theodor-Heuss-Straße 34  
 BillingCity       | Stuttgart                
 BillingState      | null                     
 BillingCountry    | Germany                  
 BillingPostalCode | 70174                    
 Total             | 1.98                     
 UnitPrice         | 0.99                     
 Quantity          | 1                        
 Name              | Balls to the Wall        
 Composer          | null                     
 Milliseconds

In [84]:
# Keep only the rows that have a composer and print a sample of them.
customerBehaviorDf.filter("Composer IS NOT NULL").show()

+---------+--------+-------+------------+---------+-----+-------+----------+---------+-------------------+--------------------+-----------+------------+--------------+-----------------+-----+---------+--------+--------------------+--------------------+------------+--------+--------------------+----------+-------------------+--------------------+
|FirstName|LastName|Company|SupportRepId|     City|State|Country|PostalCode|InvoiceId|        InvoiceDate|      BillingAddress|BillingCity|BillingState|BillingCountry|BillingPostalCode|Total|UnitPrice|Quantity|                Name|            Composer|Milliseconds|   Bytes|               Title|ArtistName|           Playlist|           MediaType|
+---------+--------+-------+------------+---------+-----+-------+----------+---------+-------------------+--------------------+-----------+------------+--------------+-----------------+-----+---------+--------+--------------------+--------------------+------------+--------+--------------------+---------

In [85]:
# Total number of rows in the joined DataFrame.
customerBehaviorDf.count()

5572