In [1]:
import pandas as pd

In [2]:
from sqlalchemy import create_engine
import sqlite3

In [3]:
# Create a SQLAlchemy engine for the SQLite file at the relative path
# data/chinook.db (relative to the kernel's current working directory).
engine = create_engine('sqlite:///data/chinook.db')

In [4]:
# Sanity check: display the class of the object returned by create_engine.
type(engine)

sqlalchemy.engine.base.Engine

### Getting the average track time for each genre

In [5]:
# Load the full `tracks` table into a DataFrame.
#
# pandas raises ValueError when the requested table cannot be found. A likely
# cause here is a wrong working directory: SQLite creates a new, empty database
# when the file named in the engine URL does not exist. Re-raise with that hint
# instead of printing the message and leaving `tracks` as None, which would only
# surface later as an unrelated AttributeError on `tracks.head()`.
try:
    tracks = pd.read_sql_table('tracks', con=engine)
except ValueError as e:
    raise ValueError(
        f"{e} - check that the database file in the engine URL exists "
        "relative to the current working directory"
    ) from e

In [6]:
# Preview the first four rows of the tracks table.
tracks.head(4)

Unnamed: 0,TrackId,Name,AlbumId,MediaTypeId,GenreId,Composer,Milliseconds,Bytes,UnitPrice
0,1,For Those About To Rock (We Salute You),1,1,1,"Angus Young, Malcolm Young, Brian Johnson",343719,11170334,0.99
1,2,Balls to the Wall,2,2,1,,342562,5510424,0.99
2,3,Fast As a Shark,3,2,1,"F. Baltes, S. Kaufman, U. Dirkscneider & W. Ho...",230619,3990994,0.99
3,4,Restless and Wild,3,2,1,"F. Baltes, R.A. Smith-Diesel, S. Kaufman, U. D...",252051,4331779,0.99


In [7]:
# Load the genres lookup table, then preview its first rows.
genres = pd.read_sql_table(table_name='genres', con=engine)
genres.head()

Unnamed: 0,GenreId,Name
0,1,Rock
1,2,Jazz
2,3,Metal
3,4,Alternative & Punk
4,5,Rock And Roll


In [8]:
# Attach every track's length to its genre. The left join keeps genres that
# have no tracks; the join key is dropped afterwards because only the genre
# name and the length are used downstream.
genre_track = (
    genres
    .merge(tracks[['GenreId', 'Milliseconds']], how='left', on='GenreId')
    .drop(columns='GenreId')
)

In [9]:
# Preview the first ten rows of the merged genre/track frame.
genre_track.head(10)

Unnamed: 0,Name,Milliseconds
0,Rock,343719
1,Rock,342562
2,Rock,230619
3,Rock,252051
4,Rock,375418
5,Rock,205662
6,Rock,233926
7,Rock,210834
8,Rock,203102
9,Rock,263497


In [10]:
# Mean of the Milliseconds column for each genre name.
per_genre_time = (
    genre_track
    .groupby('Name')['Milliseconds']
    .mean()
)

In [11]:
# Show the per-genre means as timedeltas, truncated to whole seconds and
# ordered from shortest to longest.
(
    pd.to_timedelta(per_genre_time, unit='ms')
    .dt.floor('s')
    .sort_values()
)

Name
Rock And Roll        0 days 00:02:14
Opera                0 days 00:02:54
Hip Hop/Rap          0 days 00:02:58
Easy Listening       0 days 00:03:09
Bossa Nova           0 days 00:03:39
R&B/Soul             0 days 00:03:40
World                0 days 00:03:44
Pop                  0 days 00:03:49
Latin                0 days 00:03:52
Alternative & Punk   0 days 00:03:54
Soundtrack           0 days 00:04:04
Reggae               0 days 00:04:07
Alternative          0 days 00:04:24
Blues                0 days 00:04:30
Rock                 0 days 00:04:43
Jazz                 0 days 00:04:51
Classical            0 days 00:04:53
Heavy Metal          0 days 00:04:57
Electronica/Dance    0 days 00:05:02
Metal                0 days 00:05:09
Comedy               0 days 00:26:25
TV Shows             0 days 00:35:45
Drama                0 days 00:42:55
Science Fiction      0 days 00:43:45
Sci Fi & Fantasy     0 days 00:48:31
Name: Milliseconds, dtype: timedelta64[ns]

### Total amount spent by each customer

#### Load the tables with only the required columns

In [12]:
# Customers: read only the id and the two name columns.
cust = pd.read_sql_table(
    table_name='customers',
    con=engine,
    columns=['CustomerId', 'FirstName', 'LastName'],
)

In [13]:
# Invoices: read only the invoice id and the customer id.
invoice = pd.read_sql_table(
    table_name='invoices',
    con=engine,
    columns=['InvoiceId', 'CustomerId'],
)

In [14]:
# Invoice line items: read only the invoice id, unit price and quantity.
ii = pd.read_sql_table(
    table_name='invoice_items',
    con=engine,
    columns=['InvoiceId', 'UnitPrice', 'Quantity'],
)

#### Merge on the relationship keys (`CustomerId`, then `InvoiceId`)

In [15]:
# Join customers to their invoices, then invoices to their line items
# (merge defaults to an inner join), giving one row per line item.
cust_inv = (
    cust
    .merge(invoice, on='CustomerId')
    .merge(ii, on='InvoiceId')
)

In [16]:
# Preview the first ten rows of the customer/invoice/line-item join.
cust_inv.head(10)

Unnamed: 0,CustomerId,FirstName,LastName,InvoiceId,UnitPrice,Quantity
0,1,Luís,Gonçalves,98,1.99,1
1,1,Luís,Gonçalves,98,1.99,1
2,1,Luís,Gonçalves,121,0.99,1
3,1,Luís,Gonçalves,121,0.99,1
4,1,Luís,Gonçalves,121,0.99,1
5,1,Luís,Gonçalves,121,0.99,1
6,1,Luís,Gonçalves,143,0.99,1
7,1,Luís,Gonçalves,143,0.99,1
8,1,Luís,Gonçalves,143,0.99,1
9,1,Luís,Gonçalves,143,0.99,1


In [17]:
# Cost of each line item: unit price times quantity, computed row by row.
total = cust_inv['UnitPrice'] * cust_inv['Quantity']

In [18]:
# Preview the first three per-line totals.
total.head(3)

0    1.99
1    1.99
2    0.99
dtype: float64

In [19]:
# Columns that identify a customer; used as the grouping keys below.
cols = ['CustomerId', 'FirstName', 'LastName']

In [20]:
# Add the per-line totals as a `Total` column (assign returns a new frame).
cust_inv = cust_inv.assign(Total=total)

In [21]:
# Preview the first four rows of the frame.
cust_inv.head(4)

Unnamed: 0,CustomerId,FirstName,LastName,InvoiceId,UnitPrice,Quantity,Total
0,1,Luís,Gonçalves,98,1.99,1,1.99
1,1,Luís,Gonçalves,98,1.99,1,1.99
2,1,Luís,Gonçalves,121,0.99,1,0.99
3,1,Luís,Gonçalves,121,0.99,1,0.99


#### Using `groupby`, get the total cost for each customer

In [22]:
# Sum the line totals per customer and order the result from the largest
# total to the smallest.
final_res = (
    cust_inv
    .groupby(cols)['Total']
    .sum()
    .sort_values(ascending=False)
)

In [23]:
# Show the first ten entries of the per-customer totals.
final_res.head(10)

CustomerId  FirstName  LastName  
6           Helena     Holý          49.62
26          Richard    Cunningham    47.62
57          Luis       Rojas         46.62
45          Ladislav   Kovács        45.62
46          Hugh       O'Reilly      45.62
37          Fynn       Zimmermann    43.62
24          Frank      Ralston       43.62
28          Julia      Barnett       43.62
25          Victor     Stevens       42.62
7           Astrid     Gruber        42.62
Name: Total, dtype: float64

### Using a SQL query to load a DataFrame

In [24]:
# Raw SQL text: select every column and row from the tracks table.
query_str = 'select * from tracks'

In [25]:
# Run the SQL text against the engine and collect the result as a DataFrame.
tracks_df = pd.read_sql_query(sql=query_str, con=engine)

In [26]:
# Preview the first two rows returned by the query.
tracks_df.head(2)

Unnamed: 0,TrackId,Name,AlbumId,MediaTypeId,GenreId,Composer,Milliseconds,Bytes,UnitPrice
0,1,For Those About To Rock (We Salute You),1,1,1,"Angus Young, Malcolm Young, Brian Johnson",343719,11170334,0.99
1,2,Balls to the Wall,2,2,1,,342562,5510424,0.99
