In [4]:
from sqlalchemy import create_engine
import pandas as pd
%matplotlib inline

In [23]:
#Define the connection string
cnxn_string =("postgresql+psycopg2://{username}:{pswd}"
              "@{host}:{port}/{database}")
#print(cnxn_string)

In [18]:
#Create DB engine
engine = create_engine(cnxn_string.format(
    username="postgres",
    pswd="**********",
    host="localhost",
    port=5432,
    database="sqlda"))

## This is where i'll start to query the database using postgresql, python's sqlalchemy engine object and pandas dataframe.
## Each case will be explained in the comments.


In [28]:
#Using joins to Analyze Sales Dealership:
#The head of sales at the company would like a list of all customers who bought a car.
#I'll create a query that will return all customer IDs, first names, last names and valid phone numbers of customers who purchased a car.

query="""SELECT c.customer_id, c.first_name, c.last_name, c.phone
FROM sales s 
INNER JOIN customers c ON s.customer_id=c.customer_id
INNER JOIN products p ON s.product_id=p.product_id
WHERE p.product_type='automobile' AND c.phone IS NOT NULL"""

query_2="""SELECT first_name, last_name, COALESCE(phone, 'NO PHONE') AS phone
            FROM customers
            ORDER BY 1;"""



In [27]:
result_1 = pd.read_sql_query(query,engine)
result_1.head()

Unnamed: 0,customer_id,first_name,last_name,phone
0,35824,Wyatan,Dickie,405-786-0858
1,13206,Stace,Tuison,810-769-8255
2,2958,Kirstyn,Draysay,208-534-6858
3,32636,Kile,Fishlee,937-207-1484
4,26730,Raina,Titterell,304-871-4445


In [29]:
result_2 = pd.read_sql_query(query_2,engine)
result_2.head()


Unnamed: 0,first_name,last_name,phone
0,Aaren,Whelpdale,607-761-2568
1,Aaren,Norrey,NO PHONE
2,Aaren,Sadat,504-559-3464
3,Aaren,Deeman,NO PHONE
4,Aaren,Lamlin,414-937-4628


In [30]:
#Generating an Elite Customer Party Guest List using UNION:
#A query that will make a list of the company's customers and company employees
#Who live in Los Angeles, CA. The guest list should contain the first and last name, and whether the guest is a customer or an employee.

query="""(
            SELECT first_name, last_name, 'Customer' AS guest_type
            FROM customers
            WHERE city='Los Angeles' AND state='CA'
            )
            UNION
            (
            SELECT first_name, last_name, 'Employee' as guest_type
            FROM salespeople s
            INNER JOIN dealerships d ON s.dealership_id=d.dealership_id
            WHERE d.city ='Los Angeles' AND d.state = 'CA'
            )"""


In [31]:
result = pd.read_sql_query(query,engine)
result.head()

Unnamed: 0,first_name,last_name,guest_type
0,Euell,MacWhirter,Customer
1,Martainn,Tordoff,Customer
2,Truman,Cutmore,Customer
3,Asher,Drogan,Customer
4,Kelley,Christley,Customer


In [32]:
#Using the CASE WHEN Function to Get Regional Lists:
#The aim is to create a query that will map various values in a column to other values.

query = """SELECT c.customer_id,
            CASE WHEN c.state IN ('MA','NH','VT','ME','CT','RI') THEN 'New England'
            WHEN c.state IN ('GA','FL','MS','AL','LA','KY','VA','NC','SC','TN','VI','WV','AR') THEN 'Southeast'
            ELSE 'Other' END AS region
            FROM customers c
            ORDER BY 1;"""


In [33]:
result = pd.read_sql_query(query,engine)
result.head()

Unnamed: 0,customer_id,region
0,1,Other
1,2,Other
2,3,Southeast
3,4,Southeast
4,5,Southeast


In [34]:
#Building a Sales Model Using SQL Techniques:
#Clean and prepare the data for analysis using SQL techniques.
#The data science team wants to build a new model to help predict which customers are the best prospects for remarketing.
#A new data scientist has joined their team and does not know the database well enough to pull a dataset for this new model.
#The responsibility has fallen to the data analyst to help the data scientist prepare and build a dataset to be used to train a model.

query = """ 
            SELECT c.*,
                   p.*,
                   COALESCE(s.dealership_id,-1) as dealership_id,
                   CASE WHEN (p.base_msrp - s.sales_amount > 500) THEN 1
                   ELSE 0 END AS high_savings
            FROM sales s 
            INNER JOIN customers c ON s.customer_id = c.customer_id
            INNER JOIN products p ON p.product_id = s.product_id
            LEFT JOIN dealerships d ON s.dealership_id = d.dealership_id;
"""



In [35]:
result = pd.read_sql_query(query,engine)
result.head()

Unnamed: 0,customer_id,title,first_name,last_name,suffix,email,gender,ip_address,phone,street_address,...,date_added,product_id,model,year,product_type,base_msrp,production_start_date,production_end_date,dealership_id,high_savings
0,1,,Arlena,Riveles,,ariveles0@stumbleupon.com,F,98.36.172.246,,,...,2017-04-23,7,Bat,2016,scooter,599.99,2016-10-10,NaT,-1.0,0
1,4,,Jessika,Nussen,,jnussen3@salon.com,F,159.165.138.166,615-824-2506,224 Village Circle,...,2017-09-03,12,Lemon Zester,2019,scooter,349.99,2019-02-04,NaT,11.0,0
2,5,,Lonnie,Rembaud,,lrembaud4@discovery.com,F,18.131.58.65,786-499-3431,38 Lindbergh Way,...,2014-03-06,3,Lemon,2013,scooter,499.99,2013-05-01,2018-12-28,-1.0,0
3,6,,Cortie,Locksley,,clocksley5@weather.com,M,140.194.59.82,,6537 Delladonna Drive,...,2013-03-31,3,Lemon,2013,scooter,499.99,2013-05-01,2018-12-28,-1.0,0
4,7,,Wood,Kennham,,wkennham6@sohu.com,M,191.190.135.172,407-552-6486,001 Onsgard Park,...,2011-08-25,6,Model Sigma,2015,automobile,65500.0,2015-04-15,2018-10-01,-1.0,0


In [36]:
#Using Aggregate Functions to Analyze Data:
#Analyze the price of a products using different aggregate functions. 
query = """
            SELECT product_type, MIN(base_msrp), MAX(base_msrp), AVG(base_msrp), STDDEV(base_msrp)
            FROM products
            GROUP BY product_type;
"""

In [37]:
result = pd.read_sql_query(query,engine)
result.head()

Unnamed: 0,product_type,min,max,avg,stddev
0,automobile,35000.0,115000.0,79250.0,30477.450681
1,scooter,349.99,799.99,578.561429,167.971086


In [38]:
#Calculate the Total Number of Customers + Gender in State Using GROUPING SETS:

query= """ 
            SELECT state, gender, COUNT(*)
            FROM customers
            GROUP BY GROUPING SETS (
                (state),
                (gender),
                (state,gender)
            )
            ORDER BY 1,2;
"""


In [39]:
result = pd.read_sql_query(query,engine)
result.head()

Unnamed: 0,state,gender,count
0,AK,F,101
1,AK,M,87
2,AK,,188
3,AL,F,433
4,AL,M,489


In [40]:
#Calculate the Median Price of the Products Table:
query = """
            SELECT percentile_cont(0.5) WITHIN GROUP (ORDER BY base_msrp) AS median
            FROM products;
"""

In [41]:
result = pd.read_sql_query(query,engine)
result.head()

Unnamed: 0,median
0,749.99


In [42]:
#Calculating and Displaying Data Using the HAVING Clause:
query = """
            SELECT state, COUNT(*)
            FROM customers
            GROUP BY state
            HAVING COUNT(*)>=1000
            ORDER BY state;
"""

In [43]:
result = pd.read_sql_query(query,engine)
result.head()

Unnamed: 0,state,count
0,CA,5038
1,CO,1042
2,DC,1447
3,FL,3748
4,GA,1251


In [44]:
#Check How Much Missing Data is in Customers Table:
query = """
            SELECT SUM(CASE WHEN state IS NOT NULL OR state IN ('') THEN 1 ELSE 0 END)::FLOAT/COUNT(*) AS missing_data
            FROM customers;
"""

In [45]:
result = pd.read_sql_query(query,engine)
result.head()

Unnamed: 0,missing_data
0,0.89066


In [47]:
#Analyzing Sales Data Using Aggregate Functions:
#1.Calculate the total number of unit sale the company has done.
query_1= """
            SELECT COUNT(*)
            FROM sales;
"""
#2.Calculate the total sales amount in ($) for each state
query_2= """
            SELECT c.state, SUM(s.sales_amount) AS sale_sum   
            FROM sales s
            INNER JOIN customers c ON s.customer_id=c.customer_id
            GROUP BY c.state
            ORDER BY 1;
"""
#3. Identify the top five best dealerships in terms of the most units sold (ignore internet sales)
query_3= """
            SELECT s.dealership_id, COUNT(*)
            FROM sales s
            WHERE s.channel='dealership'
            GROUP BY 1
            ORDER BY 1 DESC
            LIMIT 5;
"""
#4.Calculate the average sales amount for each channel, as seen in the sales table, and look at the average sales amount first by channel sales, then by product_id, then by both #together 
query_4= """
            SELECT s.channel, s.product_id,AVG(s.sales_amount) AS avg_sales_amount
            FROM sales s
            GROUP BY GROUPING SETS(
                (s.channel),
                (s.product_id),
                (s.channel,s.product_id)
            )
            ORDER BY 1,2;
"""


In [50]:
result_1 = pd.read_sql_query(query_1,engine)
result_2 = pd.read_sql_query(query_2,engine)
result_3 = pd.read_sql_query(query_3,engine)
result_4= pd.read_sql_query(query_4,engine)

In [51]:
result_1.head() 

Unnamed: 0,count
0,37711


In [52]:
result_2.head()

Unnamed: 0,state,sale_sum
0,AK,1124269.0
1,AL,4820334.0
2,AR,1487924.0
3,AZ,4109364.0
4,CA,27942720.0


In [53]:
result_3.head()

Unnamed: 0,dealership_id,count
0,20.0,316
1,19.0,834
2,18.0,1465
3,17.0,431
4,16.0,955


In [54]:
result_4.head()

Unnamed: 0,channel,product_id,avg_sales_amount
0,dealership,3.0,477.253738
1,dealership,4.0,109822.274882
2,dealership,5.0,664.330132
3,dealership,6.0,62563.376384
4,dealership,7.0,573.744147


## Window Functions

In [55]:
#Analyzing Customer Data Fill Rates over Time:
#To analyze a new feature that will encourage people to fill out the customer survery, the company would like a running total of how many users have filled in their street #address over time.
query="""
            SELECT customer_id, street_address, date_added::DATE,
                COUNT(CASE WHEN street_address IS NOT NULL THEN customer_id ELSE NULL END)
                    OVER(ORDER BY date_added::DATE) as total_customers_filled_street
            FROM customers
            ORDER BY date_added;
"""


In [56]:
result = pd.read_sql_query(query,engine)
result.head()

Unnamed: 0,customer_id,street_address,date_added,total_customers_filled_street
0,35683,1 Cordelia Crossing,2010-03-15,10
1,30046,13961 Steensland Trail,2010-03-15,10
2,17099,130 Marcy Crossing,2010-03-15,10
3,2625,0353 Iowa Road,2010-03-15,10
4,30555,294 Quincy Hill,2010-03-15,10


In [57]:
#Rank Order of Hiring:
#The company would like to promote salespeople at their regional dealerships to management and would like to consider tenure in their decision.
#The query will rank the order of users according to their hire date for each dealership.
query = """
            SELECT *, RANK() OVER (PARTITION BY dealership_id ORDER BY hire_date)
            FROM salespeople
            WHERE termination_date IS NULL;
"""
result = pd.read_sql_query(query,engine)
result.head()

Unnamed: 0,salesperson_id,dealership_id,title,first_name,last_name,suffix,username,gender,hire_date,termination_date,rank
0,65,1,,Dukie,Oxteby,,doxteby1s,Male,2015-01-24,,1
1,74,1,,Marcos,Spong,,mspong21,Male,2015-03-18,,2
2,60,1,,Eveleen,Mace,,emace1n,Female,2015-07-15,,3
3,87,1,,Quent,Wogden,,qwogden2e,Male,2015-08-17,,4
4,98,1,,Englebert,Loraine,,eloraine2p,Male,2016-01-23,,5


In [60]:
#Calculation the 7-day Rolling Average of Sales
query="""
            WITH daily_sales AS(
                SELECT sales_transaction_date::DATE,
                SUM(sales_amount) AS total_sales
                FROM sales
                GROUP BY 1),

                moving_average_calculation_7 AS(
                    SELECT sales_transaction_date, total_sales,
                    AVG(total_sales) OVER (ORDER BY sales_transaction_date ROWS BETWEEN 7 PRECEDING AND CURRENT ROW) AS sales_moving_average_7,
                    ROW_NUMBER() OVER (ORDER BY sales_transaction_date) AS row_number
                    FROM daily_sales
                    ORDER BY 1)
                    
            SELECT sales_transaction_date,
                CASE WHEN row_number>=7 THEN sales_moving_average_7 ELSE NULL END AS sales_moving_average_7
            FROM moving_average_calculation_7;
"""

result = pd.read_sql_query(query,engine)
result.head(10)

Unnamed: 0,sales_transaction_date,sales_moving_average_7
0,2010-03-10,
1,2010-03-12,
2,2010-03-15,
3,2010-03-17,
4,2010-03-18,
5,2010-03-19,
6,2010-03-21,394.275857
7,2010-03-23,394.990125
8,2010-03-24,399.99
9,2010-03-25,399.99


In [61]:
#Team Lunch Motivation:
#To help improve sales performance, the sales team has decided to buy lunch for all salespeople every time they beat the figure for the best daily total earnings achieved over #the last 30 days. This query will produce the total sales in $ for a given day and the target the salespeople have to beat for that day starting from Jan 1,2019:


query = """
            WITH daily_sales AS (
                SELECT sales_transaction_date::DATE, SUM(sales_amount) AS total_sales
                FROM sales
                GROUP BY 1
            ),
                sales_stats_30 AS (
                SELECT sales_transaction_date, total_sales, 
                MAX (total_sales) OVER (ORDER BY sales_transaction_date ROWS BETWEEN 30 PRECEDING AND 1 PRECEDING) AS max_sales_30
                FROM daily_sales
                ORDER BY 1
            )

            SELECT sales_transaction_date, ROUND(total_sales) AS daily_total, ROUND(max_sales_30) AS target
            FROM sales_stats_30
            WHERE sales_transaction_date >='2019-01-01';
"""

result = pd.read_sql_query(query,engine)
result.head()

Unnamed: 0,sales_transaction_date,daily_total,target
0,2019-01-01,87695.0,316465.0
1,2019-01-02,76150.0,316465.0
2,2019-01-03,161270.0,316465.0
3,2019-01-04,193210.0,316465.0
4,2019-01-05,49470.0,316465.0


In [62]:
#Analyzing Sales Using Window Frames and Window Functions:
#1.Calculate the total sales amount for all individual months in 2018 using the SUM
query="""
            SELECT sales_transaction_date::DATE,
                SUM(sales_amount) as total_sales_amount
            FROM sales
            WHERE sales_transaction_date>='2018-01-01' AND sales_transaction_date<='2019-01-01'
            GROUP BY 1
            ORDER BY 1;
"""
result = pd.read_sql_query(query,engine)
result.head()

Unnamed: 0,sales_transaction_date,total_sales_amount
0,2018-01-01,123689.951
1,2018-01-02,183859.79
2,2018-01-03,40029.854
3,2018-01-04,187119.878
4,2018-01-05,186459.904


In [63]:
#2.Calculate the rolling 30-day average for the daily number of sales deals
query="""
            WITH daily_sales AS (
                SELECT sales_transaction_date::DATE, SUM(sales_amount) AS total_sales
                FROM sales
                GROUP BY 1),
                
                sales_stats_30 AS (
                SELECT sales_transaction_date, total_sales, 
                AVG (total_sales) OVER (ORDER BY sales_transaction_date ROWS BETWEEN 30 PRECEDING AND CURRENT ROW) AS moving_avg,
                ROW_NUMBER() OVER (ORDER BY sales_transaction_date) AS row_number
                FROM daily_sales
                ORDER BY 1)

            SELECT sales_transaction_date,
                CASE WHEN row_number>=30 THEN moving_avg ELSE NULL END AS deals_moving_avg_30
            FROM sales_stats_30
            WHERE sales_transaction_date>='2018-01-01' AND sales_transaction_date <='2019-01-01';

"""
result = pd.read_sql_query(query,engine)
result.head()

Unnamed: 0,sales_transaction_date,deals_moving_avg_30
0,2018-01-01,154913.881194
1,2018-01-02,158566.135484
2,2018-01-03,159543.555097
3,2018-01-04,156790.008903
4,2018-01-05,158597.430161


In [64]:
#3.Calculate which decile each dealership would be in compared to other dealerships based on the total sales amount
query="""
            WITH total_dealership_sales AS(
                SELECT dealership_id, SUM(sales_amount) AS total_sales_amount
                FROM sales
                WHERE sales_transaction_date>='2008-01-01' AND sales_transaction_date<='2019-01-01'
                    AND channel='dealership'
                GROUP BY 1
            )

            SELECT *, NTILE(10) OVER (ORDER BY total_sales_amount) AS decile
            FROM total_dealership_sales;
"""
result = pd.read_sql_query(query,engine)
result.head()

Unnamed: 0,dealership_id,total_sales_amount,decile
0,8.0,1421433.062,1
1,13.0,2087358.245,1
2,20.0,2181877.637,2
3,17.0,2601206.827,2
4,9.0,2627492.122,3
