# Data Challenge - 03.08.23

In [18]:
# Set up
import sqlite3
import pandas as pd
import seaborn as sns
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import numpy as np
import plotly.express as px

In [2]:
# Connect to database
# Open read-only through an SQLite URI: the notebook only runs SELECTs, and a
# plain sqlite3.connect(path) would silently create an empty database when the
# path is wrong, surfacing later as a confusing "no such table" error.
conn = sqlite3.connect("file:data/challenge_db.db?mode=ro", uri=True)

In [76]:
#Function for first Task
def task_one(val, time):
    """Plot the total or average payment amount per week or per month.

    Payments whose status is 'ERR' are excluded. Periods are keyed by the
    week/month number only, so the same period of different years is merged
    (presumably the data covers a single year -- confirm).

    Parameters
    ----------
    val : str
        "total" (SUM of totalAmount) or "average" (AVG of totalAmount).
    time : str
        "week" (strftime '%W') or "month" (strftime '%m').

    Returns
    -------
    The Axes object returned by ``sns.barplot``.

    Raises
    ------
    ValueError
        If ``val`` or ``time`` is not one of the supported options.
    """
    import calendar  # local import keeps the function self-contained

    aggregates = {"total": "SUM", "average": "AVG"}
    period_formats = {"week": "%W", "month": "%m"}

    # Validate up front: these values are interpolated into the SQL text, and
    # an unknown value previously surfaced as an UnboundLocalError.
    if val not in aggregates:
        raise ValueError(f"val must be one of {sorted(aggregates)}, got {val!r}")
    if time not in period_formats:
        raise ValueError(f"time must be one of {sorted(period_formats)}, got {time!r}")

    func = aggregates[val]
    dt_format = period_formats[time]

    query = f"""SELECT {func}(totalAmount) AS '{val}',
    strftime('{dt_format}', payments.paymentCreatedAt) AS {time}
FROM payments
WHERE NOT payments.status = 'ERR'
GROUP BY {time}"""
    df = pd.read_sql(query, con=conn)

    b_plot = sns.barplot(y=df[val], x=df[time])
    b_plot.set_title(f"Payment {val} by {time}\n", fontdict={"size": 16})
    if time == "month":
        # Derive the tick labels from the months actually present instead of a
        # fixed April-July list; the bars follow the order of appearance.
        month_numbers = df[time].dropna().unique()
        labels = [calendar.month_name[int(number)] for number in month_numbers]
        b_plot.set_xticks(range(len(labels)), labels=labels)
    return b_plot

### First Task - Display mean/sum of payments by week/month

In [77]:
# Build one dropdown per argument (val, time) and call task_one with the current selection.
interact(task_one, val=["total", "average"], time = ["week", "month"])

interactive(children=(Dropdown(description='val', options=('total', 'average'), value='total'), Dropdown(descr…

<function __main__.task_one(val, time)>

Set up for second task:

In [20]:
# Reference list of German cities loaded from de.csv (per the original note: the
# top 123 cities). Later cells use its "city", "lat" and "lng" columns.
big_cities = pd.read_csv("data/de.csv")

In [21]:
# Number of non-error payments per city; a payment is tied to a city through
# its location. COUNT(totalAmount) only counts rows whose totalAmount is not NULL.
# The status literal uses single quotes: SQL reserves double quotes for
# identifiers, and SQLite only accepts "ERR" as a string through a legacy quirk.
transactions_per_city = pd.read_sql("""
SELECT COUNT(totalAmount) AS num_trans, city
FROM payments JOIN locations ON payments.locationId = locations.uuid
WHERE NOT payments.status = 'ERR'
GROUP BY city
""", con = conn)

In [22]:
# Restrict the per-city counts to cities present in the reference list, then
# attach that list's columns (including the lat/lng used for the map).
listed_in_reference = transactions_per_city["city"].isin(big_cities["city"])
trans_big_cities = pd.merge(
    transactions_per_city.loc[listed_in_reference],
    big_cities,
    on="city",
)

In [25]:
# Bubble map with one marker per city, sized by its number of transactions.
# Hovering shows the city name and the count; raw coordinates are hidden.
# The layout then fixes the figure height and zooms the Europe-scoped map
# onto a fixed centre point.
fig = px.scatter_geo(
    trans_big_cities,
    lat="lat",
    lon="lng",
    size="num_trans",
    hover_name="city",
    hover_data={"lat": False, "lng": False, "num_trans": True},
    scope="europe",
    title="Number of transactions by (big) city",
    center={"lat": 51.0057, "lon": 13.7274},
).update_layout(
    autosize=True,
    height=600,
    geo={
        "center": {"lat": 51.0057, "lon": 13.7274},
        "scope": "europe",
        "projection_scale": 6,
    },
)

### Second Task - Payments by city

In [26]:
# Render the transactions-per-city map held in `fig`.
fig.show()

Set up for the third task:

In [27]:
# Data frame for the third-task plots: for every city, the location whose
# largest single non-error payment ranks second within that city, together
# with that location's total and average payment amount.
# Notes:
# - RANK() leaves gaps after ties: if two locations share a city's top
#   payment, no row has rank 2 and the city is absent from the result; a tie
#   at rank 2 yields several rows for the same city.
# - The status literal uses single quotes: SQL reserves double quotes for
#   identifiers, and SQLite only accepts "ERR" as a string through a legacy quirk.
rank_two = pd.read_sql("""
WITH incl_rank AS (SELECT SUM(totalAmount) as total, AVG(totalAmount) as average, MAX(totalAmount), locationId, city,
RANK() OVER (
PARTITION BY city
ORDER BY city ASC, MAX(totalAmount) DESC
) AS order_rank

FROM payments JOIN locations ON payments.locationId=locations.uuid
WHERE NOT payments.status = 'ERR'
GROUP BY city, locationId
ORDER BY city ASC, MAX(totalAmount) DESC)

SELECT *
FROM incl_rank
WHERE order_rank = 2
""", con = conn)

In [59]:
def plot_spec_city(city):
    """Plot the average and total payment amount of a city's rank-two location.

    Reads the module-level ``rank_two`` frame (columns "city", "average" and
    "total") and draws a two-bar chart with value labels on the bars.

    Parameters
    ----------
    city : str
        City name as it appears in ``rank_two.city``.

    Raises
    ------
    ValueError
        If ``rank_two`` holds no row for ``city``.
    """
    city_rows = rank_two.loc[rank_two.city == city]
    if city_rows.empty:
        # Previously this surfaced as a bare IndexError from `.values[0]`.
        raise ValueError(f"No second-ranked location available for city {city!r}")

    # rank_two may hold several rows for one city; the first one is plotted.
    location = city_rows.iloc[0]

    ax = sns.barplot(
        y=pd.Series([location["average"], location["total"]], index=[0, 1]),
        x=pd.Series(["average", "total"]),
    )
    ax.set_title(
        f"Average + total of location w/ second largest single payment in {city}\n",
        fontdict={"size": 16, "va": "top"},
    )

    # Print the numeric value on top of each bar.
    ax.bar_label(ax.containers[0])

### Third task - average and total for the location with the second-largest single payment per city

In [60]:
# One dropdown entry per city. unique() keeps a city from being listed twice
# when rank_two holds several rows for it (for example a tie at rank 2).
interact(plot_spec_city, city = rank_two.city.unique())

interactive(children=(Dropdown(description='city', options=('Berlin', 'Hamburg', 'Ot'), value='Berlin'), Outpu…

<function __main__.plot_spec_city(city)>

Set up for the fourth task:

In [61]:
# Average and total non-error payment amount per city and hour of day
# (hour = two-digit '%H' part of paymentCreatedAt).
# String literals use single quotes: SQL reserves double quotes for
# identifiers, and SQLite only accepts "%H" / "ERR" as strings through a legacy quirk.
sales_by_hour = pd.read_sql("""
SELECT AVG(totalAmount) AS average, 
SUM(totalAmount) AS total, 
city, 
strftime('%H', paymentCreatedAt) AS hour
FROM payments JOIN locations ON payments.locationId=locations.uuid
WHERE NOT payments.status = 'ERR'
GROUP BY city, hour
""", con = conn)

In [66]:
def by_hour(city):
    """Draw a bar chart of the total payment amount per hour of day for one city.

    Uses the rows of the module-level ``sales_by_hour`` frame whose "city"
    equals ``city``: "hour" goes on the x axis and "total" on the y axis.
    """
    city_sales = sales_by_hour.loc[sales_by_hour.city == city]
    ax = sns.barplot(y=city_sales["total"], x=city_sales["hour"])
    ax.set_title(f"Sales per hour in {city}\n", fontdict={"size": 16})

### Fourth task - sales per city by time of day

In [69]:
# One dropdown entry per distinct city in sales_by_hour; by_hour is called with the selected city.
interact(by_hour, city= sales_by_hour.city.unique())

interactive(children=(Dropdown(description='city', options=('Berlin', 'Bielefeld', 'Burgwedel', 'Großbottwar',…

<function __main__.by_hour(city)>