# Meiro Data Analyst Engineer Practice Task

## Star Schema Data Model creation

In [1]:
# Importing the required libraries
import pandas as pd
import plotly.express as px

In [2]:
# Load the gzip-compressed, newline-delimited JSON export into a DataFrame and
# give the order-level columns explicit "order_" names in the same step
orders_path = "./data/data.ndjson.gz"
df = pd.read_json(path_or_buf=orders_path, compression="gzip", lines=True).rename(
    columns={"id": "order_id", "created": "order_created"}
)

# The creation time is parsed as epoch seconds; only the calendar date is kept
created_at = pd.to_datetime(df["order_created"], unit="s")
df["order_created"] = created_at.dt.date

print(df.shape)
df.head()

(100000, 4)


Unnamed: 0,order_id,order_created,products,user
0,0,2018-11-16,"[{'id': 0, 'name': 'Product A', 'price': 160},...","{'id': 3, 'name': 'User D', 'city': 'Sydney'}"
1,1,2018-12-06,"[{'id': 3, 'name': 'Product D', 'price': 130},...","{'id': 0, 'name': 'User A', 'city': 'Prague'}"
2,2,2018-10-21,"[{'id': 6, 'name': 'Product G', 'price': 100},...","{'id': 0, 'name': 'User A', 'city': 'Prague'}"
3,3,2018-11-12,"[{'id': 6, 'name': 'Product G', 'price': 100},...","{'id': 2, 'name': 'User C', 'city': 'Singapore'}"
4,4,2018-11-30,"[{'id': 10, 'name': 'Product K', 'price': 60},...","{'id': 5, 'name': 'User F', 'city': 'Jakarta'}"


In [3]:
# Flatten the nested "user" dicts into their own columns, each prefixed with "user_"
user_df = pd.json_normalize(df["user"]).add_prefix("user_")

# TEST: the flattened frame must contain exactly one row per row of df
assert len(user_df) == len(df), "rows in user_df do not match with df"

# Attach the flat user columns (index-aligned join), then discard the nested original
df = df.join(user_df).drop(columns=["user"])

print(df.shape)
df.head()

(100000, 6)


Unnamed: 0,order_id,order_created,products,user_id,user_name,user_city
0,0,2018-11-16,"[{'id': 0, 'name': 'Product A', 'price': 160},...",3,User D,Sydney
1,1,2018-12-06,"[{'id': 3, 'name': 'Product D', 'price': 130},...",0,User A,Prague
2,2,2018-10-21,"[{'id': 6, 'name': 'Product G', 'price': 100},...",0,User A,Prague
3,3,2018-11-12,"[{'id': 6, 'name': 'Product G', 'price': 100},...",2,User C,Singapore
4,4,2018-11-30,"[{'id': 10, 'name': 'Product K', 'price': 60},...",5,User F,Jakarta


In [4]:
# Explode the per-order product lists so every purchased product gets its own row;
# each new row keeps the index label of the order it came from
orders_df = df["products"].explode().to_frame()

# Remember those (repeated) order index labels; a later cell uses them to
# re-attach the product rows to their orders
orders_index = orders_df.index.tolist()

print(orders_df.shape)
orders_df.head()

(549145, 1)


Unnamed: 0,products
0,"{'id': 0, 'name': 'Product A', 'price': 160}"
0,"{'id': 8, 'name': 'Product I', 'price': 80}"
1,"{'id': 3, 'name': 'Product D', 'price': 130}"
1,"{'id': 2, 'name': 'Product C', 'price': 140}"
1,"{'id': 13, 'name': 'Product N', 'price': 30}"


In [5]:
# Flatten each product dict into columns, each prefixed with "product_"
flat_products = pd.json_normalize(orders_df["products"]).add_prefix("product_")

# Re-label the rows with the order index captured before flattening, so every
# product row can be matched to its order in the original df
flat_products["index"] = orders_index
orders_df = flat_products.set_index("index")

print(orders_df.shape)
orders_df.head()

(549145, 3)


Unnamed: 0_level_0,product_id,product_name,product_price
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,Product A,160
0,8,Product I,80
1,3,Product D,130
1,2,Product C,140
1,13,Product N,30


In [6]:
# Build the master table: one row per purchased product, carrying the order and user
# columns of the order it belongs to. The nested "products" column is left out of the
# join because its content is already flattened into orders_df.
order_level_cols = df.drop(columns="products")
orders_df = orders_df.join(order_level_cols).reset_index(drop=True)

print(orders_df.shape)
orders_df.head()

(549145, 8)


Unnamed: 0,product_id,product_name,product_price,order_id,order_created,user_id,user_name,user_city
0,0,Product A,160,0,2018-11-16,3,User D,Sydney
1,8,Product I,80,0,2018-11-16,3,User D,Sydney
2,3,Product D,130,1,2018-12-06,0,User A,Prague
3,2,Product C,140,1,2018-12-06,0,User A,Prague
4,13,Product N,30,1,2018-12-06,0,User A,Prague


---
**Star schema data model**
- `dim_user`: A dimension table that stores information about users. The `user_id` is the primary key (PK), ensuring that each user is unique. Other attributes will include `user_name` and `user_city`.
- `dim_product`: A dimension table containing details about products. The `product_id` is the primary key (PK). Other attributes will include `product_name` and `product_price`. 
- `fact_order`: The fact table in which transactions are recorded. The `order_id` is the primary key (PK), and there are two foreign keys: `user_id` (FK1), which links to the `dim_user` dimension table, and `product_ids` (FK2), which links to the `dim_product` dimension table. This table includes the attribute `order_created`, which records the date of the order, and the measures `order_total` (the total price per order) and `basket_size` (the number of items per order).
  - `dim_user`<>`fact_order`: One-to-many relationship, where one user can have multiple orders.
  - `dim_product`<>`fact_order`: Many-to-many relationship, with a nested field `product_ids` holding a list of all individual `product_id` values associated with each order. A single product can appear in many orders and a single order can contain many products.

![Star schema data model diagram](./star_schema_datamodel.png)
---

In [7]:
# Creating first Dimensional table from master table with User information:
# one row per distinct (user_id, user_name, user_city) combination, ordered by those columns.
# drop_duplicates is used rather than a groupby/count/drop round-trip because groupby
# discards rows whose key columns contain NaN by default, which would silently remove
# any user with a missing name or city from the dimension.
user_cols = ["user_id", "user_name", "user_city"]
dim_user = (
    orders_df[user_cols]
    .drop_duplicates()
    .sort_values(user_cols)
    .reset_index(drop=True)
)

# TEST: Checking if the users are unique (also fails if the same user_id appears
# with two different names or cities)
assert dim_user["user_id"].nunique() == len(dim_user), "user id is not primary key"

print(dim_user.shape)
dim_user.head()

(8, 3)


Unnamed: 0,user_id,user_name,user_city
0,0,User A,Prague
1,1,User B,Brno
2,2,User C,Singapore
3,3,User D,Sydney
4,4,User E,Melbourne


In [8]:
# Creating second Dimensional table from master table with Product information:
# one row per distinct (product_id, product_name, product_price) combination, ordered by
# those columns. drop_duplicates is used rather than a groupby/count/drop round-trip
# because groupby discards rows whose key columns contain NaN by default, which would
# silently remove any product with a missing name or price from the dimension.
product_cols = ["product_id", "product_name", "product_price"]
dim_product = (
    orders_df[product_cols]
    .drop_duplicates()
    .sort_values(product_cols)
    .reset_index(drop=True)
)

# TEST: Checking if the products are unique (also fails if the same product_id appears
# with two different names or prices)
assert dim_product["product_id"].nunique() == len(
    dim_product
), "prod id is not primary key"

print(dim_product.shape)
dim_product.head()

(16, 3)


Unnamed: 0,product_id,product_name,product_price
0,0,Product A,160
1,1,Product B,150
2,2,Product C,140
3,3,Product D,130
4,4,Product E,120


In [9]:
# Build the fact table: one row per order, derived from the product-level master table.
# Named aggregation produces the final column names directly:
#   product_ids - list of the product ids in the order
#   order_total - sum of the product prices in the order
#   basket_size - number of product rows in the order
fact_order = (
    orders_df.groupby(["order_id", "order_created", "user_id"])
    .agg(
        product_ids=("product_id", list),
        order_total=("product_price", "sum"),
        basket_size=("product_id", "size"),
    )
    .reset_index()
)

# TEST: every order must appear exactly once
assert fact_order["order_id"].nunique() == len(
    fact_order
), "order id is not primary key"

print(fact_order.shape)
fact_order.head()

(100000, 6)


Unnamed: 0,order_id,order_created,user_id,product_ids,order_total,basket_size
0,0,2018-11-16,3,"[0, 8]",240,2
1,1,2018-12-06,0,"[3, 2, 13]",300,3
2,2,2018-10-21,0,"[6, 7]",190,2
3,3,2018-11-12,2,"[6, 13, 7, 14, 8, 0, 5, 13, 10]",680,9
4,4,2018-11-30,5,"[10, 3, 2, 12]",370,4


## Visualizations

### 1. Total Sales $ over Time 

In [10]:
# Daily sales: sum of the order totals for each order date
date_grouped_df = fact_order.groupby("order_created", as_index=False)["order_total"].sum()

# The first and last dates are left out of the plot because their values are outliers
sales_without_edge_dates = date_grouped_df.iloc[1:-1]

fig1 = px.line(
    sales_without_edge_dates,
    x="order_created",
    y="order_total",
    title="Total Sales Over Time",
)
fig1.show(width=500)

### 2. Total Basket size over Time

In [11]:
# Daily basket volume: sum of the basket sizes of all orders for each order date
date_grouped_basket_df = fact_order.groupby("order_created", as_index=False)[
    "basket_size"
].sum()

# The first and last dates are left out of the plot because their values are outliers
basket_without_edge_dates = date_grouped_basket_df.iloc[1:-1]

fig2 = px.line(
    basket_without_edge_dates,
    x="order_created",
    y="basket_size",
    title="Basket size Over Time",
)
fig2.show(width=500)

### 3. Total Sales $ by User

In [12]:
# Total sales per user, keeping only the five users with the highest totals
user_sales = fact_order.groupby("user_id")["order_total"].sum().reset_index()
user_grouped_df = user_sales.sort_values("order_total", ascending=False).head(5)

# Horizontal bar chart of those five users
fig3 = px.bar(
    user_grouped_df,
    x="order_total",
    y="user_id",
    orientation="h",
    title="Total Sales By User",
)
# Treat user_id as a category label rather than a number, and order the bars by value
fig3.update_yaxes(type="category", categoryorder="max ascending")
fig3.show(width=500)

### 4. Total Sales $ by City 

In [13]:
# Look up each order's city through the user dimension, then total the sales per city
orders_with_city = fact_order.merge(dim_user, on="user_id")
user_city_sales = orders_with_city.groupby("user_city")["order_total"].sum().reset_index()

# Pie chart: one slice per city, sized by that city's total sales
fig4 = px.pie(
    user_city_sales,
    names="user_city",
    values="order_total",
    title="Total Sales By City",
)
fig4.show(width=500)

## Preparing for Streamlit deployment

In [14]:
# Persist the two dimension tables and the fact table as pickle files
# for the Streamlit app
for table, pickle_path in (
    (dim_user, "./data/dim_user.pkl"),
    (dim_product, "./data/dim_product.pkl"),
    (fact_order, "./data/fact_order.pkl"),
):
    table.to_pickle(pickle_path)