In [1]:
import gdown
import pandas as pd

In [3]:
file_id = "1iFlv5PjnezdaCcTzWsjAX-Ck9kMCBMMK"
url = f"https://drive.google.com/uc?id={file_id}"
gdown.download(url, "orders.csv", quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1iFlv5PjnezdaCcTzWsjAX-Ck9kMCBMMK
From (redirected): https://drive.google.com/uc?id=1iFlv5PjnezdaCcTzWsjAX-Ck9kMCBMMK&confirm=t&uuid=74ee6812-0335-4b33-9bbd-a421d9c61cf5
To: /content/orders.csv
100%|██████████| 377M/377M [00:05<00:00, 75.3MB/s]


'orders.csv'

In [4]:
file_id = "1o25JTcxDBEaigjCrdq_bzKb9BtzCZdy8"
url = f"https://drive.google.com/uc?id={file_id}"
gdown.download(url, "orderline.csv", quiet=False)


Downloading...
From (original): https://drive.google.com/uc?id=1o25JTcxDBEaigjCrdq_bzKb9BtzCZdy8
From (redirected): https://drive.google.com/uc?id=1o25JTcxDBEaigjCrdq_bzKb9BtzCZdy8&confirm=t&uuid=56ee9631-eddd-4fb7-89a6-2faf551f77ed
To: /content/orderline.csv
100%|██████████| 642M/642M [00:06<00:00, 105MB/s]


'orderline.csv'

In [5]:
file_id = "1Aa5oSSE-3Fn6RQpupqcg2sAf3l2VlccA"
url = f"https://drive.google.com/uc?id={file_id}"
gdown.download(url, "person.csv", quiet=False)


Downloading...
From: https://drive.google.com/uc?id=1Aa5oSSE-3Fn6RQpupqcg2sAf3l2VlccA
To: /content/person.csv
100%|██████████| 83.3M/83.3M [00:00<00:00, 98.9MB/s]


'person.csv'

In [6]:
file_id = "1dL388NuXzV8mpTJ44HEmp2LgxKR1d4z8"
url = f"https://drive.google.com/uc?id={file_id}"
gdown.download(url, "product.csv", quiet=False)


Downloading...
From: https://drive.google.com/uc?id=1dL388NuXzV8mpTJ44HEmp2LgxKR1d4z8
To: /content/product.csv
100%|██████████| 1.75M/1.75M [00:00<00:00, 148MB/s]


'product.csv'

In [7]:
!ls

orderline.csv  orders.csv  person.csv  product.csv  sample_data


In [8]:
df_person = pd.read_csv("person.csv", sep=";")
df_orders = pd.read_csv("orders.csv", sep=";")
df_orderline = pd.read_csv("orderline.csv", sep=";")
df_product = pd.read_csv("product.csv", sep=";")

In [9]:
print("Person:", df_person.shape)
print("Orders:", df_orders.shape)
print("Orderline:", df_orderline.shape)
print("Product:", df_product.shape)

Person: (600000, 12)
Orders: (5000000, 8)
Orderline: (13000000, 8)
Product: (8000, 18)


In [10]:
df_product.columns

Index(['product_id', 'sku', 'name', 'description', 'category', 'subcategory',
       'brand', 'price', 'cost', 'stock_quantity', 'weight_kg', 'length_cm',
       'width_cm', 'height_cm', 'status', 'created_date', 'rating_average',
       'review_count'],
      dtype='object')

In [12]:
df_person.head(2)
df_orders.head(2)
df_orderline.head(2)
df_product.head(2)

Unnamed: 0,product_id,sku,name,description,category,subcategory,brand,price,cost,stock_quantity,weight_kg,length_cm,width_cm,height_cm,status,created_date,rating_average,review_count
0,1,PRD-KMQL4V-KEE,Cutting Monitor Deluxe,Trendy design reflecting current market trends,Mobile Devices,RidOns,RoastRoyal,181.95,79.95,440,43.46,76.0,120.5,65.6,active,2022-08-07,1.1,4242
1,2,PRD-VD0TAJ-NBG,Elite Phablet v3,Elegant aesthetics with minimalist approach,Reference,Utensils,CableConnect,564.95,857.95,177,14.46,112.9,137.6,67.9,active,2024-11-01,4.7,3981


In [13]:
df_person['person_id'].is_unique
df_orders['order_id'].is_unique
df_product['product_id'].is_unique

True

In [14]:
df_orders['person_id'].isin(df_person['person_id']).mean()
df_orderline['order_id'].isin(df_orders['order_id']).mean()
df_orderline['product_id'].isin(df_product['product_id']).mean()

np.float64(1.0)

In [15]:
order_total_check = (
    df_orderline
    .groupby('order_id')['subtotal']
    .sum()
    .reset_index()
    .merge(df_orders[['order_id', 'total_amount']], on='order_id', how='left')
)

order_total_check['diff'] = (
    order_total_check['subtotal'] - order_total_check['total_amount']
).abs()

order_total_check['diff'].describe()

Unnamed: 0,diff
count,5000000.0
mean,3.546614e-13
std,1.046772e-12
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.455192e-11


In [17]:
df_orders['status'].value_counts()


Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
Cancelled,715293
Returned,714728
Refunded,714600
Delivered,714191
Pending,714017
Shipped,713858
Processing,713313


In [18]:
df_orderline['status'].value_counts()

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
fulfilled,2602356
cancelled,2600481
returned,2599406
pending,2599167
shipped,2598590


In [19]:
delivered_status = ['Delivered']
loss_status = ['Returned', 'Refunded']
cancel_status = ['Cancelled']


In [20]:
GOV = df_orders['total_amount'].sum()
GOV


np.float64(70376452551.13007)

In [21]:
delivered_revenue = df_orders.loc[
    df_orders['status'].isin(delivered_status),
    'total_amount'
].sum()


In [22]:
returned_refunded_revenue = df_orders.loc[
    df_orders['status'].isin(loss_status),
    'total_amount'
].sum()


In [1]:
NRR = delivered_revenue - returned_refunded_revenue
NRR


NameError: name 'delivered_revenue' is not defined