In [8]:
import json
import requests
import numpy as np
import pandas as pd
from data_cleaning import DataCleaning
from data_extraction import DataExtractor
from database_utils import DatabaseConnector

In [2]:
# Load the 'legacy_users' table through the project's DataExtractor. The raw
# frame is kept under its own name because a later cell inspects it directly.
# NOTE(review): `read_rds_table` is invoked on the class, not an instance, so it
# is presumably a static/class method — confirm in data_extraction.
raw_user_df = DataExtractor.read_rds_table('legacy_users')
# Run the project's user-cleaning step. Whether it copies or mutates
# `raw_user_df` is not visible from this notebook — verify in data_cleaning.
clean_user_df = DataCleaning.clean_user_data(raw_user_df)


In [5]:
# Preview the first ten user UUIDs from the raw user table.
# Select the column by its plain label: a trailing comma inside the brackets
# would turn the key into the tuple ('user_uuid',), which is not a column label
# on a flat (non-MultiIndex) column index.
raw_user_df['user_uuid'].head(10)

index
0    93caf182-e4e9-4c6e-bebb-60a1a9dcf9b8
1    8fe96c3a-d62d-4eb5-b313-cf12d9126a49
2    fc461df4-b919-48b2-909e-55c95a03fe6b
3    6104719f-ef14-4b09-bf04-fb0c4620acb0
4    9523a6d3-b2dd-4670-a51a-36aebc89f579
5    53d21f46-1fa4-452f-a023-26aee2aae4d6
6    e2066a2c-8cd3-46ad-b2ea-e2445d5d9335
7    bd690c60-c952-40c0-82df-0f8b6797b562
8    02de2416-4baf-42ad-bae6-d716eca0fc3f
9    caffe463-4918-4f45-a37d-856dc0f15884
Name: user_uuid, dtype: object

In [9]:
# Load the 'orders_table' table through the project's DataExtractor. The raw
# frame is kept under its own name because a later cell filters it directly.
raw_orders_df = DataExtractor.read_rds_table('orders_table')
# Run the project's orders-cleaning step. Whether it copies or mutates
# `raw_orders_df` is not visible from this notebook — verify in data_cleaning.
clean_orders_df = DataCleaning.clean_orders_data(raw_orders_df)

In [10]:
# Show the two UUID columns of the cleaned orders side by side (first 20 rows),
# e.g. to compare `user_uuid` against the values in the user table.
clean_orders_df.loc[:, ['date_uuid', 'user_uuid']].head(20)

Unnamed: 0_level_0,date_uuid,user_uuid
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9476f17e-5d6a-4117-874d-9cdb38ca1fa6,93caf182-e4e9-4c6e-bebb-60a1a9dcf9b8
1,0423a395-a04d-4e4a-bd0f-d237cbd5a295,8fe96c3a-d62d-4eb5-b313-cf12d9126a49
2,65187294-bb16-4519-adc0-787bbe423970,fc461df4-b919-48b2-909e-55c95a03fe6b
3,579e21f7-13cb-436b-83ad-33687a4eb337,6104719f-ef14-4b09-bf04-fb0c4620acb0
4,00ab86c3-2039-4674-b9c1-adbcbbf525bd,9523a6d3-b2dd-4670-a51a-36aebc89f579
5,c3a1df8f-6918-4795-9f8a-6869b2cda9cf,53d21f46-1fa4-452f-a023-26aee2aae4d6
6,ac94c06e-b5ba-4794-bbf3-3ea9dc0e6b6c,e2066a2c-8cd3-46ad-b2ea-e2445d5d9335
7,27a59013-6ecf-4f3d-86f8-8e972a8f346c,bd690c60-c952-40c0-82df-0f8b6797b562
8,38ebd7b6-b1d4-462e-bfd7-265e3674ef3f,02de2416-4baf-42ad-bae6-d716eca0fc3f
9,e764e21c-f9ce-4e0a-8c01-4218e92e424c,caffe463-4918-4f45-a37d-856dc0f15884


In [2]:
# Fetch the products CSV from the given S3 URI through the project's
# DataExtractor. The raw frame is kept because a later cell inspects its
# 'weight' column directly.
raw_products_df = DataExtractor.extract_from_s3('s3://data-handling-public/products.csv')
# Two-stage cleaning: the general product clean runs first, and its result is
# then fed to the weight conversion. The order is taken from this cell; what
# each step does internally is defined in data_cleaning — confirm there.
interim_products_df = DataCleaning.clean_products_data(raw_products_df)
clean_products_df = DataCleaning.convert_product_weights(interim_products_df)

In [6]:
# Sort the raw 'weight' column to eyeball which formats occur. The recorded
# output is dtype object and mixes values such as '0.01kg' with entries like
# '9GO9NZ5JTL', so the ordering looks like string ordering rather than numeric.
raw_products_df.loc[:, 'weight'].sort_values()

160         0.01kg
186         0.03kg
184         0.06kg
211        0.072kg
297        0.087kg
           ...    
751     9GO9NZ5JTL
575            9kg
853            9kg
1400    MX180RYSHX
1133    Z8ZTDGUZVU
Name: weight, Length: 1849, dtype: object

In [7]:
# Ask the stores API how many stores there are; the result is handed to the
# per-store retrieval call below.
number_of_stores = DataExtractor.list_number_of_stores(
    'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/number_stores'
)

# The '{store_number}' placeholder is passed through literally (this is a plain
# string, not an f-string); presumably `retrieve_stores_data` fills it in for
# each store — confirm in data_extraction.
raw_stores_df = DataExtractor.retrieve_stores_data(
    number_of_stores,
    'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/store_details/{store_number}',
)

# Run the project's store-cleaning step on the retrieved frame.
clean_stores_df = DataCleaning.clean_store_data(raw_stores_df)

In [21]:
# Store codes of every raw store row whose country code is 'DE', selected with
# a single row-mask / column-label lookup.
de_store_codes = raw_stores_df.loc[raw_stores_df['country_code'] == 'DE', 'store_code']

In [22]:
# Raw orders whose store code is one of the 'DE' store codes collected above.
# NOTE(review): the rendered output of this cell includes the `card_number`
# column in full — consider clearing or redacting it before sharing.
raw_orders_df.loc[raw_orders_df['store_code'].isin(de_store_codes)]

Unnamed: 0_level_0,date_uuid,user_uuid,card_number,store_code,product_code,product_quantity
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7,27a59013-6ecf-4f3d-86f8-8e972a8f346c,bd690c60-c952-40c0-82df-0f8b6797b562,2321541881278150,OB-FDC2CF69,q8-5387483Z,3
16,f84d4161-6aad-4730-9c90-391ca8db44af,7c8dc654-3994-40b1-a4a2-e55dc7081fe0,4537509987455280000,HA-B6953307,P1-5362711n,1
17,3db24e27-18e7-457e-9bb7-2a1ae81c8e13,3ca7e4af-c9af-4ae8-bf68-cf9e9ddf745c,6011881097126120,CH-7C155EA7,v7-6917295v,3
23,07d20be3-eca5-49a0-8e04-ab64ba757fad,fb47ef8a-7caf-4f43-b227-8fd0354caf85,4305628477334070000,LA-347073D1,w9-8766704u,1
24,0b7145a1-cc1e-478c-bc49-44c625f4eb06,04c29a16-f3b3-4ca8-8971-e67d7863985d,6011112632675670,HE-A49D28DF,f1-4722530k,5
...,...,...,...,...,...,...
66257,ffcaae77-a983-41db-9ea1-65abb4036bc1,45447a05-97cf-42ac-b839-5f1416945e41,4971858637664481,HA-FC4E6AFF,c7-6364718G,1
67926,c9c2a9c7-3ef3-4c8b-8eab-01c5e531039e,4cac12fb-b8f2-46c5-874a-1df4f847c6a2,4971858637664481,VE-1050754E,k9-3755441k,5
90560,0ee2328b-6d46-4c6a-9302-d56b9b0162cc,3aa38cc6-5bd9-4081-9809-d53dc745224d,4252720361802860591,SI-ECD52CD9,T2-5997834g,9
95080,d950d99a-f49f-4a5b-b81d-e4d6570c4fac,b33e2201-48d6-410e-a6c5-dedc039a8c59,575421945446,SC-8724E112,l8-5672498Q,2
