In [7]:
from bs4 import BeautifulSoup as bs
from tqdm import tqdm
import datetime as dt
import pandas as pd
import numpy as np
import warnings
import pymssql
import locale
import json
import glob
import os

warnings.filterwarnings('ignore')
pd.set_option('use_inf_as_na', True)
pd.options.display.max_columns = None
pd.set_option('float_format', '{:.2f}'.format)
locale.setlocale(locale.LC_ALL, 'ru_RU.UTF-8')

'ru_RU.UTF-8'

In [8]:
conn = pymssql.connect(server='spb-s-sql-dwh', database='DWH_DEV', charset='WINDOWS-1251')
cursor = conn.cursor()

sql_product = f'''
SELECT 
[name], [article], [data], [category_id], [id], [price], [old_price]
from [DWH_DEV].[ecom].[catalog_product]
'''

sql_sku = f'''
SELECT 
[data], [ext_id], [id]
from [DWH_DEV].[ecom].[catalog_sku]
'''

sql_orders_item = f'''
SELECT 
[data], [price]
from [DWH_DEV].[ecom].[orders_orderitem]
'''

sql_orders_order = f'''
SELECT
[id]
from [DWH_DEV].[ecom].[orders_order]
'''

sql_sku_article = f'''
SELECT
*
from [DWH_DEV].[ecom].[sku_article]
'''

catalog_product = pd.read_sql_query(sql_product, conn)
catalog_sku = pd.read_sql_query(sql_sku, conn)
orders_item = pd.read_sql_query(sql_orders_item, conn)
orders_order = pd.read_sql_query(sql_orders_order, conn)
sku_article = pd.read_sql_query(sql_sku_article, conn)


catalog_product = pd.DataFrame(catalog_product)
catalog_sku = pd.DataFrame(catalog_sku)
orders_item = pd.DataFrame(orders_item)
orders_order = pd.DataFrame(orders_order)
sku_article = pd.DataFrame(sku_article)

In [9]:
catalog_product = catalog_product.query('price > 0')

In [10]:
sku_article.columns = ['ext_id', 'article']
catalog_sku = catalog_sku.merge(sku_article, how='left', on='ext_id')

del sku_article

catalog_sku = catalog_sku.dropna(subset=['article'])

In [11]:
catalog_product = catalog_product.dropna(subset=['price'])

In [12]:
def parse_column(data):
    try:
        return json.loads(data)
    except Exception as e:
        print(e)
        return None

In [13]:
catalog_product['data'] = catalog_product['data'].apply(lambda x: parse_column(x)) 
catalog_sku['data'] = catalog_sku['data'].apply(lambda x: parse_column(x))
orders_item['data'] = orders_item['data'].apply(lambda x: parse_column(x))

Expecting value: line 1 column 19501 (char 19500)
Unterminated string starting at: line 1 column 19499 (char 19498)


In [14]:
catalog_sku['jewel'] = np.nan
catalog_sku['material'] = np.nan
catalog_sku['design'] = np.nan
catalog_sku['type1'] = np.nan
catalog_sku['type2'] = np.nan
catalog_sku['type3'] = np.nan
catalog_sku['product_line'] = np.nan
catalog_sku['product_group'] = np.nan
catalog_sku['margin_groups'] = np.nan


for i in tqdm(catalog_sku.index):
    try: catalog_sku['jewel'][i] = catalog_sku['data'][i]['jewel']['name']
    except: catalog_sku['jewel'][i] = np.nan 
    
    try: catalog_sku['material'][i] = catalog_sku['data'][i]['material']['name']
    except: catalog_sku['material'][i] = np.nan 
    
    try: catalog_sku['design'][i] = catalog_sku['data'][i]['design']['name']
    except: catalog_sku['design'][i] = np.nan 
    
    try: catalog_sku['type1'][i] = catalog_sku['data'][i]['type1']['name']
    except: catalog_sku['type1'][i] = np.nan 
    
    try: catalog_sku['type2'][i] = catalog_sku['data'][i]['type2']['name']
    except: catalog_sku['type2'][i] = np.nan 
    
    try: catalog_sku['type3'][i] = catalog_sku['data'][i]['type3']['name']
    except: catalog_sku['type3'][i] = np.nan 
    
    try: catalog_sku['product_line'][i] = catalog_sku['data'][i]['product_line']['name']
    except: catalog_sku['product_line'][i] = np.nan
    
    try: catalog_sku['product_group'][i] = catalog_sku['data'][i]['product_group']['name']
    except: catalog_sku['product_group'][i] = np.nan
    
    try: catalog_sku['margin_groups'][i] = catalog_sku['data'][i]['margin_groups']['name']
    except: catalog_sku['margin_groups'][i] = np.nan
    
catalog_sku.drop(columns=['data'], inplace = True)
catalog_product = catalog_product.merge(catalog_sku, how='left', on='article')

100%|██████████| 528773/528773 [00:49<00:00, 10774.73it/s]


In [15]:
del catalog_sku

In [33]:
catalog_product['type_for_TG'] = np.nan
catalog_product['type1_from_cp2'] = np.nan
catalog_product['fineness'] = np.nan
catalog_product['brand'] = np.nan 


for i in tqdm(catalog_product.index):
    try: catalog_product['type_for_TG'][i] = catalog_product['data'][i]['kind'][0]['name']
    except: catalog_product['type_for_TG'][i] = np.nan 

    try: catalog_product['type1_from_cp2'][i] = catalog_product['data'][i]['type'][0]['name']
    except: catalog_product['type1_from_cp2'][i] = np.nan 

    try: catalog_product['fineness'][i] = catalog_product['data'][i]['fineness']['description']
    except: catalog_product['fineness'][i] = np.nan 
    
    try: catalog_product['brand'][i] = catalog_product['data'][i]['brand']['name']
    except: catalog_product['brand'][i] = np.nan 

100%|██████████| 317400/317400 [00:20<00:00, 15243.51it/s]


In [34]:
catalog_product.drop(columns=['data'], inplace = True)

1

In [38]:
orders_item['count'] = np.nan 
orders_item['revenue'] = np.nan 
orders_item['sum_cost_price'] = np.nan

for i in tqdm(orders_item.index): 
    try: orders_item['count'][i] = orders_item['data'][i]['balance']['count']
    except: orders_item['count'][i] = np.nan 
    
    try: orders_item['count'][i] = orders_item['data'][i]['balance']['count']
    except: orders_item['count'][i] = np.nan 

{'size': None,
 'balance': {'sku': '320cd0c7-ca3d-429e-b919-570f54d7a0fb',
  'uin': '6432300990288570',
  'count': 1,
  'price': 817.0,
  'weight': 0.57,
  'balance': '2026746b-821f-4b80-b149-cca797083c76',
  'barcode': '2078604673691',
  'vat_group': 'ТОВ20',
  'cost_price': 91.77,
  'net_weight': 0.551,
  'sku_ext_id': 'ТОВ1190623'},
 'balances': [{'sku': '320cd0c7-ca3d-429e-b919-570f54d7a0fb',
   'uin': '6432300990288570',
   'count': 1,
   'price': 817.0,
   'weight': 0.57,
   'balance': '2026746b-821f-4b80-b149-cca797083c76',
   'barcode': '2078604673691',
   'vat_group': 'ТОВ20',
   'cost_price': 91.77,
   'net_weight': 0.551,
   'sku_ext_id': 'ТОВ1190623'}],
 'motivation': {'collection_duration': 15326},
 'promo_code': {'code': '35e6caa5-627b-4aff-a8b7-fc384d23d143',
  'amount': 250.0,
  'discount': 50},
 'bonus_discount': 25}

In [None]:
'''
catalog_product.name,
catalog_product.article,
catalog_sku.data -> 'jewel' ->> 'name' as jewel,
catalog_sku.data -> 'material' ->> 'name' as material,
catalog_sku.data -> 'design' ->> 'name' as design,
catalog_sku.data -> 'type1' ->> 'name' as type1,
catalog_sku.data -> 'type2' ->> 'name' as type2,
catalog_sku.data -> 'type3' ->> 'name' as type3,
catalog_product.data #>> '{fineness, description}' as fineness,
catalog_sku.data -> 'product_line' ->> 'name' as product_line,
catalog_sku.data -> 'product_group' ->> 'name' as product_group,
catalog_product.data #>> '{brand, name}' as brand,
catalog_sku.data -> 'margin_groups' ->> 'name' as margin_group,
catalog_productcategory."name" as online_group,
(catalog_product.data::json -> 'kind' -> 0 ->> 'name'):: text as type_for_TG,
(catalog_product.data::json -> 'type' -> 0 ->> 'name'):: text as type1_from_cp2,
sum((orders_orderitem.data #>> '{balance, count}')::numeric) as count,
sum((orders_orderitem.data #>> '{balance, count}')::numeric * orders_orderitem.price) as revenue,
sum((orders_orderitem.data #>> '{balance, cost_price}')::numeric * (orders_orderitem.data #>> '{balance, count}')::numeric) as sum_cost_price
from orders_orderitem
left join orders_order on orders_orderitem.order_id = orders_order.id
left join catalog_sku on catalog_sku.id = (orders_orderitem.data #>> '{balance, sku}'):: uuid
left join catalog_product on catalog_sku.product_id = catalog_product.id
left join catalog_productcategory on catalog_productcategory.id = catalog_product.category_id
where (orders_order.created > '2024-02-01' and orders_order.created < '2024-05-01')
and orders_orderitem.state = 'done'
and catalog_sku.deleted is null
and orders_order.is_test = 'false'
group by 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
'''