In [4]:
from bs4 import BeautifulSoup as bs
import datetime as dt
import pandas as pd
import numpy as np
import warnings
import pymssql
import locale
import json
import glob
import os

# Notebook-wide settings: silence warnings, configure pandas display, set the locale.
warnings.filterwarnings(action='ignore')
pd.set_option('mode.use_inf_as_na', True)                # treat +/-inf as missing values
pd.set_option('display.max_columns', None)               # never hide columns when a frame is displayed
pd.set_option('display.float_format', '{:.2f}'.format)   # show floats with two decimals
# Kept as the last expression so the cell still echoes the active locale name.
locale.setlocale(locale.LC_ALL, 'ru_RU.UTF-8')

'ru_RU.UTF-8'

In [5]:
# Load the e-commerce tables from the DWH_DEV database into DataFrames.
#
# NOTE(review): `conn` is left open and `cursor` is not used in the visible
# cells; both names are kept in case later cells rely on them. Close the
# connection once nothing else needs it.
conn = pymssql.connect(server='spb-s-sql-dwh', database='DWH_DEV', charset='WINDOWS-1251')
cursor = conn.cursor()

# Plain strings (no f-prefix): the queries contain no placeholders.
sql_product = '''
SELECT 
[name], [article], [data]
from [DWH_DEV].[ecom].[catalog_product]
'''

sql_sku = '''
SELECT 
[data]
from [DWH_DEV].[ecom].[catalog_sku]
'''

sql_orders_item = '''
SELECT 
[data]
from [DWH_DEV].[ecom].[orders_orderitem]
'''

sql_orders_order = '''
SELECT
[id]
from [DWH_DEV].[ecom].[orders_order]
'''

# read_sql_query already returns a DataFrame, so no re-wrapping is needed.
catalog_product = pd.read_sql_query(sql_product, conn)
catalog_sku = pd.read_sql_query(sql_sku, conn)
orders_item = pd.read_sql_query(sql_orders_item, conn)
orders_order = pd.read_sql_query(sql_orders_order, conn)

In [6]:
def parse_column(data):
    """Decode one JSON cell value into Python objects.

    Parameters
    ----------
    data : str, bytes, bytearray, dict, list or None
        Raw JSON text, an already-decoded JSON container, or a missing value
        (``None`` / float NaN).

    Returns
    -------
    object or None
        The decoded JSON value. ``None`` is returned for missing input and
        for input that cannot be decoded; in the latter case the error
        message is printed so that bad rows stay visible in the cell output.
    """
    # Missing values carry nothing to decode; `data != data` is the NaN check.
    if data is None or (isinstance(data, float) and data != data):
        return None

    # Already decoded: hand it back untouched so that re-running the parsing
    # cell does not replace every value with None.
    if isinstance(data, (dict, list)):
        return data

    try:
        return json.loads(data)
    except (TypeError, ValueError) as e:
        # json.JSONDecodeError is a ValueError; TypeError covers non-text input.
        print(e)
        return None

In [7]:
# Decode the JSON text held in each frame's `data` column, overwriting the column.
# NOTE(review): the recorded output shows decode failures near character 19500,
# which may mean some payloads arrive truncated - confirm against the source table.
for _frame in (catalog_product, catalog_sku, orders_item):
    _frame['data'] = _frame['data'].apply(parse_column)

Expecting value: line 1 column 19501 (char 19500)
Unterminated string starting at: line 1 column 19499 (char 19498)


In [None]:
# Reference query kept as text so that the cell is valid Python and the
# notebook survives Restart & Run All. Nothing in the visible cells executes it.
# The JSON operators (->, ->>, #>>) and :: casts look like PostgreSQL syntax -
# TODO confirm the target database. It must stay a plain (non-f) string because
# the JSON paths contain literal braces.
REFERENCE_QUERY_PG = '''
select
catalog_product.name,
catalog_product.article,
catalog_sku.data -> 'jewel' ->> 'name' as jewel,
catalog_sku.data -> 'material' ->> 'name' as material,
catalog_sku.data -> 'design' ->> 'name' as design,
catalog_sku.data -> 'type1' ->> 'name' as type1,
catalog_sku.data -> 'type2' ->> 'name' as type2,
catalog_sku.data -> 'type3' ->> 'name' as type3,
catalog_product.data #>> '{fineness, description}' as fineness,
catalog_sku.data -> 'product_line' ->> 'name' as product_line,
catalog_sku.data -> 'product_group' ->> 'name' as product_group,
catalog_product.data #>> '{brand, name}' as brand,
catalog_sku.data -> 'margin_groups' ->> 'name' as margin_group,
catalog_productcategory."name" as online_group,
(catalog_product.data::json -> 'kind' -> 0 ->> 'name'):: text as type_for_TG,
(catalog_product.data::json -> 'type' -> 0 ->> 'name'):: text as type1_from_cp2,
sum((orders_orderitem.data #>> '{balance, count}')::numeric) as count,
sum((orders_orderitem.data #>> '{balance, count}')::numeric * orders_orderitem.price) as revenue,
sum((orders_orderitem.data #>> '{balance, cost_price}')::numeric * (orders_orderitem.data #>> '{balance, count}')::numeric) as sum_cost_price
from orders_orderitem
left join orders_order on orders_orderitem.order_id = orders_order.id
left join catalog_sku on catalog_sku.id = (orders_orderitem.data #>> '{balance, sku}'):: uuid
left join catalog_product on catalog_sku.product_id = catalog_product.id
left join catalog_productcategory on catalog_productcategory.id = catalog_product.category_id
where (orders_order.created > '2024-02-01' and orders_order.created < '2024-05-01')
and orders_orderitem.state = 'done'
and catalog_sku.deleted is null
and orders_order.is_test = 'false'
group by 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
'''


In [16]:
# Nested objects of the SKU payload whose 'name' value becomes a flat column
# with the same name as the object.
SKU_NAME_FIELDS = [
    'jewel', 'material', 'design', 'type1', 'type2', 'type3',
    'product_line', 'product_group', 'margin_groups',
]


def _nested_name(payload, field):
    """Return ``payload[field]['name']``, or NaN when it cannot be reached.

    NaN is returned when `payload` is not a dict (e.g. a row whose JSON failed
    to parse), when `field` is absent or not a dict itself, or when it has no
    'name' value.
    """
    inner = payload.get(field) if isinstance(payload, dict) else None
    name = inner.get('name') if isinstance(inner, dict) else None
    return np.nan if name is None else name


# Build every column in a single pass and assign it whole. Element-wise chained
# assignment (frame[col][i] = value) is not guaranteed to write back to the
# frame and is very slow on a frame of this size.
_sku_payloads = catalog_sku['data'].tolist()
for _field in SKU_NAME_FIELDS:
    catalog_sku[_field] = [_nested_name(_payload, _field) for _payload in _sku_payloads]

In [18]:
# Remove the `data` column from the frame in place.
catalog_sku.drop(labels='data', axis='columns', inplace=True)

In [19]:
# Display the frame; the recorded output shows only its first and last rows.
catalog_sku

Unnamed: 0,jewel,material,design,type1,type2,type3,product_line,product_group,margin_groups
0,ТОПАЗ,Золото,ДК ЛЕПЕСТКИ,КОЛЬЦО,С ПОЛУДРАГОЦЕННЫМИ КАМНЯМИ,КРАСНЫЙ,ПДК,,ДК_БАЗОВАЯ
1,ФИАНИТ БЕСЦВЕТ,Золото,ИФ МНОГОКАМЕНКА,ПОДВЕС ДЕКОРАТИВНЫЙ,С ФИАНИТАМИ,КРАСНЫЙ,ИФ,ИФ ПОДВЕС ДЕКОР,ШОК_ЦЕНА
2,ФИАНИТ БЕСЦВЕТ,Золото,ИФ ПРОЧЕЕ,КОЛЬЦО,С ФИАНИТАМИ,КРАСНЫЙ,ИФ,ИФ КОЛЬЦА,ИФ.БК_ДОР
3,ФИАНИТ БЕСЦВЕТ,Золото,ИФ БЕЛОЕ,СЕРЬГИ,С ФИАНИТАМИ,БЕЛЫЙ,ИФ,ИФ СЕРЬГИ,ИФ.БК_ДОР
4,ТОПАЗ,Золото,ДК С ОДНИМ КАМНЕМ,КОЛЬЦО,С ПОЛУДРАГОЦЕННЫМИ КАМНЯМИ,КРАСНЫЙ,ПДК,ПДК КОЛЬЦА,ДК_БАЗОВАЯ
...,...,...,...,...,...,...,...,...,...
1139415,,Золото,,СЕРЬГИ,С ФИАНИТАМИ,КРАСНЫЙ,ИФ,,ИФ.БК_ДЕШ
1139416,ФИАНИТ БЕСЦВЕТ,Серебро,СИ ДОРОЖКА,КОЛЬЦО,С ФИАНИТАМИ,БЕЛЫЙ С РОДИРОВАНИЕМ,СИ,СИ КОЛЬЦО ИФ,СИ_КОЛЬЦА_СРЕДН
1139417,,Золото,,КОЛЬЦО,С ФИАНИТАМИ,КРАСНЫЙ,ИФ,,ИФ.БК_ДЕШ
1139418,,Золото,ИФ БРЕНД,КОЛЬЦО,БЕЗ КАМНЕЙ,БЕЛЫЙ,БК,ОПТ БК,ОПТ_БК
