In [1]:
from bs4 import BeautifulSoup as bs
from tqdm import tqdm
import datetime as dt
import pandas as pd
import numpy as np
import warnings
import pymssql
import locale
import json
import glob
import os

warnings.filterwarnings('ignore')
pd.set_option('use_inf_as_na', True)
pd.options.display.max_columns = None
pd.set_option('float_format', '{:.2f}'.format)
locale.setlocale(locale.LC_ALL, 'ru_RU.UTF-8')

'ru_RU.UTF-8'

In [2]:
conn = pymssql.connect(server='spb-s-sql-dwh', database='DWH_DEV', charset='WINDOWS-1251')
cursor = conn.cursor()

sql_product = f'''
SELECT 
[name], [article], [data], [category_id], [id], [price], [old_price]
from [DWH_DEV].[ecom].[catalog_product]
'''

sql_sku = f'''
SELECT 
[data], [ext_id], [id], [deleted]
from [DWH_DEV].[ecom].[catalog_sku]
'''

sql_orders_item = f'''
SELECT 
[data], [price], [state], [id], [order_id], [outlet_id], [product_id]
from [DWH_DEV].[ecom].[orders_orderitem]
'''

sql_orders_order = f'''
SELECT
[id], [is_test]
from [DWH_DEV].[ecom].[orders_order]
where ([DWH_DEV].[ecom].[orders_order].[created] > '2024-02-01' and [DWH_DEV].[ecom].[orders_order].[created] < '2024-05-01')
'''

sql_sku_article = f'''
SELECT
*
from [DWH_DEV].[ecom].[sku_article]
'''

catalog_product = pd.read_sql_query(sql_product, conn)
catalog_sku = pd.read_sql_query(sql_sku, conn)
orders_item = pd.read_sql_query(sql_orders_item, conn)
orders_order = pd.read_sql_query(sql_orders_order, conn)
sku_article = pd.read_sql_query(sql_sku_article, conn)


catalog_product = pd.DataFrame(catalog_product)
catalog_sku = pd.DataFrame(catalog_sku)
orders_item = pd.DataFrame(orders_item)
orders_order = pd.DataFrame(orders_order)
sku_article = pd.DataFrame(sku_article)

In [3]:
catalog_product = catalog_product.query('price > 0')
orders_order = orders_order.query('is_test == False')

In [4]:
sku_article.columns = ['ext_id', 'article']
catalog_sku = catalog_sku.merge(sku_article, how='left', on='ext_id')

del sku_article

catalog_sku = catalog_sku.dropna(subset=['article'])

In [5]:
catalog_product = catalog_product.dropna(subset=['price'])

In [6]:
def parse_column(data):
    try:
        return json.loads(data)
    except Exception as e:
        print(e)
        return None

In [7]:
catalog_product['data'] = catalog_product['data'].apply(lambda x: parse_column(x)) 
catalog_sku['data'] = catalog_sku['data'].apply(lambda x: parse_column(x))
orders_item['data'] = orders_item['data'].apply(lambda x: parse_column(x))

Expecting value: line 1 column 19501 (char 19500)
Unterminated string starting at: line 1 column 19499 (char 19498)


In [8]:
catalog_sku['jewel'] = np.nan
catalog_sku['material'] = np.nan
catalog_sku['design'] = np.nan
catalog_sku['type1'] = np.nan
catalog_sku['type2'] = np.nan
catalog_sku['type3'] = np.nan
catalog_sku['product_line'] = np.nan
catalog_sku['product_group'] = np.nan
catalog_sku['margin_groups'] = np.nan


for i in tqdm(catalog_sku.index):
    try: catalog_sku['jewel'][i] = catalog_sku['data'][i]['jewel']['name']
    except: catalog_sku['jewel'][i] = np.nan 
    
    try: catalog_sku['material'][i] = catalog_sku['data'][i]['material']['name']
    except: catalog_sku['material'][i] = np.nan 
    
    try: catalog_sku['design'][i] = catalog_sku['data'][i]['design']['name']
    except: catalog_sku['design'][i] = np.nan 
    
    try: catalog_sku['type1'][i] = catalog_sku['data'][i]['type1']['name']
    except: catalog_sku['type1'][i] = np.nan 
    
    try: catalog_sku['type2'][i] = catalog_sku['data'][i]['type2']['name']
    except: catalog_sku['type2'][i] = np.nan 
    
    try: catalog_sku['type3'][i] = catalog_sku['data'][i]['type3']['name']
    except: catalog_sku['type3'][i] = np.nan 
    
    try: catalog_sku['product_line'][i] = catalog_sku['data'][i]['product_line']['name']
    except: catalog_sku['product_line'][i] = np.nan
    
    try: catalog_sku['product_group'][i] = catalog_sku['data'][i]['product_group']['name']
    except: catalog_sku['product_group'][i] = np.nan
    
    try: catalog_sku['margin_groups'][i] = catalog_sku['data'][i]['margin_groups']['name']
    except: catalog_sku['margin_groups'][i] = np.nan

100%|██████████| 528929/528929 [00:55<00:00, 9614.09it/s] 


In [9]:
catalog_sku.drop(columns=['data'], inplace = True)
catalog_product = catalog_product.merge(catalog_sku, how='left', on='article')

del catalog_sku

In [10]:
catalog_product['type_for_TG'] = np.nan
catalog_product['type1_from_cp2'] = np.nan
catalog_product['fineness'] = np.nan
catalog_product['brand'] = np.nan 


for i in tqdm(catalog_product.index):
    try: catalog_product['type_for_TG'][i] = catalog_product['data'][i]['kind'][0]['name']
    except: catalog_product['type_for_TG'][i] = np.nan 

    try: catalog_product['type1_from_cp2'][i] = catalog_product['data'][i]['type'][0]['name']
    except: catalog_product['type1_from_cp2'][i] = np.nan 

    try: catalog_product['fineness'][i] = catalog_product['data'][i]['fineness']['description']
    except: catalog_product['fineness'][i] = np.nan 
    
    try: catalog_product['brand'][i] = catalog_product['data'][i]['brand']['name']
    except: catalog_product['brand'][i] = np.nan 

100%|██████████| 316929/316929 [00:40<00:00, 7808.91it/s] 


In [11]:
catalog_product.drop(columns=['data'], inplace = True)

In [12]:
orders_item = orders_item.query('state == "done" & price > 0')
orders_item = orders_item.drop(columns=['state'])

In [27]:
orders_item['count'] = np.nan 
orders_item['revenue'] = np.nan 
orders_item['sum_cost_price'] = np.nan
orders_item['size']  = np.nan 

for i in tqdm(orders_item.index): 
    try: orders_item['count'][i] = orders_item['data'][i]['balance']['count']
    except: orders_item['count'][i] = np.nan 
    
    try: orders_item['revenue'][i] = orders_item['price'][i] * orders_item['data'][i]['balance']['count']
    except: orders_item['revenue'][i] = np.nan 
    
    try: orders_item['sum_cost_price'][i] = orders_item['data'][i]['balance']['cost_price'] * orders_item['data'][i]['balance']['count']
    except: orders_item['sum_cost_price'][i] = np.nan
    
    try: orders_item['size'][i] = orders_item['data'][i]['size']
    except:orders_item['size'][i] = np.nan
        

100%|██████████| 1215918/1215918 [13:51<00:00, 1462.53it/s]


In [50]:
catalog_product = catalog_product[catalog_product['deleted'].astype(str) == 'NaT']
catalog_product.drop(columns=['deleted'], inplace=True)

In [77]:
conn = pymssql.connect(server='AX-SQL', database='Staging')
cursor = conn.cursor()

staging = '''
SELECT [Код склада], [Товарное направление], [Товарная группа], [Запрет скидки], [Код товара]
from [Staging].[Reports].[Remainings_8h]
'''
staging = pd.read_sql_query(staging, conn)

In [None]:
staging.rename(columns={
    'Код склада': 'code',
    'Товарное направление': 'tn',
    'Товарная группа': 'tg',
    'Запрет скидки': 'discount',
    'Код товара': 'ext_id'
}, inplace=True)

catalog_product = catalog_product.merge(staging, how='left', on='ext_id')

del staging