In [119]:
import pandas as pd
import numpy as np
import ds_functions as dsf
import seaborn as sns
import plotly.express as px

tsum_exel = pd.ExcelFile('data/Data_TSUM.xlsx')
df_pars = pd.read_excel(tsum_exel, 'Data_Parsing')
df_comp = pd.read_excel(tsum_exel, 'Data_Company')

rep_chars = '_|-|~|/|\\\\'
df_pars['producer_color'] = df_pars['producer_color'].str.replace(rep_chars,"", regex=True)
df_pars['producer_id'] = df_pars['producer_id'].str.replace(rep_chars,"", regex=True)

df_merged = pd.merge(
    df_pars,
    df_comp[['item_id', 'color_id', 'current price']],
    left_on='producer_id',
    right_on='item_id',
    how='left'
    ).drop(['producer_id', 'producer_color'], axis=1)

df_merged['persent_diff'] = 100 - round(df_merged['price'] / df_merged['current price'] * 100)

grouped_df = df_merged.groupby(['brand', 'Category']).mean().round()
grouped_df


Unnamed: 0_level_0,Unnamed: 1_level_0,price,current price,persent_diff
brand,Category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Brioni,Bags,203.0,201.0,-4.0
Brioni,Shoes,238.0,233.0,-5.0
Stone Island,Bags,210.0,218.0,1.0
Stone Island,Shoes,225.0,225.0,-4.0
Valentino,Bags,229.0,232.0,-3.0
Valentino,Shoes,209.0,223.0,-0.0


In [128]:
brand_list = list(df_merged['brand_cat'].value_counts().index)

for brand_name in brand_list:
    mask = df_merged['brand_cat'] == brand_name
    outliers, cleaned = dsf.outliers_iqr_mod(df_merged[mask], 'persent_diff')
    print(f'Tjuki outliers for brand {brand_name}: {outliers.shape[0]}')
    outliers, cleaned = dsf.outliers_z_score_mod(df_merged, 'persent_diff', log_scale=False)
    print(f'Z scale outliers for brand {brand_name}:{outliers.shape[0]}')



Tjuki outliers for brand Valentino/Shoes: 0
Z scale outliers for brand Valentino/Shoes:0
Tjuki outliers for brand Brioni/Shoes: 4
Z scale outliers for brand Brioni/Shoes:0
Tjuki outliers for brand Stone Island/Shoes: 0
Z scale outliers for brand Stone Island/Shoes:0
Tjuki outliers for brand Valentino/Bags: 0
Z scale outliers for brand Valentino/Bags:0
Tjuki outliers for brand Brioni/Bags: 0
Z scale outliers for brand Brioni/Bags:0
Tjuki outliers for brand Stone Island/Bags: 1
Z scale outliers for brand Stone Island/Bags:0


In [120]:
display(df_merged['persent_diff'].describe())

count    75.000000
mean     -2.626667
std      28.754682
min     -68.000000
25%     -18.000000
50%       4.000000
75%      18.500000
max      42.000000
Name: persent_diff, dtype: float64

In [122]:
df_merged[(df_merged['brand'] == 'Brioni') & (df_merged['Category'] == 'Shoes')]

df_merged['brand_cat'] = df_merged['brand'] + '/' + df_merged['Category']


Unnamed: 0,brand,Category,price,item_id,color_id,current price,persent_diff,brand_cat
0,Valentino,Shoes,167,aaaaa111111,black,247,32.0,Valentino/Shoes
1,Valentino,Shoes,188,aaaaa111112,black,161,-17.0,Valentino/Shoes
2,Valentino,Shoes,184,aaaaa111113,black,234,21.0,Valentino/Shoes
3,Valentino,Shoes,196,aaaaa111114,black,167,-17.0,Valentino/Shoes
4,Valentino,Shoes,250,aaaaa111115,black,153,-63.0,Valentino/Shoes
...,...,...,...,...,...,...,...,...
70,Stone Island,Bags,164,sssss111131,red,165,1.0,Stone Island/Bags
71,Stone Island,Bags,158,sssss111132,red,196,19.0,Stone Island/Bags
72,Stone Island,Bags,194,sssss111133,red,236,18.0,Stone Island/Bags
73,Stone Island,Bags,256,sssss111134,red,222,-15.0,Stone Island/Bags


In [127]:
box3 = px.box(
    data_frame = df_merged,
    x = 'persent_diff',
    y = 'brand_cat',
    width = 800,
    height = 400,
    color='brand_cat'
    )

box3.show()