# Сказ о пиве и подгузниках

Если очень сильно хочется, можно анализировать покупки людей в магазинах. Именно этим мы в этой небольшой тетрадке и займёмся. 

In [92]:
import pandas as pd
import numpy as np

Подгружаем данные и смотрим, как они выглядят.

In [93]:
# Each line of the file is one receipt: a comma-separated list of products.
# Reading with sep=';' keeps every line whole in column 0 (the displayed array
# shows full comma-separated strings; presumably ';' never occurs in the data).
# The file has no header row, which pandas spells header=None; the old
# header=-1 form is rejected by current pandas versions.
df = pd.read_csv('groceries.csv', sep=';', header=None)
# Series.get_values() was removed in pandas 1.0; to_numpy() is its replacement.
purches = df[0].to_numpy()
purches

array(['citrus fruit,semi-finished bread,margarine,ready soups',
       'tropical fruit,yogurt,coffee', 'whole milk', ...,
       'chicken,citrus fruit,other vegetables,butter,yogurt,frozen dessert,domestic eggs,rolls/buns,rum,cling film/bags',
       'semi-finished bread,bottled water,soda,bottled beer',
       'chicken,tropical fruit,other vegetables,vinegar,shopping bags'],
      dtype=object)

Наши данные — это длинные списки покупок. Например, в один из первых чеков вошли вот такие продукты: 

In [94]:
purches[0]

'citrus fruit,semi-finished bread,margarine,ready soups'

Давайте немного упорядочим наши данные. Превратим их из списочного вида в табличку. По строкам будут наблюдения, по столбцам названия продуктов. Если продукт был куплен в каком-то чеке, в колонке будет стоять $1$. Если не был, $0$.

In [26]:
# One dict per receipt: every product named in the receipt maps to 1.
purches_dicts = [
    dict.fromkeys(receipt.split(','), 1)
    for receipt in purches
]

Итак, итоговая таблица покупок. 

In [32]:
# Rows are receipts, columns are products. A product absent from a receipt
# comes out of the constructor as NaN, which is then replaced with 0.
basket_table = pd.DataFrame(purches_dicts)
df = basket_table.fillna(value=0)
print(df.shape)
df.head(5)

(9835, 169)


Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,baby food,bags,baking powder,bathroom cleaner,beef,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


Какие продукты в выборке встречаются чаще всего.

In [45]:
# Column sums count the receipts containing each product; show the twenty
# most frequent products, largest first.
product_counts = df.sum(axis=0)
product_counts.sort_values(ascending=False).head(20)

whole milk               2513.0
other vegetables         1903.0
rolls/buns               1809.0
soda                     1715.0
yogurt                   1372.0
bottled water            1087.0
root vegetables          1072.0
tropical fruit           1032.0
shopping bags             969.0
sausage                   924.0
pastry                    875.0
citrus fruit              814.0
bottled beer              792.0
newspapers                785.0
canned beer               764.0
pip fruit                 744.0
fruit/vegetable juice     711.0
whipped/sour cream        705.0
brown bread               638.0
domestic eggs             624.0
dtype: float64

Матрица скоррелированности продуктов.

In [52]:
# Pairwise Pearson correlation between the product indicator columns
# (Pearson is the default method of DataFrame.corr, spelled out here).
correlations = df.corr(method='pearson')
correlations.head(5)

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,baby food,bags,baking powder,bathroom cleaner,beef,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
Instant food products,1.0,0.008598,-0.005378,-0.005141,-0.002223,-0.000907,-0.001815,0.013845,-0.004721,0.019695,...,0.017214,0.006884,-0.000215,0.005904,-0.002568,0.026516,-0.012528,0.025631,0.009795,0.033731
UHT-milk,0.008598,1.0,-0.011118,0.029089,-0.004596,-0.001876,-0.003753,0.013637,0.011853,-0.008271,...,0.014629,0.013074,0.024576,0.014065,-0.005308,-0.002391,-0.013479,-0.058426,0.044235,0.025421
abrasive cleaner,-0.005378,-0.011118,1.0,0.026568,-0.001477,-0.000603,-0.001205,-0.00802,0.029497,0.031886,...,-0.005412,0.037635,0.005816,0.036346,-0.001705,0.012982,0.004182,0.027627,0.02522,-0.004986
artif. sweetener,-0.005141,0.029089,0.026568,1.0,-0.001412,-0.000576,-0.001152,0.032963,-0.002998,-0.005436,...,-0.005174,0.017582,-0.002135,-0.008955,-0.00163,0.005806,-0.007954,0.011558,0.033681,-0.004767
baby cosmetics,-0.002223,-0.004596,-0.001477,-0.001412,1.0,-0.000249,-0.000498,0.02792,-0.001296,-0.005814,...,-0.002237,-0.002,0.016481,0.009097,-0.000705,0.015327,-0.00344,0.01385,-0.009948,-0.002062


Посмотрим, какое именно пиво есть в выборке.

In [51]:
# Names of the product columns that contain the substring 'beer'.
list(filter(lambda name: 'beer' in name, df.columns.values))

['bottled beer', 'canned beer']

Коррелирующие продукты для бутылочного пива.

In [60]:
# Ten products with the highest correlation to bottled beer, strongest first.
bottled_beer_corr = correlations.loc[:, 'bottled beer']
bottled_beer_corr.sort_values(ascending=False).iloc[:10]

bottled beer            1.000000
liquor                  0.132855
red/blush wine          0.089220
bottled water           0.080402
prosecco                0.052998
liquor (appetizer)      0.049366
tea                     0.047823
specialty vegetables    0.041658
roll products           0.036569
brandy                  0.033047
Name: bottled beer, dtype: float64

Наоборот, продукты, которые «мешают» купить бутылочное пиво, то есть отрицательно с ним коррелируют.

In [78]:
# Ten products with the lowest (most negative) correlation to bottled beer.
# sort_values() sorts ascending by default.
correlations.loc[:, 'bottled beer'].sort_values().iloc[:10]

canned beer          -0.049591
pastry               -0.033420
butter milk          -0.029794
shopping bags        -0.028878
whipped/sour cream   -0.025744
cream cheese         -0.025670
specialty bar        -0.022136
berries              -0.021536
candy                -0.021230
liver loaf           -0.021155
Name: bottled beer, dtype: float64

Продукты, которые чаще всего покупают с баночным пивом.

In [61]:
# Ten products with the highest correlation to canned beer, strongest first.
canned_beer_corr = correlations.loc[:, 'canned beer']
canned_beer_corr.sort_values(ascending=False).iloc[:10]

canned beer           1.000000
liquor (appetizer)    0.046854
shopping bags         0.046811
male cosmetics        0.030982
red/blush wine        0.028549
baby cosmetics        0.023598
brandy                0.016596
liqueur               0.016343
cream                 0.010352
syrup                 0.010100
Name: canned beer, dtype: float64

Посмотрим ещё на парочку продуктов.

In [63]:
# Ten products with the highest correlation to coffee, strongest first.
coffee_corr = correlations.loc[:, 'coffee']
coffee_corr.sort_values(ascending=False).iloc[:10]

coffee                  1.000000
condensed milk          0.082529
sugar                   0.066510
cling film/bags         0.059407
UHT-milk                0.055370
shopping bags           0.052144
kitchen towels          0.048280
oil                     0.044692
specialty vegetables    0.042004
pickled vegetables      0.041921
Name: coffee, dtype: float64

In [79]:
# Ten products with the lowest (most negative) correlation to coffee.
# sort_values() sorts ascending by default.
correlations.loc[:, 'coffee'].sort_values().iloc[:10]

frozen fish              -0.018915
beverages                -0.018740
finished products        -0.014685
white wine               -0.012279
organic sausage          -0.011755
prosecco                 -0.011207
flower soil/fertilizer   -0.010923
ready soups              -0.010631
rum                      -0.010128
decalcifier              -0.009703
Name: coffee, dtype: float64

Какой-нибудь рандомный продукт...

In [90]:
# Draw one random column position, look up the product name at that position,
# and list the ten products most positively correlated with it.
n_products = df.shape[1]
index = np.random.randint(0, n_products, 1)[0]
product = df.columns[index]
print(product)
correlations.loc[:, product].sort_values(ascending=False).iloc[:10]

baby cosmetics


cream             0.112438
dish cleaner      0.078359
cookware          0.077403
syrup             0.070897
kitchen towels    0.051406
soups             0.048015
butter milk       0.045765
oil               0.045669
sweet spreads     0.041124
mustard           0.035099
Name: baby cosmetics, dtype: float64

In [91]:
# Ten products with the lowest (most negative) correlation to the randomly
# chosen product. sort_values() sorts ascending by default.
correlations.loc[:, product].sort_values().iloc[:10]

yogurt                  -0.009948
bottled water           -0.008709
sausage                 -0.007956
pastry                  -0.007721
bottled beer            -0.007312
newspapers              -0.007277
fruit/vegetable juice   -0.006897
brown bread             -0.006507
domestic eggs           -0.006431
frankfurter             -0.006185
Name: baby cosmetics, dtype: float64

На основе скоррелированности можно находить среди продуктов целые кластеры товаров, которые покупают вместе. А ещё все гипотезы о корреляциях должны по-честному тестироваться. Но это уже совсем другая история, которая не подходит для коротенького последнего занятия. 

![](https://pp.userapi.com/c638028/v638028181/52e5e/1X-dkzNN1hk.jpg)