# Algoritmo FP-Growth 
Aplicação do algoritmo FP-Growth para explorar regras de associação em um dataset de transações de supermercado

## Carregamento e Preparação dos Dados

In [None]:
# importação de bibliotecas
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules

In [2]:
# carregando o conjunto de dados
df = pd.read_csv('Groceries_dataset.csv')

In [3]:
df

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk
...,...,...,...
36875,2441,28-08-2014,hard cheese
36876,3645,22-08-2014,sugar
36877,2246,21-01-2014,newspapers
36878,4431,12-06-2014,soda


In [None]:
# filtragem por um cliente específico
df[df['Member_number']== 1808] 

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
4355,1808,04-02-2015,long life bakery product
9090,1808,29-11-2014,meat
11488,1808,15-12-2014,sugar
16149,1808,21-07-2015,rolls/buns
20504,1808,04-02-2015,semi-finished bread
25239,1808,29-11-2014,whole milk
27637,1808,15-12-2014,citrus fruit
36088,1808,21-07-2015,candy


In [None]:
# agrupando os itens comprados pelo cliente 1808 por data
agrupado = df[df['Member_number']== 1808].groupby('Date')['itemDescription'].sum()

In [13]:
agrupado

Date
04-02-2015    long life bakery productsemi-finished bread
15-12-2014                              sugarcitrus fruit
21-07-2015                  tropical fruitrolls/bunscandy
29-11-2014                                 meatwhole milk
Name: itemDescription, dtype: object

## Transformando Dados

In [None]:
# argrupando dados por cliente e data criando listas de itens comprados em cada transação
transacoes_agrupadas = df.groupby(['Member_number', 'Date'])['itemDescription'].apply(list).reset_index()

In [18]:
transacoes_agrupadas

Unnamed: 0,Member_number,Date,itemDescription
0,1000,15-03-2015,"[sausage, whole milk, semi-finished bread, yog..."
1,1000,24-06-2014,"[whole milk, pastry, salty snack]"
2,1000,24-07-2015,"[canned beer, misc. beverages]"
3,1000,25-11-2015,"[sausage, hygiene articles]"
4,1000,27-05-2015,"[soda, pickled vegetables]"
...,...,...,...
14958,4999,24-01-2015,"[tropical fruit, berries, other vegetables, yo..."
14959,4999,26-12-2015,"[bottled water, herbs]"
14960,5000,09-03-2014,"[fruit/vegetable juice, onions]"
14961,5000,10-02-2015,"[soda, root vegetables, semi-finished bread]"


In [None]:
# instanciando o codificador que transforma listas em uma matriz
te = TransactionEncoder()

In [None]:
# transformando as listas de compras em uma matriz booleana (1= item presente 0= item ausente)
te_ary = te.fit(transacoes_agrupadas['itemDescription']).transform(transacoes_agrupadas['itemDescription'])
# criando o dataframe binário, para aplicação do fpgrowth
df_te = pd.DataFrame(te_ary, columns=te.columns_)

In [21]:
df_te

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14958,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False
14959,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14960,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14961,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Aplicação do FP-Growth

In [None]:
# Encontra conjuntos frequentes de itens com suporte mínimo de 0,5% (parecem em pelo menos 0,5% das transações).
itens_frequentes = fpgrowth(df_te, min_support=0.005, use_colnames=True) # use_colnames=True mostra os nomes dos itens (não os índices)
print(itens_frequentes)

      support                           itemsets
0    0.155784                       (whole milk)
1    0.082671                           (yogurt)
2    0.060349                          (sausage)
3    0.008889              (semi-finished bread)
4    0.048921                           (pastry)
..        ...                                ...
114  0.006616        (whole milk, bottled water)
115  0.005413  (other vegetables, bottled water)
116  0.006616         (whole milk, bottled beer)
117  0.007151         (citrus fruit, whole milk)
118  0.005012                 (whole milk, pork)

[119 rows x 2 columns]


## Geração de Regras de Associação

In [None]:
# Extraindo regras de associação a partir dos conjuntos frequentes, usando a métrica 'confiança', com limiar mínimo de 0,5%
rules = association_rules(itens_frequentes, metric="confidence", min_threshold=0.005)

In [None]:
# friltrando as regras  em que 'soda' está no anrecedente (lado esquerdo da regra)
refrigerante_rules = rules[rules['antecedents'].apply(lambda x: 'soda' in x)]

In [29]:
refrigerante_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
2,(soda),(yogurt),0.092762,0.082671,0.00528,0.056916,0.688473,1.0,-0.002389,0.972692,-0.33278,0.031029,-0.028075,0.06039
15,(soda),(sausage),0.092762,0.060349,0.005881,0.063401,1.050568,1.0,0.000283,1.003258,0.053055,0.039946,0.003248,0.080427
23,(soda),(whole milk),0.092762,0.155784,0.011094,0.119597,0.767706,1.0,-0.003357,0.958896,-0.250105,0.046721,-0.042865,0.095405
24,(soda),(other vegetables),0.092762,0.121567,0.009423,0.101585,0.835633,1.0,-0.001854,0.977759,-0.178178,0.045988,-0.022747,0.08955
26,(soda),(rolls/buns),0.092762,0.105794,0.007418,0.079971,0.755912,1.0,-0.002395,0.971932,-0.262494,0.038811,-0.028878,0.075046
40,(soda),(tropical fruit),0.092762,0.0677,0.005413,0.058357,0.861995,1.0,-0.000867,0.990078,-0.149999,0.034914,-0.010021,0.069159


In [None]:
# ordenando as regras com base na confiança em ordem decrescente
refrigerante_rules_sorted = refrigerante_rules.sort_values(by='confidence', ascending=False)

In [36]:
refrigerante_rules_sorted

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
23,(soda),(whole milk),0.092762,0.155784,0.011094,0.119597,0.767706,1.0,-0.003357,0.958896,-0.250105,0.046721,-0.042865,0.095405
24,(soda),(other vegetables),0.092762,0.121567,0.009423,0.101585,0.835633,1.0,-0.001854,0.977759,-0.178178,0.045988,-0.022747,0.08955
26,(soda),(rolls/buns),0.092762,0.105794,0.007418,0.079971,0.755912,1.0,-0.002395,0.971932,-0.262494,0.038811,-0.028878,0.075046
15,(soda),(sausage),0.092762,0.060349,0.005881,0.063401,1.050568,1.0,0.000283,1.003258,0.053055,0.039946,0.003248,0.080427
40,(soda),(tropical fruit),0.092762,0.0677,0.005413,0.058357,0.861995,1.0,-0.000867,0.990078,-0.149999,0.034914,-0.010021,0.069159
2,(soda),(yogurt),0.092762,0.082671,0.00528,0.056916,0.688473,1.0,-0.002389,0.972692,-0.33278,0.031029,-0.028075,0.06039
