# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] utwórz wirtualne środowisko:
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

## Część 1. - przygotowanie danych

In [3]:
from email.policy import default

# importujemy wszystkie potrzebne pakiety

from more_itertools import powerset

In [4]:
# define constants

PATH = './basket.csv'  # CSV file handed to read_baskets
EPSILON = 0.001  # support threshold handed to get_supports; itemsets at or below it are not kept
K = 4  # largest itemset size enumerated by get_supports

In [5]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[set[str]]:
    """Load shopping baskets from a CSV file.

    The first line is treated as a header and skipped. Every remaining
    non-empty line becomes one basket: its non-empty, comma-separated cells
    are lower-cased and collected into a set, so a product repeated within
    one line appears only once.

    Args:
        path: Location of the CSV file.

    Returns:
        One set of lower-cased product names per non-empty data line, in
        file order.
    """
    # An explicit encoding keeps parsing independent of the platform locale.
    with open(path, encoding="utf-8") as f:
        raw = f.read()

    data_rows = raw.split('\n')[1:]  # drop the header line
    return [
        {cell.lower() for cell in row.split(',') if cell}
        for row in data_rows
        if row
    ]

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    """Return every distinct product appearing in *baskets*, sorted ascending."""
    distinct = {item for basket in baskets for item in basket}
    return sorted(distinct)

# Load all baskets from PATH, then derive the sorted list of distinct products.
baskets = read_baskets(PATH)
products = unique_products(baskets)

## Część 2. - obliczanie wskaźników

In [35]:
from collections import defaultdict


# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`

def get_supports(baskets: list[tuple[str]], all_products: list[str], epsilon: float) -> dict[frozenset[str], float]:
    supports = {}
    baskets_by_products = _get_baskets_by_product(baskets)
    
    for basket in baskets:
        for sub_basket in powerset(basket):
            if len(sub_basket) > K:
                break
                
            sub_basket_set = frozenset(sub_basket)
            if sub_basket_set in supports:
                continue
                
            occurrences = 0
            for product in sub_basket:
                for b in baskets_by_products[product]:
                    if sub_basket_set.issubset(set(b)):
                        # basket will occur for every product in sub_basket
                        # so we need to divide `occurrences` by the number of products in sub_basket
                        occurrences += 1
            
            support = occurrences / len(baskets)
            if len(sub_basket) > 0:
                support /= len(sub_basket)
                
            # ideally if support of sub_basket is less than epsilon, we should
            # not calculate the values for baskets that contain this sub_basket
            if support <= epsilon: 
                continue
                
            supports[sub_basket_set] = support

            
    return supports
            
            
def _get_baskets_by_product(baskets: list[tuple[str]]) -> dict[str, list[tuple[str]]]:
    """Build an inverted index from product to the baskets that contain it.

    Each basket object is appended once per occurrence of the product inside
    it, in the order the baskets are given. The result is a
    ``defaultdict(list)``, so looking up an unknown product yields ``[]``.
    """
    index = defaultdict(list)
    memberships = ((item, entry) for entry in baskets for item in entry)
    for item, entry in memberships:
        index[item].append(entry)

    return index
                
                
    
# Compute the supports of all itemsets above EPSILON.
supports = get_supports(baskets, products, EPSILON)
# Bare expression: presumably here so the notebook shows the mapping as the cell output.
supports

{frozenset({'salty snack'}): 0.018779656485998796,
 frozenset({'pastry'}): 0.0517275947336764,
 frozenset({'whole milk'}): 0.15792287642852368,
 frozenset({'salty snack', 'whole milk'}): 0.0019381140145692708,
 frozenset({'pastry', 'whole milk'}): 0.006482657221145492,
 frozenset({'yogurt'}): 0.08587850030074183,
 frozenset({'sausage'}): 0.06034886052262247,
 frozenset({'semi-finished bread'}): 0.009490075519615051,
 frozenset({'sausage', 'yogurt'}): 0.005747510525964045,
 frozenset({'whole milk', 'yogurt'}): 0.011160863463209249,
 frozenset({'sausage', 'whole milk'}): 0.008955423377664907,
 frozenset({'semi-finished bread', 'whole milk'}): 0.001670787943594199,
 frozenset({'sausage', 'whole milk', 'yogurt'}): 0.0014702933903628951,
 frozenset({'soda'}): 0.09710619528169484,
 frozenset({'pickled vegetables'}): 0.008955423377664907,
 frozenset({'misc. beverages'}): 0.01577223818752924,
 frozenset({'canned beer'}): 0.04691572545612511,
 frozenset({'hygiene articles'}): 0.0137004611374724

In [36]:
# Number of itemsets kept in `supports`.
len(supports)

750

In [37]:
# definiujemy funkcje obliczajace support, confidence i lift

def support(supports: dict[frozenset[str], float], products: tuple[str]) -> float:
    """Look up the support of the itemset made of *products*.

    The order of *products* does not matter. An itemset that is not present
    in *supports* has support ``0.0``.
    """
    itemset = frozenset(products)
    if itemset in supports:
        return supports[itemset]
    return 0.0


def confidence(supports: dict[frozenset[str], float], prior_products: tuple[str], following_products: tuple[str]) -> float:
    """Return the confidence of the rule *prior_products* -> *following_products*.

    Confidence is support(prior + following) / support(prior). When the
    prior itemset has zero support the rule is undefined and ``0.0`` is
    returned instead.
    """
    joint_support = support(supports, prior_products + following_products)
    prior_support = support(supports, prior_products)
    if prior_support == 0:
        return 0.0
    return joint_support / prior_support


def lift(supports:  dict[frozenset[str], float], prior_products: tuple[str], following_products: tuple[str]) -> float:
    """Return the lift of the rule *prior_products* -> *following_products*.

    Lift is support(prior + following) / (support(prior) * support(following)).
    When that denominator is zero the rule is undefined and ``0.0`` is
    returned instead.
    """
    joint_support = support(supports, prior_products + following_products)
    independent_support = support(supports, prior_products) * support(supports, following_products)
    if independent_support == 0:
        return 0.0
    return joint_support / independent_support

In [38]:
# Print support of {whole milk, rolls/buns}, then confidence and lift of the
# rule {whole milk, rolls/buns} -> {yogurt}.
print(support(supports, ('whole milk', 'rolls/buns')))
print(confidence(supports, ('whole milk', 'rolls/buns'), ('yogurt',)))
print(lift(supports, ('whole milk', 'rolls/buns'), ('yogurt',)))

0.013967787208447505
0.09569377990430622
1.1142926293448512


## Część 3. - generowanie rekomendacji

In [39]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i jak najwyzszy confidence

def generate_basic_candidates(basket: tuple[str], products: list[str], supports: dict[frozenset[str], float]) -> list[tuple[str, tuple[str], float, float]]:
    """List products worth recommending for *basket*, best confidence first.

    Every non-empty sub-basket of *basket* is tried as the prior of a rule
    ``sub_basket -> product`` for each product not already in the basket.
    A rule is kept when its lift is greater than 1.

    Args:
        basket: Products already chosen. A repeated product is considered once.
        products: All products that may be recommended.
        supports: Mapping from itemset to support.

    Returns:
        Tuples ``(product, sub_basket, confidence, lift)`` sorted by
        confidence in descending order; ties keep generation order.
    """
    # A rule needs the support of `sub_basket + product`, an itemset one larger
    # than the prior. Priors at least as large as the biggest known itemset can
    # therefore never reach a positive lift.
    largest_itemset = max(map(len, supports), default=0)
    owned = frozenset(basket)

    candidates: list[tuple[str, tuple[str], float, float]] = []
    # dict.fromkeys() drops repeated products while keeping their order.
    for sub_basket in powerset(dict.fromkeys(basket)):
        if not sub_basket:
            continue
        if len(sub_basket) >= largest_itemset:
            # powerset() yields subsets by increasing size, so every later
            # sub-basket is too large as well.
            break
        if frozenset(sub_basket) not in supports:
            # Unknown prior: confidence and lift are 0.0 for every product.
            continue

        for product in products:
            if product in owned:
                continue
            lift_val = lift(supports, sub_basket, (product,))
            if lift_val > 1:
                confidence_val = confidence(supports, sub_basket, (product,))
                candidates.append((product, sub_basket, confidence_val, lift_val))

    return sorted(candidates, key=lambda x: x[2], reverse=True)

In [40]:
# zaproponuj drugi, bardziej zaawansowany algorytm, np.:
# - jesli produkt X wystepuje w liscie kandydatow kilkukrotnie, oblicz srednia lub iloczyn confidence
# - posortuj kandydatow po iloczynie confidence i lift

def generate_advanced_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, tuple[str] | None, float, float]]:
    """Extend the basic candidates with one averaged entry per product.

    For each product among the basic candidates, an extra tuple
    ``(product, None, mean_confidence, 1)`` is added, where the mean is taken
    over all basic candidates recommending that product. The combined list
    (basic entries plus averaged entries) is sorted by confidence * lift in
    descending order.
    """
    ranked = generate_basic_candidates(basket, products, supports)

    # Group the confidence values of the basic candidates by recommended product.
    scores_by_product = defaultdict(list)
    for name, _prior, score, _lift in ranked:
        scores_by_product[name].append(score)

    ranked.extend(
        (name, None, sum(scores) / len(scores), 1)
        for name, scores in scores_by_product.items()
    )

    return sorted(ranked, key=lambda row: row[2] * row[3], reverse=True)


In [42]:
# Show basket 1 and generate recommendations for it. The basket is a set, and
# confidence/lift join their arguments with `+`, hence the tuple() conversion.
print(baskets[1])
# NOTE(review): this result is neither assigned nor printed; a notebook shows
# only the last expression of a cell — confirm whether it was meant to be displayed.
generate_basic_candidates(tuple(baskets[1]), products, supports)
generate_advanced_candidates(tuple(baskets[1]), products, supports)

{'yogurt', 'sausage', 'semi-finished bread', 'whole milk'}


[('soda', ('sausage', 'whole milk'), 0.11940298507462685, 1.2296124333596985),
 ('rolls/buns',
  ('sausage', 'whole milk'),
  0.12686567164179105,
  1.1532752398396837),
 ('rolls/buns',
  ('yogurt', 'whole milk'),
  0.11976047904191618,
  1.0886853267947703),
 ('rolls/buns', None, 0.12331307534185362, 1),
 ('soda', None, 0.10898166972446735, 1),
 ('soda', ('sausage',), 0.09856035437430785, 1.0149749363405152),
 ('curd', ('sausage',), 0.04872646733111849, 1.4466153386419167),
 ('bottled beer', ('sausage',), 0.05537098560354374, 1.2220000849348451),
 ('bottled beer', None, 0.05537098560354374, 1),
 ('pastry', ('sausage',), 0.05315614617940199, 1.0276168156103256),
 ('citrus fruit', ('yogurt',), 0.053696498054474705, 1.0106423904265471),
 ('citrus fruit', None, 0.053696498054474705, 1),
 ('pastry', None, 0.05315614617940199, 1),
 ('curd', None, 0.04872646733111849, 1),
 ('frozen vegetables', ('sausage',), 0.03433001107419712, 1.2259664813919129),
 ('beverages', ('sausage',), 0.02547065337

In [44]:
# Show basket 33 and generate recommendations for it. The basket is a set, and
# confidence/lift join their arguments with `+`, hence the tuple() conversion.
print(baskets[33])
# NOTE(review): this result is neither assigned nor printed; a notebook shows
# only the last expression of a cell — confirm whether it was meant to be displayed.
generate_basic_candidates(tuple(baskets[33]), products, supports)
generate_advanced_candidates(tuple(baskets[33]), products, supports)

{'yogurt', 'photo/film', 'white wine', 'domestic eggs', 'root vegetables', 'soda', 'tropical fruit'}


[('sausage', ('yogurt',), 0.0669260700389105, 1.1089864739670185),
 ('sausage', None, 0.06408932545304094, 1),
 ('sausage', ('soda',), 0.06125258086717137, 1.0149749363405152),
 ('citrus fruit', ('yogurt',), 0.053696498054474705, 1.0106423904265471),
 ('citrus fruit', None, 0.053696498054474705, 1),
 ('shopping bags',
  ('root vegetables',),
  0.04803073967339097,
  1.0093875810856026),
 ('shopping bags', None, 0.04803073967339097, 1),
 ('coffee', ('domestic eggs',), 0.03783783783783784, 1.1969716016227643),
 ('newspapers', ('domestic eggs',), 0.04144144144144145, 1.0654437943097737),
 ('newspapers', None, 0.04144144144144145, 1),
 ('frankfurter', ('domestic eggs',), 0.03783783783783784, 1.0020664912700312),
 ('coffee', None, 0.03783783783783784, 1),
 ('frankfurter', None, 0.03783783783783784, 1),
 ('frozen vegetables',
  ('root vegetables',),
  0.030739673390970224,
  1.0977511526231203),
 ('frozen vegetables', None, 0.030739673390970224, 1),
 ('white bread', ('domestic eggs',), 0.027