# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

## Część 1. - przygotowanie danych

In [31]:
# importujemy wszystkie potrzebne pakiety
from more_itertools import powerset

In [32]:
# definiujemy stale

PATH = './basket.csv'
EPSILON = 0.000001

In [33]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[tuple[str]]:
    with open(path) as f:
        raw = f.read()
    baskets = [set([y.lower() for y in x.split(',') if y]) for x in raw.split('\n')[1:] if x]
    return baskets

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    products = set()
    for basket in baskets:
        products.update(basket)
    return sorted(list(products))

baskets = read_baskets(PATH)
products = unique_products(baskets)

## Część 2. - obliczanie wskaźników

In [34]:
# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`

def get_supports(baskets: list[tuple[str]], all_products: list[str], epsilon: float):
    supports = {}
    
    def add_layer(current_basket: tuple[str], considered_baskets):
        for product in all_products:
            if product in current_basket:
                continue
            new_basket = current_basket + (product, )
            if tuple(sorted(new_basket)) in supports:
                continue
            count = 0
            new_considered_baskets = []
            for basket in considered_baskets:
                if set(new_basket) <= set(basket):
                    count += 1
                    new_considered_baskets.append(basket)
            support = count / len(baskets)
            if support >= epsilon:
                supports[tuple(sorted(new_basket))] = support
                add_layer(new_basket, new_considered_baskets)

    add_layer((), baskets)

    return supports
        

    
supports = get_supports(baskets, products, EPSILON)
supports

{('abrasive cleaner',): 0.0014702933903628951,
 ('abrasive cleaner', 'beef'): 0.00013366303548753594,
 ('abrasive cleaner', 'beef', 'frozen vegetables'): 6.683151774376797e-05,
 ('abrasive cleaner',
  'beef',
  'frozen vegetables',
  'uht-milk'): 6.683151774376797e-05,
 ('abrasive cleaner', 'beef', 'pasta'): 6.683151774376797e-05,
 ('abrasive cleaner', 'beef', 'pasta', 'pork'): 6.683151774376797e-05,
 ('abrasive cleaner',
  'beef',
  'pasta',
  'pork',
  'salty snack'): 6.683151774376797e-05,
 ('abrasive cleaner',
  'beef',
  'pasta',
  'pork',
  'salty snack',
  'shopping bags'): 6.683151774376797e-05,
 ('abrasive cleaner',
  'beef',
  'pasta',
  'pork',
  'shopping bags'): 6.683151774376797e-05,
 ('abrasive cleaner', 'beef', 'pasta', 'salty snack'): 6.683151774376797e-05,
 ('abrasive cleaner',
  'beef',
  'pasta',
  'salty snack',
  'shopping bags'): 6.683151774376797e-05,
 ('abrasive cleaner', 'beef', 'pasta', 'shopping bags'): 6.683151774376797e-05,
 ('abrasive cleaner', 'beef', 'p

In [35]:

len(supports)

63435

In [36]:
# definiujemy funkcje obliczajace support, confidence i lift


def support(supports, products: tuple[str, ...]) -> float:
    key = tuple(sorted(products))
    return supports.get(key, 0)


def confidence(
    supports, prior_products: tuple[str, ...], following_products: tuple[str]
) -> float:
    if support(supports, prior_products) == 0:
        return 0
    return support(
        supports, tuple(set(prior_products).union(set(following_products)))
    ) / support(supports, prior_products)


def lift(supports, prior_products: tuple[str, ...], following_products: tuple[str]) -> float:
    if (support(supports, prior_products) * support(supports, following_products)) == 0:
        return 0
    return support(
        supports, tuple(set(prior_products).union(set(following_products)))
    ) / (support(supports, prior_products) * support(supports, following_products))

In [37]:
print(support(supports, ('whole milk', 'rolls/buns')))
print(confidence(supports, ('whole milk', 'rolls/buns'), ('yogurt', )))
print(lift(supports, ('whole milk', 'rolls/buns'), ('yogurt', )))

0.013967787208447505
0.09569377990430622
1.1142926293448512


## Część 3. - generowanie rekomendacji

In [43]:
from itertools import combinations, chain
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i mozliwie wysokie confidence

def generate_next_product_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, tuple[str], float, float]]:
    candidates = []
    for subbasket in chain.from_iterable(combinations(basket, r) for r in range(len(basket)+1)):
        for product in products:
            if product not in subbasket:
                lift_value = lift(supports, subbasket, (product, ))
                if lift_value > 1:
                    conf = confidence(supports, basket, (product, ))
                    if (conf, product) not in candidates:
                        candidates.append((conf, product))
    candidates = sorted(candidates, key=lambda x: x[0], reverse=True)
    return candidates

In [44]:
print(baskets[1])
generate_next_product_candidates(baskets[1], products, supports)

{'whole milk', 'semi-finished bread', 'yogurt', 'sausage'}


[(1.0, 'semi-finished bread'),
 (1.0, 'sausage'),
 (1.0, 'whole milk'),
 (1.0, 'yogurt'),
 (0.5, 'other vegetables'),
 (0.0, 'artif. sweetener'),
 (0.0, 'bathroom cleaner'),
 (0.0, 'brandy'),
 (0.0, 'candles'),
 (0.0, 'canned fruit'),
 (0.0, 'canned vegetables'),
 (0.0, 'chocolate marshmallow'),
 (0.0, 'cookware'),
 (0.0, 'detergent'),
 (0.0, 'dish cleaner'),
 (0.0, 'finished products'),
 (0.0, 'hair spray'),
 (0.0, 'ham'),
 (0.0, 'honey'),
 (0.0, 'house keeping products'),
 (0.0, 'light bulbs'),
 (0.0, 'liqueur'),
 (0.0, 'nut snack'),
 (0.0, 'nuts/prunes'),
 (0.0, 'organic products'),
 (0.0, 'popcorn'),
 (0.0, 'ready soups'),
 (0.0, 'rubbing alcohol'),
 (0.0, 'salad dressing'),
 (0.0, 'softener'),
 (0.0, 'syrup'),
 (0.0, 'beef'),
 (0.0, 'beverages'),
 (0.0, 'canned beer'),
 (0.0, 'cereals'),
 (0.0, 'curd cheese'),
 (0.0, 'decalcifier'),
 (0.0, 'dishes'),
 (0.0, 'dog food'),
 (0.0, 'domestic eggs'),
 (0.0, 'flour'),
 (0.0, 'frozen dessert'),
 (0.0, 'frozen fish'),
 (0.0, 'frozen potato

In [40]:
print(baskets[33])
generate_next_product_candidates(baskets[33], products, supports)

{'root vegetables', 'domestic eggs', 'yogurt', 'white wine', 'tropical fruit', 'photo/film', 'soda'}


[(1.0, 'photo/film'),
 (1.0, 'domestic eggs'),
 (1.0, 'soda'),
 (1.0, 'white wine'),
 (1.0, 'tropical fruit'),
 (1.0, 'yogurt'),
 (1.0, 'root vegetables'),
 (0.0, 'brandy'),
 (0.0, 'cream'),
 (0.0, 'decalcifier'),
 (0.0, 'dog food'),
 (0.0, 'fish'),
 (0.0, 'flower (seeds)'),
 (0.0, 'frozen dessert'),
 (0.0, 'frozen fruits'),
 (0.0, 'frozen potato products'),
 (0.0, 'frozen vegetables'),
 (0.0, 'grapes'),
 (0.0, 'hygiene articles'),
 (0.0, 'instant food products'),
 (0.0, 'male cosmetics'),
 (0.0, 'mayonnaise'),
 (0.0, 'meat'),
 (0.0, 'nut snack'),
 (0.0, 'nuts/prunes'),
 (0.0, 'organic products'),
 (0.0, 'pet care'),
 (0.0, 'potato products'),
 (0.0, 'processed cheese'),
 (0.0, 'rice'),
 (0.0, 'semi-finished bread'),
 (0.0, 'shopping bags'),
 (0.0, 'skin care'),
 (0.0, 'sliced cheese'),
 (0.0, 'snack products'),
 (0.0, 'softener'),
 (0.0, 'soups'),
 (0.0, 'spread cheese'),
 (0.0, 'sweet spreads'),
 (0.0, 'turkey'),
 (0.0, 'waffles'),
 (0.0, 'whisky'),
 (0.0, 'artif. sweetener'),
 (0.0,