# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety

from more_itertools import powerset

In [2]:
# definiujemy stale

# Path of the CSV file with shopping baskets; passed to read_baskets further down.
PATH = 'basket.csv'
# Minimum support threshold; passed as `epsilon` to get_supports further down.
EPSILON = 0.001
# NOTE(review): K is not referenced anywhere in the visible notebook - confirm its intended use.
K = 4

In [3]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[set[str]]:
    """Load shopping baskets from a comma-separated file.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line becomes one basket: a set of lower-cased
    product names with surrounding whitespace removed. Empty cells (for
    example the trailing commas used to pad short rows) are ignored.

    Args:
        path: Location of the CSV file.

    Returns:
        One set of product names per data row, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Explicit encoding so the result does not depend on the machine's locale.
    with open(path, encoding='utf-8') as f:
        next(f, None)  # drop the header row; tolerate a completely empty file
        return [
            {
                name
                for name in (cell.strip().lower() for cell in line.split(','))
                if name
            }
            for line in f
            if line.strip()
        ]

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    """Return every distinct product found in ``baskets``, sorted ascending."""
    # A single union over all baskets collapses duplicates; with no baskets
    # the union of nothing is simply the empty set.
    distinct = set().union(*baskets)
    ordered = list(distinct)
    ordered.sort()
    return ordered

# Load all baskets once; `baskets` is also read as a module-level name by support().
baskets = read_baskets(PATH)
# Sorted list of every distinct product that occurs in the loaded baskets.
products = unique_products(baskets)

## Część 2. - obliczanie wskaźników

In [39]:
# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`

def stringify_tuple(tpl: tuple[str]):
    """Build an order-independent string key for a collection of products.

    The names are sorted and joined with commas, so any two collections
    holding the same products map to the same key.
    """
    ordered_names = sorted(tpl)
    separator = ','
    return separator.join(ordered_names)

def get_supports(baskets: list[tuple[str]], all_products: list[str], epsilon: float) -> dict[str, float]:
    """Compute the support of every itemset that occurs inside some basket.

    Candidate itemsets are the non-empty subsets of each basket. The support
    of an itemset is the fraction of baskets that contain all of its
    products. Only itemsets whose support is at least ``epsilon`` are kept.

    Args:
        baskets: Collections of product names, one per transaction.
        all_products: Unused; kept so existing callers keep working.
        epsilon: Minimum support an itemset needs to be included.

    Returns:
        Mapping from the ``stringify_tuple`` key of an itemset (sorted,
        comma-joined product names) to its support. The empty itemset is
        never included.
    """
    total_baskets = len(baskets)

    # Inverted index: product -> baskets that contain it. set() guards
    # against a basket listing the same product twice.
    product_baskets_map = {}
    for basket in baskets:
        for item in set(basket):
            product_baskets_map.setdefault(item, []).append(basket)

    result = {}
    seen_keys = set()  # itemsets already evaluated, whether kept or not
    for cur_basket in baskets:
        for possible_basket in powerset(cur_basket):
            if not possible_basket:
                continue
            possible_basket_key = stringify_tuple(possible_basket)
            if possible_basket_key in seen_keys:
                continue
            seen_keys.add(possible_basket_key)

            # Every basket containing the whole itemset also contains its
            # rarest product, so scanning that single list counts each
            # matching basket exactly once.
            rarest = min(possible_basket, key=lambda p: len(product_baskets_map[p]))
            itemset = set(possible_basket)
            occurrences = sum(
                1 for basket in product_baskets_map[rarest] if itemset.issubset(basket)
            )

            calculated_support = occurrences / total_baskets
            if calculated_support >= epsilon:
                result[possible_basket_key] = calculated_support

    return result

supports = get_supports(baskets, products, EPSILON)

# Histogram: itemset size -> number of itemsets that reached EPSILON.
# Sizes 1-5 are pre-seeded so they always appear in the printout.
count = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
for support_key in supports:
    # Keys are comma-joined product names, so size = number of commas + 1.
    size = support_key.count(',') + 1
    # .get() keeps itemsets larger than the pre-seeded sizes from raising KeyError.
    count[size] = count.get(size, 0) + 1

print(count)

{1: 149, 2: 592, 3: 9, 4: 0, 5: 0}


In [25]:
# definiujemy funkcje obliczajace support, confidence i lift

def support(supports, products: tuple[str]) -> float:
    """Return the support of ``products``, using ``supports`` as a cache.

    If the itemset is already present in ``supports`` the stored value is
    returned. Otherwise the support is computed by scanning the module-level
    ``baskets`` list, stored in ``supports`` under the itemset's key, and
    returned.

    Args:
        supports: Mapping from ``stringify_tuple`` keys to support values;
            updated in place on a cache miss.
        products: Collection of product names forming the itemset.

    Returns:
        Fraction of baskets that contain every product of the itemset, or
        0.0 when there are no baskets at all.
    """
    basket_key = stringify_tuple(products)
    if basket_key in supports:
        return supports[basket_key]

    if not baskets:
        return 0.0  # nothing to count against; also avoids dividing by zero

    products_set = set(products)
    occurrences = sum(1 for basket in baskets if products_set.issubset(basket))
    value = occurrences / len(baskets)
    # Cache under the itemset's string key (a list is unhashable and cannot be a key).
    supports[basket_key] = value
    return value

def confidence(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    """Return the confidence of the rule ``prior_products -> following_products``.

    Confidence is support(prior and following together) / support(prior).

    Args:
        supports: Support cache handed through to ``support``.
        prior_products: Antecedent of the rule; any iterable of product names.
        following_products: Consequent of the rule; any iterable of product names.

    Returns:
        The confidence, or 0.0 when the antecedent has zero support (the
        ratio is undefined in that case).
    """
    # Normalise to sets so tuples and lists work as well as sets.
    prior = set(prior_products)
    prior_support = support(supports, prior)
    if prior_support == 0:
        return 0.0
    return support(supports, prior | set(following_products)) / prior_support
    
def lift(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    """Return the lift of the rule ``prior_products -> following_products``.

    Lift is support(prior and following together) divided by
    support(prior) * support(following).

    Args:
        supports: Support cache handed through to ``support``.
        prior_products: Antecedent of the rule; any iterable of product names.
        following_products: Consequent of the rule; any iterable of product names.

    Returns:
        The lift, or 0.0 when either side has zero support (the ratio is
        undefined in that case).
    """
    # Normalise to sets so tuples and lists work as well as sets.
    prior = set(prior_products)
    following = set(following_products)
    expected = support(supports, prior) * support(supports, following)
    if expected == 0:
        return 0.0
    return support(supports, prior | following) / expected

In [26]:
print(support(supports, {'whole milk', 'rolls/buns'}))  # expected: around one or two hundredths
print(confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))   # expected: less than one
print(lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))

0.013967787208447505
0.09569377990430622
1.1142926293448512


## Część 3. - generowanie rekomendacji

In [None]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i jak najwyzszy confidence

def generate_basic_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, tuple[str], float, float]]:
    """Placeholder for the basic recommendation generator (not implemented).

    Intended result, per the notes in this notebook: a list of
    ``(item, subbasket, confidence, lift)`` tuples, where recommended items
    should have lift > 1 and the highest possible confidence.

    Raises:
        NotImplementedError: Always, until the exercise is completed.
    """
    # return [(item, subbasket, confidence, lift)]
    raise NotImplementedError()

In [None]:
# zaproponuj drugi, bardziej zaawansowany algorytm, np.:
# - jesli produkt X wystepuje w liscie kandydatow kilkukrotnie, oblicz srednia lub iloczyn confidence
# - posortuj kandydatow po iloczynie confidence i lift

def generate_advanced_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, tuple[str], float, float]]:
    """Placeholder for a more advanced recommendation generator (not implemented).

    Intended result, per the notes in this notebook: a list of
    ``(item, subbasket, confidence, lift)`` tuples. Suggested refinements
    are combining the confidence (mean or product) of an item that appears
    several times among the candidates, and sorting candidates by the
    product of confidence and lift.

    Raises:
        NotImplementedError: Always, until the exercise is completed.
    """
    # return [(item, subbasket, confidence, lift)]
    raise NotImplementedError()

In [None]:
# Show the basket at index 1, then run both recommendation variants on it.
# NOTE(review): a notebook cell presumably echoes only its last expression, so the
# basic result would not be displayed here - confirm whether it should be printed.
print(baskets[1])
generate_basic_candidates(baskets[1], products, supports)
generate_advanced_candidates(baskets[1], products, supports)

In [None]:
# Show the basket at index 33, then run both recommendation variants on it.
# NOTE(review): a notebook cell presumably echoes only its last expression, so the
# basic result would not be displayed here - confirm whether it should be printed.
print(baskets[33])
generate_basic_candidates(baskets[33], products, supports)
generate_advanced_candidates(baskets[33], products, supports)