In [1]:
import numpy as np
import pandas as pd

from collections import defaultdict
from itertools import chain, combinations
from optparse import OptionParser

In [3]:
# Load the training table from CSV and preview its first rows.
# NOTE(review): the path is an absolute, machine-specific Windows location, so this
# cell only runs on the original author's machine; a path relative to a configurable
# data directory would make the notebook reproducible elsewhere.
df = pd.read_csv('C:/Prerna Tulsiani/COEP/Sem6/Data Science Project/Final Project/train.csv')
df.head()

Unnamed: 0,label,achiote paste,achiote powder,acini di pepe,acorn squash,active dry yeast,adobo sauce,adobo seasoning,adzuki beans,agave nectar,...,yellow rock sugar,yellow squash,yellow summer squash,yellow tomato,yellowfin tuna,yogurt cheese,yucca root,yukon gold potatoes,yuzu juice,zucchini blossoms
0,Infineon Raceway Baked Beans,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Sour Cream Noodle Bake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Middle-Eastern Eggplant Rounds,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Saffron Jewel Rice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Thai Sweet and Sour Wings,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Split the frame positionally: the first column becomes the labels, every
# remaining column becomes the features.
# NOTE(review): presumably the first column holds recipe names and the rest are
# per-ingredient indicator values — confirm against train.csv.
features = df.iloc[:, 1:]
labels = df.iloc[:, 0]

In [5]:
def powerset(s):
    """Iterate over the non-empty proper subsets of ``s`` as tuples.

    Subsets are produced in order of increasing size, from 1 up to
    ``len(s) - 1``; the empty subset and ``s`` itself are never produced.
    """
    sizes = range(1, len(s))
    return chain.from_iterable(map(lambda size: combinations(s, size), sizes))


def getAboveMinSup(itemSet, itemSetList, minSup, globalItemSetWithSup):
    """Return the candidates whose relative support reaches ``minSup``.

    Args:
        itemSet: iterable of candidate itemsets (objects with ``issubset``).
        itemSetList: list of transactions; each transaction is an iterable of items.
        minSup: minimum fraction of transactions that must contain a candidate.
        globalItemSetWithSup: mapping of itemset -> support count; it is updated
            in place for every candidate that occurs in at least one transaction.

    Returns:
        A set with the candidates whose support fraction is >= ``minSup``.
    """
    occurrences = defaultdict(int)

    for candidate in itemSet:
        for transaction in itemSetList:
            if candidate.issubset(transaction):
                # Keep the shared table and the per-call table in step.
                globalItemSetWithSup[candidate] += 1
                occurrences[candidate] += 1

    return {
        candidate
        for candidate, count in occurrences.items()
        if float(count / len(itemSetList)) >= minSup
    }


def getUnion(itemSet, length):
    """Self-join ``itemSet``: union every ordered pair of members and keep
    only the unions that contain exactly ``length`` elements."""
    joined = set()
    for left in itemSet:
        for right in itemSet:
            merged = left.union(right)
            if len(merged) == length:
                joined.add(merged)
    return joined


def pruning(candidateSet, prevFreqSet, length):
    """Drop every candidate that has a ``length``-sized subset missing from
    ``prevFreqSet``.

    ``candidateSet`` itself is left untouched; a filtered copy is returned.
    """
    def hasInfrequentSubset(candidate):
        # One missing subset is enough to reject the candidate.
        return any(
            frozenset(part) not in prevFreqSet
            for part in combinations(candidate, length)
        )

    survivors = candidateSet.copy()
    for candidate in candidateSet:
        if hasInfrequentSubset(candidate):
            survivors.remove(candidate)
    return survivors


def associationRule(freqItemSet, recipes, labels, itemSetWithSup, minConf):
    """Derive association rules from the frequent itemsets and label each rule.

    For every frequent itemset and every non-empty proper subset ``s`` of it
    (as produced by ``powerset``), the rule ``s -> itemset - s`` is kept when
    its confidence ``support(itemset) / support(s)`` is strictly greater than
    ``minConf``.

    Args:
        freqItemSet: mapping whose values are collections of frequent itemsets
            (frozensets).
        recipes: list of recipes, each an iterable of ingredients.
        labels: sequence of recipe labels, positionally aligned with ``recipes``.
        itemSetWithSup: mapping of itemset -> support count.
        minConf: confidence threshold (exclusive).

    Returns:
        ``(rules, lbls)`` where ``rules[i]`` is ``[antecedent, consequent,
        confidence]`` and ``lbls[i]`` describes the recipes that share at least
        one ingredient with the antecedent of ``rules[i]``: the single label
        when exactly one recipe matches, otherwise a list of the matching
        labels. Both lists always have the same length.
    """
    rules = []
    lbls = []

    # Pair each recipe's ingredient set with its label once, so any number of
    # recipes is supported rather than exactly two.
    labelled_recipes = [(set(recipe), label) for recipe, label in zip(recipes, labels)]

    for itemSet in freqItemSet.values():
        for item in itemSet:
            for s in powerset(item):
                confidence = float(
                    itemSetWithSup[item] / itemSetWithSup[frozenset(s)])
                if confidence <= minConf:
                    continue

                antecedent = set(s)
                rules.append([antecedent, set(item.difference(s)), confidence])

                # Exactly one label entry is recorded per emitted rule so that
                # lbls[i] always belongs to rules[i].
                matches = [label for ingredients, label in labelled_recipes
                           if antecedent.intersection(ingredients)]
                lbls.append(matches[0] if len(matches) == 1 else matches)
    return rules, lbls


def getItemSetFromList(itemSetList):
    """Collect every distinct item across all transactions as a set of
    single-element frozensets (the size-1 candidate itemsets)."""
    return {
        frozenset([element])
        for transaction in itemSetList
        for element in transaction
    }

In [6]:
def apriori(itemSetList, labels, minSup, minConf):
    """Run level-wise frequent-itemset mining and derive association rules.

    Args:
        itemSetList: list of transactions (each an iterable of items).
        labels: labels passed through to ``associationRule``.
        minSup: minimum support fraction for an itemset to count as frequent.
        minConf: confidence threshold handed to ``associationRule``.

    Returns:
        ``(frequentByLength, rules, lbls)`` where ``frequentByLength`` maps an
        itemset size to the set of frequent itemsets of that size, and
        ``rules`` / ``lbls`` are whatever ``associationRule`` returns.
    """
    # Support counts accumulated over every level; shared with the helpers.
    supportCounts = defaultdict(int)
    # Frequent itemsets found so far, keyed by itemset size.
    frequentByLength = dict()

    # Level 1: every distinct item is a candidate.
    frequent = getAboveMinSup(
        getItemSetFromList(itemSetList), itemSetList, minSup, supportCounts)
    size = 1

    while frequent:
        frequentByLength[size] = frequent
        size += 1
        # Join the previous level with itself, then discard candidates that
        # contain a (size - 1)-subset which was not frequent.
        candidates = pruning(getUnion(frequent, size), frequent, size - 1)
        # Count support of the surviving candidates against the transactions.
        frequent = getAboveMinSup(candidates, itemSetList, minSup, supportCounts)

    rules, lbls = associationRule(
        frequentByLength, itemSetList, labels, supportCounts, minConf)

    return frequentByLength, rules, lbls

In [7]:
# Two-way lookup between a feature column's position and its name.
idx_to_column = dict(enumerate(features.columns.values))
column_to_idx = {name: position for position, name in idx_to_column.items()}

In [8]:
def get_random_recipes(features, labels, nr):
    """Sample ``nr`` random rows and return their ingredient lists and labels.

    Rows are drawn with ``np.random.randint``, i.e. with replacement, so the
    same row can be returned more than once.

    Args:
        features: DataFrame with one column per ingredient; after truncation
            to int32, a non-zero cell means the ingredient is present.
        labels: pandas Series positionally aligned with the rows of ``features``.
        nr: number of rows to draw.

    Returns:
        ``(recipes, sampled_labels)`` where ``recipes`` is a list of ``nr``
        lists of column names and ``sampled_labels`` is an array with the
        label of each sampled row.
    """
    ids = np.random.randint(0, len(features), size=nr).tolist()
    # Names are taken from the frame that was passed in, not from a notebook
    # global, so the function works for any features table.
    column_names = features.columns.values
    presence = features.iloc[ids].values.astype(np.int32)
    # flatnonzero inspects the values themselves; the earlier
    # "value * column index" trick always zeroed out column 0, so an
    # ingredient stored in the first column could never be reported.
    recipes = [
        [column_names[idx] for idx in np.flatnonzero(row)]
        for row in presence
    ]
    # Positional lookup, matching the positional row selection above.
    return recipes, labels.iloc[ids].values

In [9]:
def print_recipe(recipe, label):
    """Print a recipe: a header framed by two dashed lines, then one
    ingredient per line."""
    divider = "-" * 50
    print(divider)
    print("Recipe: ", label)
    print(divider)
    for ingredient in recipe:
        print(ingredient)

In [10]:
# Draw 2 random recipes together with their labels.
recipes, lbls = get_random_recipes(features, labels, nr=2)

# Print each recipe. A plain loop is used because print_recipe is called only
# for its output; collecting its None results in a list served no purpose.
for rec, lbl in zip(recipes, lbls):
    print_recipe(rec, lbl)

--------------------------------------------------
Recipe:  Tuscan Pasta Salad With Grilled Vegetables
--------------------------------------------------
cannellini beans
extra-virgin olive oil
fennel fronds
fresh parsley
freshly ground pepper
parmesan cheese
--------------------------------------------------
Recipe:  Champari Gold
--------------------------------------------------
navel oranges


In [11]:
# Mine frequent itemsets and association rules from the sampled recipes
# (minimum support 0.5, minimum confidence 0.5).
# NOTE(review): `lbls` is rebound here from the sampled recipe labels to the
# label list returned by apriori, so re-running this cell without re-running
# the sampling cell feeds the previous result back in as `labels`.
freqItemSet, rules, lbls = apriori(recipes, lbls, minSup=0.5, minConf=0.5)

In [12]:
def find_recipe(ingredients, rules, lbls):
    """Look up the label recorded for the rule matching ``ingredients``.

    Args:
        ingredients: a pair ``[antecedent_items, consequent_items]``, each an
            iterable of ingredient names.
        rules: list of ``[antecedent, consequent, confidence]`` entries.
        lbls: labels positionally aligned with ``rules``.

    Returns:
        The label stored for the matching rule, or the string
        ``"Recipe not found"`` when no rule matches (including when ``rules``
        is empty or ``ingredients`` is not a well-formed pair).
    """
    not_found = "Recipe not found"

    # Key each rule by its two item sets. Frozensets make the lookup
    # independent of the order in which ingredients are listed, unlike a
    # string rendering of lists built from sets.
    rules_dic = {}
    for rule, label in zip(rules, lbls):
        antecedent, consequent = rule[0], rule[1]
        rules_dic[(frozenset(antecedent), frozenset(consequent))] = label

    try:
        wanted_antecedent, wanted_consequent = ingredients
        query = (frozenset(wanted_antecedent), frozenset(wanted_consequent))
    except (TypeError, ValueError):
        # Malformed query: not a pair of iterables of hashable items.
        return not_found

    return rules_dic.get(query, not_found)

In [13]:
# Look up the rule "first ingredient -> second ingredient" of the first sampled recipe.
# NOTE(review): recipes[0][1] requires the first sampled recipe to list at least
# two ingredients; the sample is random, so this can raise IndexError on a re-run.
print('Try to find recipe using ingredients: {0} and {1}'.format(recipes[0][0], recipes[0][1]))
found_recipe = find_recipe([[recipes[0][0]], [recipes[0][1]]], rules, lbls)
print('Found recipe: ', found_recipe)

Try to find recipe using ingredients: cannellini beans and extra-virgin olive oil
Found recipe:  Tuscan Pasta Salad With Grilled Vegetables


In [14]:
# Negative check: query a fixed ingredient pair and print whatever find_recipe returns.
# NOTE(review): presumably these ingredients are absent from the mined rules, giving
# the "Recipe not found" fallback; with random sampling this is likely, not guaranteed.
ingredients_to_try = [['vanilla vodka'], ['coconut extract']]
print('Testing not finding ingredients')
print('Using the following ingredients: {0} and {1}'.format(ingredients_to_try[0][0], ingredients_to_try[1][0]))
print(find_recipe(ingredients_to_try, rules, lbls))

Testing not finding ingredients
Using the following ingredients: vanilla vodka and coconut extract
Recipe not found
