##### Help articles
* https://pbpython.com/market-basket-analysis.html

In [76]:
! tree

.
├── Data
│   └── Book1.xlsx
├── Market Basket.ipynb
├── market_basket.py
├── out.csv
├── __pycache__
│   └── market_basket.cpython-36.pyc
└── Question1.ipynb

2 directories, 6 files


In [77]:
import pandas as pd
# from pandas import DataFrame


In [78]:
# Load the transactions workbook; the path is relative to the notebook's folder.
df=pd.read_excel("./Data/Book1.xlsx")
# Report the number of rows read, then preview the first few of them.
print("Transcations:", len(df))
df.head()

Transcations: 2928


Unnamed: 0,transaction #,day week,vegetables,baby,fruit,milk,dvds,meat
0,1,5,1,0,0,1,0,1
1,2,4,1,1,1,1,0,0
2,3,5,1,0,0,0,0,0
3,4,5,1,0,1,0,0,0
4,5,7,1,1,0,1,0,1


In [79]:

def set_internal_multiplier(df, load_set):
    """Multiply the columns named in ``load_set`` together, row by row.

    For 0/1 indicator columns the result is 1 exactly on the rows where
    every listed column is 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding one column per item.
    load_set : str or iterable of str
        Column name(s) to multiply. A bare string is treated as a single
        column name (callers in this notebook pass ``"I"`` that way).

    Returns
    -------
    pandas.Series
        Row-wise product of the selected columns, indexed like ``df``.
        An empty ``load_set`` yields a Series of ones (the empty product).
    """
    ## A bare string is ONE column name, not a sequence of one-letter names
    if isinstance(load_set, str):
        load_set = [load_set]
    columns = list(load_set)

    ## Empty product: every row counts as 1
    if not columns:
        return pd.Series(1, index=df.index)

    ## Initialize series (copy so the caller never receives the frame's own column)
    product = df[columns[0]].copy()

    ## Multiply in the remaining columns; the first one is already included,
    ## so it must not be multiplied in a second time
    for col in columns[1:]:
        product = product * df[col]

    ## Return Series
    return product

def support_calculator(df, setA,setB):
    """Return the support of ``setA`` and ``setB`` occurring together.

    Each set is collapsed to one Series via ``set_internal_multiplier``;
    the two Series are multiplied, summed, and divided by the number of
    rows. With 0/1 item columns this is the fraction of rows in which
    every item of both sets is present.
    """
    indicator_a = set_internal_multiplier(df, setA)
    indicator_b = set_internal_multiplier(df, setB)

    ## Rows where both sets are present contribute to the total
    joint_total = sum(indicator_a * indicator_b)

    return joint_total / len(indicator_a)

def identity_creater(df):
    """Set column ``"I"`` of ``df`` to the constant 1 and return ``df``.

    The column is assigned on the DataFrame that was passed in (no copy is
    made), so the caller's object gains the column too, and an existing
    ``"I"`` column is overwritten. The same object is returned.
    """
    # Constant-1 column: multiplying by it leaves any other column unchanged.
    df["I"]=1
    return df

def confidence_calculator(df, setA, setB):
    """Return support(setA and setB) divided by support(setB).

    Because the denominator is the support of ``setB``, the value is the
    confidence of the rule ``setB -> setA`` (for 0/1 item columns: the
    share of rows containing ``setB`` that also contain ``setA``).

    Side effect: ``identity_creater`` is applied to ``df``, so the passed-in
    DataFrame ends up with a constant ``"I"`` column.
    """
    #### Numerator: joint support of both sets
    joint_support = support_calculator(df, setA, setB)

    #### Denominator: support of setB alone, paired with the all-ones column
    with_identity = identity_creater(df)
    support_of_b = support_calculator(with_identity, setB, "I")

    #### Ratio
    return joint_support / support_of_b

def lift_calculator(df, setA, setB):
    """Return the lift between ``setA`` and ``setB``.

    Computed as ``confidence_calculator(df, setA, setB)`` divided by the
    support of ``setA``, which equals
    support(A and B) / (support(A) * support(B)).

    Side effect: ``identity_creater`` is applied to ``df``, so the passed-in
    DataFrame ends up with a constant ``"I"`` column.
    """
    #### Numerator: confidence of the pair
    pair_confidence = confidence_calculator(df, setA, setB)

    #### Denominator: support of setA alone, paired with the all-ones column
    with_identity = identity_creater(df)
    support_of_a = support_calculator(with_identity, setA, "I")

    #### Ratio
    return pair_confidence / support_of_a
    
    

In [80]:
df.columns

Index(['transaction #', 'day week', 'vegetables', 'baby', 'fruit', 'milk',
       'dvds', 'meat'],
      dtype='object')

In [81]:
# Single-item sets used to spot-check the support / confidence / lift helpers.
set_a=['meat']
set_b=['vegetables']

In [82]:
support_calculator(df,set_a, set_b)

0.15300546448087432

In [83]:
confidence_calculator(df, set_a, set_b)

0.25225225225225223

In [84]:
lift_calculator(df, set_a, set_b)

1.0117734172528692

In [85]:
# Item columns from which candidate item sets are built
# (the 'transaction #' and 'day week' columns are left out).
X=['vegetables', 'baby', 'fruit', 'milk',
       'dvds', 'meat']

In [86]:
confidence_calculator(df, set_a,set_b)

0.25225225225225223

In [87]:
def set_acceptor(A, B, min_diff, unique):
    """Decide whether the pair of item sets ``(A, B)`` should be kept.

    * ``unique`` truthy: keep the pair only when A and B share no item.
    * otherwise, if either side has more than one item: keep the pair only
      when the longer side (A on ties) has strictly more than ``min_diff``
      items that are missing from the other side.
    * otherwise (both sides have at most one item): at least two such items
      would be required, which sets this small cannot provide, so the pair
      is always rejected.

    Returns ``True`` to keep the pair, ``False`` to drop it.
    """
    ## The longer sequence is the reference side; A wins ties
    a_is_reference = len(A) >= len(B)
    if a_is_reference:
        reference, other = set(A), set(B)
    else:
        reference, other = set(B), set(A)

    ## Disjointness is all that matters in unique mode
    if unique:
        return len(reference & other) == 0

    ## Number of reference items absent from the other side
    surplus = len(reference - other)

    if len(A) > 1 or len(B) > 1:
        return bool(surplus > min_diff)

    return surplus >= 2



def basket_combination_finder(X, setA_N, setB_N, min_diff, unique):
    """Pair every ``setA_N``-item combination of ``X`` with every
    ``setB_N``-item combination and keep the pairs ``set_acceptor`` approves.

    Returns a DataFrame with columns ``"A"`` and ``"B"``, one accepted pair
    per row, each cell holding a tuple of items.
    """
    from itertools import combinations

    ## Materialize both candidate lists before pairing them up
    candidates_a = list(combinations(X, setA_N))
    candidates_b = list(combinations(X, setB_N))

    accepted_pairs = [
        [left, right]
        for left in candidates_a
        for right in candidates_b
        if set_acceptor(left, right, min_diff, unique)
    ]

    return pd.DataFrame(accepted_pairs, columns=["A","B"])




def master_basket_combination_finder(X, min_diff, unique):
    """Collect accepted (A, B) item-set pairs over all pairs of set sizes.

    Sizes run from 1 to ``len(X)``. Every ordered pair of two *different*
    sizes is passed to ``basket_combination_finder`` and the resulting
    frames are stacked.
    NOTE(review): ``permutations(sizes, 2)`` never yields equal sizes, so
    pairs such as one item vs. one item are not generated — confirm this is
    intended.

    Parameters
    ----------
    X : sequence
        Items to build the sets from.
    min_diff, unique
        Forwarded unchanged to ``basket_combination_finder``.

    Returns
    -------
    pandas.DataFrame
        Columns ``"index"``, ``"A"``, ``"B"``. ``"index"`` is each row's
        position inside the per-size frame it came from (kept by the final
        ``reset_index()`` so the output layout stays as before).
    """
    from itertools import permutations

    set_sizes = range(1, len(X) + 1)

    ## Gather the per-size frames and concatenate once: DataFrame.append was
    ## removed in pandas 2.0, and appending inside a loop re-copies the
    ## accumulated frame on every pass
    frames = []
    for size_a, size_b in permutations(set_sizes, 2):
        pairs = basket_combination_finder(X, size_a, size_b, min_diff, unique)
        ## Skip empty frames; they add no rows
        if not pairs.empty:
            frames.append(pairs)

    ## pd.concat rejects an empty list, so fall back to an empty A/B frame
    if frames:
        df_comb = pd.concat(frames)
    else:
        df_comb = pd.DataFrame(columns=["A","B"])

    return df_comb.reset_index()

In [94]:
def basket_calculator(df,X,confidence_level=.6,  min_diff=1, unique=True):
    """Score every candidate (A, B) pair built from ``X`` and keep the
    confident ones.

    Candidate pairs come from ``master_basket_combination_finder``. Each
    pair gets a support, confidence and lift value computed on ``df``.
    Rows whose confidence is below ``confidence_level`` are dropped and the
    remainder is returned sorted by lift, highest first.
    """
    rules = master_basket_combination_finder(X, min_diff, unique)

    ## Float placeholders so the metric columns exist before being filled
    for metric_name in ("support", "confidence", "lift"):
        rules[metric_name] = 0.0

    ## The candidate frame has a fresh 0..n-1 index, so position == label
    for position in range(len(rules)):
        record = rules.iloc[position]
        set_a = record["A"]
        set_b = record["B"]

        metrics = {
            "support": support_calculator(df, set_a, set_b),
            "confidence": confidence_calculator(df, set_a, set_b),
            "lift": lift_calculator(df, set_a, set_b),
        }

        for metric_name, value in metrics.items():
            rules.at[position, metric_name] = float(value)

    confident_rules = rules[rules["confidence"] >= confidence_level]
    return confident_rules.sort_values(by="lift", ascending=False)

# Score all candidate pairs built from X with the default settings and show the result.
dfc=basket_calculator(df, X)
dfc

Unnamed: 0,index,A,B,support,confidence,lift
68,8,"(vegetables,)","(fruit, dvds, meat)",0.007172,0.677419,1.116827
150,0,"(vegetables,)","(baby, fruit, milk, dvds, meat)",0.001366,0.666667,1.099099
122,2,"(vegetables,)","(baby, fruit, dvds, meat)",0.003074,0.642857,1.059846
4,4,"(vegetables,)","(fruit, milk)",0.056694,0.626415,1.032738
121,1,"(vegetables,)","(baby, fruit, milk, meat)",0.00444,0.619048,1.020592
124,4,"(vegetables,)","(fruit, milk, dvds, meat)",0.002732,0.615385,1.014553
65,5,"(vegetables,)","(baby, dvds, meat)",0.010246,0.612245,1.009377
64,4,"(vegetables,)","(baby, milk, meat)",0.014003,0.602941,0.994038


In [None]:
dfc.to_csv("out.csv")

In [None]:
A=["vegetables"]

In [None]:
B=("baby", "dvds", "meat")

In [None]:
set(A)-set(B)