# Market Basket analysis using Python generators

## Import Libraries

In [28]:
import os
import pandas as pd
import numpy as np
import sys
from itertools import combinations, groupby
from collections import Counter
from IPython.display import display

#Function to return size of object
def size(obj):
    """Return the size of ``obj`` as a human-readable megabyte string.

    The byte count is whatever ``sys.getsizeof`` reports for ``obj``; it is
    divided by 1000*1000 (decimal megabytes) and rendered with two decimal
    places, e.g. ``"193.68 MB"``.
    """
    # ".2f" means two decimal places. The previous "2f" only set a minimum
    # field width of 2 and therefore printed the default six decimals.
    return "{0:.2f} MB".format(sys.getsizeof(obj) / (1000 * 1000))

## Read data

In [29]:
# Load 'Merged_clean.csv' into the `orders` DataFrame. The path is relative,
# so it is resolved against the notebook's current working directory.
orders=pd.read_csv('Merged_clean.csv')

## Read contents

In [30]:
# Report the dimensions of the full data set before filtering.
print('Shape of file containing all records: {0}'.format(orders.shape))

# Keep only the rows whose 'Freq' value is greater than 1.
orders = orders[orders['Freq'] > 1]

print(
    'orders with clients buying more than one policy --> dimensions: {0}; size:{1}'.format(
        orders.shape, size(orders)
    )
)
display(orders.head())

Shape of file containing all records: (812914, 36)
orders with clients buying more than one policy --> dimensions: (109476, 36); size:193.680366 MB


Unnamed: 0,policy_owner_number,policy_number,RCD,premium,afyp,sum_assured,Owner_salary,Marital_status,Own_Education,Own_Edu,...,Policy_term,LA_gender,LA_DOB,channel_flag,Occ_Profile,Par_NonPar,DSTNAME,STATNAME,City_classification,Freq
2,50010334,3187975,2015-09-10,6000,72000,720000,600000.0,M,GR,GRADUATE,...,20,M,1974-02-12,TIED,BUSINESS MAN,ULIP,NORTH WEST DELHI,DELHI,METRO,2
3,50010334,3089455,2015-03-27,4946,59350,700000,600000.0,M,GR,GRADUATE,...,45,M,1974-02-12,TIED,BUSINESS MAN,PAR,NORTH WEST DELHI,DELHI,METRO,2
4,50010396,3142060,2015-07-22,6135,12270,122403,500000.0,W,PG,POST GRADUATE,...,30,F,1990-08-20,TIED,BUSINESS MAN,NON-PAR,GURGAON,HARYANA,CLASS A,2
5,50010396,2799813,2013-10-28,2083,24994,161000,500000.0,W,PG,POST GRADUATE,...,20,F,1964-12-04,TIED,BUSINESS MAN,NON-PAR,GURGAON,HARYANA,CLASS A,2
9,50010838,3204128,2015-09-30,10000,120000,0,1300000.0,M,GR,GRADUATE,...,10,F,1954-11-05,TIED,OTHERS,PENSION,BENGALURU,KARNATAKA,METRO,2


## Run test with 5 columns related to policies

In [31]:
# For each of the five product-related columns, build a Series that is indexed
# by policy_owner_number, holds that column's values, and is named 'item_id'.
(
    orders_prod_desc,
    orders_prod_club_manual,
    orders_prod_brief_cat,
    orders_cust_prod_cat,
    orders_par_nonpar,
) = (
    orders.set_index('policy_owner_number')[column].rename('item_id')
    for column in (
        'Product_Description',
        'Product_Club_Manual',
        'Product_brief_category',
        'CUST_prod_cat',
        'Par_NonPar',
    )
)

In [32]:
# Preview the first rows of every Series under a bold heading, each followed
# by a separator line.
for heading, series in [
    ('\033[1m Product_Description \033[0m : \n', orders_prod_desc),
    ('\033[1m Product_Club_Manual \033[0m : \n', orders_prod_club_manual),
    ('\033[1m Product_brief_category \033[0m : \n', orders_prod_brief_cat),
    ('\033[1m CUST_prod_cat \033[0m : \n', orders_cust_prod_cat),
    ('\033[1m Par_NonPar \033[0m : \n', orders_par_nonpar),
]:
    print(heading, series.head())
    print('#########################')

[1m Product_Description [0m : 
 policy_owner_number
50010334                    WEALTH MAXIMA
50010334    NEW FULFILLING LIFE ANTI. W/L
50010396                 GURANTEED INCOME
50010396                 GURANTEED INCOME
50010838                     GOLDEN YEARS
Name: item_id, dtype: object
#########################
[1m Product_Club_Manual [0m : 
 policy_owner_number
50010334        WEALTH MAXIMA
50010334      FULFILLING LIFE
50010396    GUARANTEED INCOME
50010396    GUARANTEED INCOME
50010838         GOLDEN YEARS
Name: item_id, dtype: object
#########################
[1m Product_brief_category [0m : 
 policy_owner_number
50010334           ULIP
50010334    TRADITIONAL
50010396    TRADITIONAL
50010396    TRADITIONAL
50010838        PENSION
Name: item_id, dtype: object
#########################
[1m CUST_prod_cat [0m : 
 policy_owner_number
50010334    TRADITIONALULIPTERM
50010334    TRADITIONALULIPTERM
50010396        TRADITIONALTERM
50010396        TRADITIONALTERM
50010838     T

## Function definitions used in the algorithm

In [33]:
# returns frequency count for item and item pairs
def freq(iterable):
    """Count how often each distinct value occurs in ``iterable``.

    Parameters
    ----------
    iterable : pandas.Series or any iterable of hashable values
        A Series (or Series subclass) is counted with ``value_counts``; any
        other iterable, including a generator, is consumed once and counted
        with ``collections.Counter``.

    Returns
    -------
    pandas.Series
        Occurrence counts indexed by the distinct values, named ``'freq'``.
    """
    # isinstance (rather than an exact type comparison) lets Series
    # subclasses use the vectorised pandas path as well.
    if isinstance(iterable, pd.Series):
        return iterable.value_counts().rename('freq')
    return pd.Series(Counter(iterable)).rename("freq")


# return number of unique orders
def order_count(order_item):
    """Return the number of distinct index labels of ``order_item``.

    Each distinct label of ``order_item.index`` is counted once, however
    many rows carry it.
    """
    distinct_orders = set(order_item.index)
    return len(distinct_orders)


# generate item pair one at a time
def get_item_pairs(order_item):
    """Lazily yield every 2-item combination found within each order.

    Parameters
    ----------
    order_item : pandas.Series
        Item values indexed by the order they belong to.

    Yields
    ------
    tuple
        ``(item_a, item_b)`` for each pair of rows that share an index label,
        in the row order of ``order_item``.

    Notes
    -----
    ``itertools.groupby`` only groups *consecutive* rows with the same key,
    so rows belonging to one order must be adjacent in ``order_item``.
    """
    # ``DataFrame.as_matrix()`` was deprecated in pandas 0.23 and removed in
    # 1.0; ``.values`` returns the same 2-D array on every pandas version.
    rows = order_item.reset_index().values
    # Column 0 holds the former index (the order key), column 1 the item.
    for _, order_rows in groupby(rows, key=lambda row: row[0]):
        items = [row[1] for row in order_rows]
        yield from combinations(items, 2)

#return frequency and support for each item
def merge_item_stats(item_pairs, item_stats):
    """Attach per-item frequency and support to every item pair.

    ``item_stats`` (indexed by item, with columns ``freq`` and ``support``)
    is joined twice onto ``item_pairs``: once on ``item_A``, giving columns
    ``freqA``/``supportA``, and once on ``item_B``, giving
    ``freqB``/``supportB``. Both joins are pandas' default inner merge.
    """
    stats_for_a = item_stats.rename(columns={'freq': 'freqA', 'support': 'supportA'})
    stats_for_b = item_stats.rename(columns={'freq': 'freqB', 'support': 'supportB'})

    pairs_with_a = item_pairs.merge(stats_for_a, left_on='item_A', right_index=True)
    return pairs_with_a.merge(stats_for_b, left_on='item_B', right_index=True)

#returns name associated with item
def merge_item_name(rules, item_name):
    """Replace item ids in ``rules`` with their names.

    ``item_name`` must provide the columns ``item_id`` and ``item_name``.
    It is merged onto ``rules`` twice, matching ``item_A`` and then
    ``item_B`` against ``item_id``, so the names appear as ``itemA`` and
    ``itemB``. Only the rule columns listed below are returned, in that order.
    """
    names_for_a = item_name.rename(columns={'item_name': 'itemA'})
    names_for_b = item_name.rename(columns={'item_name': 'itemB'})

    named_rules = rules.merge(names_for_a, left_on='item_A', right_on='item_id')
    named_rules = named_rules.merge(names_for_b, left_on='item_B', right_on='item_id')

    output_columns = [
        'itemA', 'itemB',
        'freqAB', 'supportAB',
        'freqA', 'supportA',
        'freqB', 'supportB',
        'confidenceAtoB', 'confidenceBtoA',
        'lift',
    ]
    return named_rules[output_columns]

## Python generators algorithm

In [34]:
#
def association_rules(order_item, min_support):
    """Mine pairwise association rules from an order -> item Series.

    Parameters
    ----------
    order_item : pandas.Series
        Item values indexed by the order they belong to.
    min_support : float
        Minimum support in percent (supports are computed as
        ``freq / number of orders * 100``). It is applied to single items
        first and to item pairs afterwards.

    Returns
    -------
    pandas.DataFrame
        One row per qualifying item pair with the columns ``item_A``,
        ``item_B``, ``freqAB``, ``supportAB``, ``freqA``, ``supportA``,
        ``freqB``, ``supportB``, ``confidenceAtoB``, ``confidenceBtoA`` and
        ``lift``, sorted by ``lift`` in descending order.

    Progress counts are printed after every filtering step.
    """
    print("starting order_item:{:22d}".format(len(order_item)))

    # Item frequency and support (in percent) over all orders.
    item_stats = freq(order_item).to_frame("freq")
    item_stats['support'] = item_stats['freq'] / order_count(order_item) * 100

    # Keep only rows whose item reaches the minimum support.
    qualifying_items = item_stats[item_stats['support'] >= min_support].index
    order_item = order_item[order_item.isin(qualifying_items)]

    print("Items with support >= {}:{:15d}".format(min_support, len(qualifying_items)))
    print("remaining order_item: {:21d}".format(len(order_item)))

    # Keep only orders that still contain at least two items; smaller orders
    # cannot produce a pair.
    order_size = freq(order_item.index)
    qualifying_orders = order_size[order_size >= 2].index
    order_item = order_item[order_item.index.isin(qualifying_orders)]

    print("remaining orders with 2+ items: {:11d}".format(len(qualifying_orders)))
    print("remaining order_item: {:21d}".format(len(order_item)))

    # Recalculate item frequency and support on the filtered data.
    item_stats = freq(order_item).to_frame("freq")
    item_stats['support'] = item_stats['freq'] / order_count(order_item) * 100

    # Generate the item pairs lazily and count them as they are produced.
    item_pair_gen = get_item_pairs(order_item)
    item_pairs = freq(item_pair_gen).to_frame("freqAB")
    item_pairs['supportAB'] = item_pairs['freqAB'] / len(qualifying_orders) * 100

    print("item pairs: {:31d}".format(len(item_pairs)))

    # Drop the pairs that do not reach the minimum support.
    item_pairs = item_pairs[item_pairs['supportAB'] >= min_support]
    print("item pairs with support > = {}: {:10d} \n".format(min_support, len(item_pairs)))

    # Turn the (item_A, item_B) MultiIndex into columns and attach the
    # per-item statistics.
    item_pairs = item_pairs.reset_index().rename(columns={'level_0': 'item_A', 'level_1': 'item_B'})
    item_pairs = merge_item_stats(item_pairs, item_stats)

    # Rule metrics. The percent scaling cancels out in the confidence ratios.
    item_pairs['confidenceAtoB'] = item_pairs['supportAB'] / item_pairs['supportA']
    item_pairs['confidenceBtoA'] = item_pairs['supportAB'] / item_pairs['supportB']
    # Supports are percentages, so the product in the denominator carries an
    # extra factor of 100; multiply it back so that lift is the usual
    # P(A and B) / (P(A) * P(B)) and 1.0 means independence.
    item_pairs['lift'] = (
        item_pairs['supportAB'] * 100 / (item_pairs['supportA'] * item_pairs['supportB'])
    )
    return item_pairs.sort_values('lift', ascending=False)
        

In [35]:
%%time
# Mine association rules for each of the five product-related Series, using
# the same minimum support threshold for every run.
rules_prod_desc = association_rules(order_item=orders_prod_desc, min_support=0.01)
rules_prod_club_manual = association_rules(order_item=orders_prod_club_manual, min_support=0.01)
rules_prod_brief_cat = association_rules(order_item=orders_prod_brief_cat, min_support=0.01)
rules_cust_prod_cat = association_rules(order_item=orders_cust_prod_cat, min_support=0.01)
rules_par_nonpar = association_rules(order_item=orders_par_nonpar, min_support=0.01)

starting order_item:                109476
Items with support >= 0.01:             44
remaining order_item:                109470
remaining orders with 2+ items:       47156
remaining order_item:                109466


  app.launch_new_instance()


item pairs:                            1143
item pairs with support > = 0.01:        748 

starting order_item:                109476
Items with support >= 0.01:             26
remaining order_item:                109476
remaining orders with 2+ items:       47160
remaining order_item:                109476
item pairs:                             467
item pairs with support > = 0.01:        364 

starting order_item:                109476
Items with support >= 0.01:              6
remaining order_item:                109476
remaining orders with 2+ items:       47160
remaining order_item:                109476
item pairs:                              31
item pairs with support > = 0.01:         28 

starting order_item:                109476
Items with support >= 0.01:             30
remaining order_item:                109473
remaining orders with 2+ items:       47159
remaining order_item:                109473
item pairs:                              30
item pairs with support > = 0

## Print the rules

In [27]:
# Print every rules table under the name of the column it was mined from.
# (Previously all five headings read "CUST_prod_cat", which made the outputs
# indistinguishable.)
print('\033[1m Product_Description \033[0m : \n', rules_prod_desc)
print('#########################')

print('\033[1m Product_Club_Manual \033[0m : \n', rules_prod_club_manual)
print('#########################')

print('\033[1m Product_brief_category \033[0m : \n', rules_prod_brief_cat)
print('#########################')

print('\033[1m CUST_prod_cat \033[0m : \n', rules_cust_prod_cat)
print('#########################')

print('\033[1m Par_NonPar \033[0m : \n', rules_par_nonpar)
print('#########################')

[1m CUST_prod_cat [0m : 
                              item_A                          item_B  freqAB  \
77             EXIDE LIFE TERM PLAN            EXIDE LIFE TERM PLAN       6   
365           EXIDE UTTAM JEEVAN SP           EXIDE UTTAM JEEVAN SP       6   
570        EXIDE PROSPERING LIFE SP        EXIDE PROSPERING LIFE SP      18   
521        ASSURED GAIN PLUS SINGLE        ASSURED GAIN PLUS SINGLE      17   
718  EXIDE TERM LIFE PLUS - REGULAR  EXIDE TERM LIFE PLUS - REGULAR      23   
..                              ...                             ...     ...   
418    EXIDE MY TERM INSURANCE - RP           INCOME ADVANTAGE PLAN       8   
442  REASSURING LIFE ENDOWMENT PLAN           INCOME ADVANTAGE PLAN       8   
594  REASSURING LIFE ENDOWMENT PLAN          NEW CREATING LIFE - RP       5   
320           INCOME ADVANTAGE PLAN  REASSURING LIFE ENDOWMENT PLAN       7   
710  CREATING LIFE CHILD PROTECTION           INCOME ADVANTAGE PLAN       5   

     supportAB  freqA  