# Import Libraries

In [None]:
#libraries
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Read data

**Dataset used: `Merged_clean_and_dropped` — the merged, cleaned data in which continuous variables were imputed with the mean and the remaining records containing NaN were dropped.**
**NOTE: data munging is done later on, to isolate the rows where the target variable = 1.**

In [None]:
#read data
df_merged = pd.read_csv('Merged_clean_and_dropped.csv')

#descriptive stats for reference
# show floats with two decimals from here on (this option stays set for later cells)
pd.options.display.float_format = "{:.2f}".format
# lift the column limit only inside this block so wide frames are shown in full
with pd.option_context('display.max_columns', None):
    display(df_merged.head())
    display(df_merged.describe())
    display(df_merged.shape)
    # info() prints its report itself and returns None, so wrapping it in
    # display() only added a stray "None" to the cell output
    df_merged.info()

# Dataset preparation for apriori

In [None]:
#preparing dataset for mlxtend

#clean
# text columns whose values get leading/trailing whitespace stripped
product_cols = ['Product_Description', 'Product_brief_category',
                'Product_Club_Manual', 'CUST_prod_cat', 'Par_NonPar']

# keep only rows with target == 1; .copy() gives an independent frame so the
# column assignments below do not raise SettingWithCopyWarning
df_merged = df_merged[df_merged['target'] == 1].copy()
for col in product_cols:
    df_merged[col] = df_merged[col].str.strip()

# rows without a policy owner cannot be assigned to a basket
df_merged = df_merged.dropna(axis=0, subset=['policy_owner_number'])
df_merged['policy_owner_number'] = df_merged['policy_owner_number'].astype('str')
# The former replace({'target': 0}, 1) step was dropped: after the target == 1
# filter above no zero can remain in 'target', so it never changed anything.


In [None]:
#rank
# 1-based running position of each row inside its policy_owner_number group,
# following the current row order of df_merged
df_merged['rank'] = df_merged.groupby('policy_owner_number').cumcount().add(1)

In [None]:
#display majority

# rows with rank == 1, selected once instead of once per column
first_rows = df_merged[df_merged['rank'] == 1]

# for every product-related column print the five largest relative frequencies
for col in ['Product_Description', 'Product_brief_category', 'Product_Club_Manual',
            'CUST_prod_cat', 'Par_NonPar']:
    # \033[1m ... \033[0m are ANSI codes that render the column name in bold
    print(f'\033[1m {col} \033[0m : \n', first_rows[col].value_counts(normalize=True).head(5))
    print('#########################')

In [None]:
#create separate baskets (datasets) for every product related variable
def build_basket(df, item_col):
    """Build a policy_owner_number x item table for one product column.

    Sums 'target' for every (policy_owner_number, item_col) pair, pivots the
    item values into columns and fills combinations that never occur with 0.

    Parameters
    ----------
    df : DataFrame with 'policy_owner_number', 'target' and `item_col` columns.
    item_col : name of the column whose values become the basket columns.

    Returns
    -------
    DataFrame indexed by 'policy_owner_number', one column per item value.
    """
    return (df
            .groupby(['policy_owner_number', item_col])['target']
            .sum().unstack().reset_index().fillna(0)
            .set_index('policy_owner_number'))


basket_desc = build_basket(df_merged, 'Product_Description')
basket_pcm = build_basket(df_merged, 'Product_Club_Manual')
basket_pbc = build_basket(df_merged, 'Product_brief_category')
basket_cpc = build_basket(df_merged, 'CUST_prod_cat')
basket_pnp = build_basket(df_merged, 'Par_NonPar')

In [None]:
#displaying dataframes for reference
# first rows of every basket, in the same order as they were built
for basket_preview in (basket_desc, basket_pcm, basket_pbc, basket_cpc, basket_pnp):
    display(basket_preview.head())

In [None]:
# convert values to integers 0 or 1
def encode_units(x):
    """Binarise a basket count: 1 when ``x`` is at least 1, otherwise 0.

    The earlier version had no branch for values strictly between 0 and 1 or
    for NaN, so those fell through and came back as ``None``. Every numeric
    input now maps to 0 or 1; the results for ``x <= 0`` and ``x >= 1`` are
    unchanged.
    """
    return 1 if x >= 1 else 0
    
# Binarise every basket: 1 where the summed count is at least 1, otherwise 0.
# A vectorised comparison replaces DataFrame.applymap, which newer pandas
# releases deprecate; for counts <= 0 or >= 1 the result matches encode_units.
basket_set_desc = basket_desc.ge(1).astype(int)
basket_set_pcm = basket_pcm.ge(1).astype(int)
basket_set_pbc = basket_pbc.ge(1).astype(int)
basket_set_cpc = basket_cpc.ge(1).astype(int)
basket_set_pnp = basket_pnp.ge(1).astype(int)


In [None]:
# 0/1-encoded Product_Description basket; as the last expression of the cell it is rendered by the notebook
basket_set_desc

# Market Basket Analysis

#### 1. Product Description

In [None]:
# (rows, columns) of the encoded Product_Description basket
basket_set_desc.shape

In [None]:


# Frequent itemsets for Product_Description.
# Author's note: no association rules are generated unless min_support is
# reduced to 0.03; the value used here is 0.0004.
freqlist1 = apriori(
    basket_set_desc,
    min_support=0.0004,
    use_colnames=True,
    max_len=None,
    verbose=0,
    low_memory=False,
)

display(freqlist1.head())

In [None]:
# derive rules from the Product_Description itemsets with a lift threshold of 1,
# then list them from highest to lowest lift
rules1 = association_rules(freqlist1, min_threshold=1, metric='lift')
rules1.sort_values(by='lift', ascending=False)

In [None]:
# Show the rules whose antecedent is exactly one of the two selected
# Product_Description labels (per the author's note, the two categories that
# account for most of the data).
# NOTE(review): 'GURANTEED INCOME' must match the label in the data exactly —
# confirm the spelling against the Product_Description values.
for antecedent in ({'GURANTEED INCOME'}, {'NEW FULFILLING LIFE ANTI. W/L'}):
    display(rules1[rules1['antecedents'] == antecedent])

#### 2. Product Club Manual

In [None]:


# Frequent itemsets for Product_Club_Manual.
# Author's note: no association rules are generated unless min_support is
# reduced to 0.03; the value used here is 0.0005.
freqlist2 = apriori(
    basket_set_pcm,
    min_support=0.0005,
    use_colnames=True,
    max_len=None,
    verbose=0,
    low_memory=False,
)

display(freqlist2.head())

In [None]:
# derive rules from the Product_Club_Manual itemsets with a lift threshold of 1,
# then list them from highest to lowest lift
rules2 = association_rules(freqlist2, min_threshold=1, metric='lift')
rules2.sort_values(by='lift', ascending=False)

In [None]:
# Show the rules whose antecedent is exactly one of five selected
# Product_Club_Manual labels (the original note spoke of the top categories,
# which account for most of the data).
# 'display.max_colwidth' is set to None for "no truncation": the former value
# -1 was deprecated and is rejected by current pandas releases.
with pd.option_context('display.max_columns', None, 'display.width', None, 'display.max_colwidth', None):
    for label in ('GUARANTEED INCOME', 'SECURED INCOME', 'FULFILLING LIFE',
                  'CREATING LIFE', 'STAR LIFE'):
        display(rules2[rules2['antecedents'] == {label}])

#### 3. Product Brief Category 

In [None]:


# Frequent itemsets for Product_brief_category.
# Author's note: no association rules are generated until min_support is
# reduced to 0.01; the value used here is 0.003.
freqlist3 = apriori(
    basket_set_pbc,
    min_support=0.003,
    use_colnames=True,
    max_len=None,
    verbose=0,
    low_memory=False,
)

display(freqlist3.head())

In [None]:
# derive rules from the Product_brief_category itemsets with a lift threshold of 1,
# then list them from highest to lowest lift
rules3 = association_rules(freqlist3, min_threshold=1, metric='lift')
rules3.sort_values(by='lift', ascending=False)

In [None]:
#display rules for top 2 categories. Since they account for most percentage of data
# This section analyses Product_brief_category, so the rules must come from
# rules3; the cell previously filtered rules1 (the Product_Description rules).
display(rules3[rules3['antecedents'] == {'TRADITIONAL'}])
display(rules3[rules3['antecedents'] == {'ULIP'}])

#### 4. CUST_prod_cat

In [None]:


# Frequent itemsets for CUST_prod_cat.
# Author's note: no association rules are generated for any value of
# min_support; the value used here is 0.000000001 and the whole itemset table
# is displayed.
freqlist4 = apriori(
    basket_set_cpc,
    min_support=0.000000001,
    use_colnames=True,
    max_len=None,
    verbose=0,
    low_memory=False,
)

display(freqlist4)

In [None]:
# derive rules from the CUST_prod_cat itemsets with a lift threshold of 1,
# then list them from highest to lowest lift
rules4 = association_rules(freqlist4, min_threshold=1, metric='lift')
rules4.sort_values(by='lift', ascending=False)

#### 5. Par Non_par

In [None]:


# Frequent itemsets for Par_NonPar.
# Author's note: no association rules are generated unless min_support is
# reduced to 0.01; the value used here is 0.003.
freqlist5 = apriori(
    basket_set_pnp,
    min_support=0.003,
    use_colnames=True,
    max_len=None,
    verbose=0,
    low_memory=False,
)

display(freqlist5.head())

In [None]:
# derive rules from the Par_NonPar itemsets with a lift threshold of 1,
# then list them from highest to lowest lift
rules5 = association_rules(freqlist5, min_threshold=1, metric='lift')
rules5.sort_values(by='lift', ascending=False)

In [None]:
#display rules for top 2 categories. Since they account for most percentage of data
# This section analyses Par_NonPar, so the rules must come from rules5; the
# cell previously filtered rules1 (the Product_Description rules).
display(rules5[rules5['antecedents'] == {'PAR'}])
display(rules5[rules5['antecedents'] == {'NON-PAR'}])