# Market Basket analysis using ranking

## Import Libraries

In [1]:
# Import the modules used in this notebook.
# More imports may be added here later as the need arises.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Show every column when a DataFrame is displayed (None = no column limit).
# Uses the documented top-level pd.set_option rather than reaching it through
# the incidental pd.pandas attribute.
pd.set_option('display.max_columns', None)

## Read data

In [3]:
# Load the source CSV into a DataFrame and report its (rows, columns) shape
source_csv = 'Merged_clean.csv'
dataset_orig = pd.read_csv(source_csv)
print(dataset_orig.shape)

(812914, 36)


## Get subset of only multi policy holders

In [4]:
# Keep only the records whose 'Freq' is greater than 1 (customers with more
# than one policy), ordered by customer and then by 'RCD'.
# NOTE(review): the .info() output lists RCD as an object column, so this sort
# is lexicographic; it is chronological only if the strings are in a sortable
# format such as YYYY-MM-DD - confirm.
has_multiple_policies = dataset_orig['Freq'] > 1
df_multi = dataset_orig.loc[has_multiple_policies].sort_values(by=['policy_owner_number', 'RCD'])

## Within each customer group, assign a rank order to the policies

In [5]:
# Number each customer's rows 1, 2, 3, ... in their current (sorted) order;
# cumcount() is zero-based, hence the +1.
zero_based_position = df_multi.groupby('policy_owner_number').cumcount()
df_multi['rank'] = zero_based_position + 1

# Summarise columns, non-null counts and dtypes of the ranked subset
df_multi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 109476 entries, 3 to 812911
Data columns (total 37 columns):
policy_owner_number       109476 non-null int64
policy_number             109476 non-null int64
RCD                       109476 non-null object
premium                   109476 non-null int64
afyp                      109476 non-null int64
sum_assured               109476 non-null int64
Owner_salary              108071 non-null float64
Marital_status            108859 non-null object
Own_Education             99288 non-null object
Own_Edu                   99288 non-null object
Own_gender                108859 non-null object
own_occupation            108735 non-null object
Occupation                108632 non-null object
Occupation_Group          108632 non-null object
billing_frequency         109476 non-null int64
ECS_flag                  109476 non-null object
PPT                       109476 non-null int64
risk_status               109476 non-null object
contract_type  

In [9]:
# Display, for each product attribute, the five most frequent values (as
# proportions) among rank-1 rows, i.e. each customer's first policy.
# The rank-1 filter is computed once and reused instead of being rebuilt for
# every attribute.
first_policies = df_multi[df_multi['rank'] == 1]

for column in ['Product_Description',
               'Product_brief_category',
               'Product_Club_Manual',
               'CUST_prod_cat',
               'Par_NonPar']:
    # \033[1m ... \033[0m renders the column name in bold on ANSI-aware outputs
    print('\033[1m {} \033[0m : \n'.format(column),
          first_policies[column].value_counts(normalize=True).head(5))
    print('#########################')

[1m Product_Description [0m : 
 GURANTEED INCOME                 0.231679
NEW FULFILLING LIFE ANTI. W/L    0.157761
SECURED INCOME INSURANCE PLUS    0.104750
EXIDE SECURED INCOME INS - RP    0.064122
EXIDE STAR LIFE                  0.045632
Name: Product_Description, dtype: float64
#########################
[1m Product_brief_category [0m : 
 TRADITIONAL    0.874385
ULIP           0.050530
TERM           0.040522
PENSION        0.025042
HEALTH         0.008906
Name: Product_brief_category, dtype: float64
#########################
[1m Product_Club_Manual [0m : 
 GUARANTEED INCOME    0.231679
SECURED INCOME       0.180195
FULFILLING LIFE      0.157761
CREATING LIFE        0.093215
STAR LIFE            0.045632
Name: Product_Club_Manual, dtype: float64
#########################
[1m CUST_prod_cat [0m : 
 TRADITIONAL               0.689949
TRADITIONALULIP           0.091158
TRADITIONALTERM           0.069975
TRADITIONALPENSION        0.041539
TRADITIONALULIPPENSION    0.023113
Name:

## Find the customers whose first policy was the most common one, and get their second policy

In [15]:
# For each product attribute: take the customers whose first policy (rank 1)
# has the given value, then collect those customers' second policies (rank 2).
# E.g. if the first policy is GURANTEED INCOME, get the second bought policy.
# The values below are hard-coded per attribute; the spellings
# ('GURANTEED' vs 'GUARANTEED') are the literal values stored in each column.

def _first_policy_owners(policies, column, value):
    """Return policy_owner_number for rank-1 rows where `column` equals `value`.

    policies : DataFrame with 'rank', 'policy_owner_number' and `column` columns.
    column   : name of the attribute column to test.
    value    : value the rank-1 row must have in `column`.
    """
    is_match = (policies['rank'] == 1) & (policies[column] == value)
    return policies.loc[is_match, 'policy_owner_number']


def _second_policies(policies, owner_ids):
    """Return the rank-2 rows of `policies` whose policy_owner_number is in `owner_ids`."""
    is_match = (policies['rank'] == 2) & policies['policy_owner_number'].isin(owner_ids)
    return policies[is_match]


cust_ids_Product_Description = _first_policy_owners(
    df_multi, 'Product_Description', 'GURANTEED INCOME')
df_multi_2_Product_Description = _second_policies(
    df_multi, cust_ids_Product_Description)

cust_ids_Product_brief_category = _first_policy_owners(
    df_multi, 'Product_brief_category', 'TRADITIONAL')
df_multi_2_Product_brief_category = _second_policies(
    df_multi, cust_ids_Product_brief_category)

cust_ids_Product_Club_Manual = _first_policy_owners(
    df_multi, 'Product_Club_Manual', 'GUARANTEED INCOME')
df_multi_2_Product_Club_Manual = _second_policies(
    df_multi, cust_ids_Product_Club_Manual)

cust_ids_CUST_prod_cat = _first_policy_owners(
    df_multi, 'CUST_prod_cat', 'TRADITIONAL')
df_multi_2_CUST_prod_cat = _second_policies(
    df_multi, cust_ids_CUST_prod_cat)

cust_ids_Par_NonPar = _first_policy_owners(
    df_multi, 'Par_NonPar', 'PAR')
df_multi_2_Par_NonPar = _second_policies(
    df_multi, cust_ids_Par_NonPar)

## Check the most common second policy for each attribute

In [16]:
# Print, per attribute, the single most frequent value (as a proportion)
# among the second-policy rows selected for that attribute.
second_policy_frames = [
    (df_multi_2_Product_Description, 'Product_Description'),
    (df_multi_2_Product_brief_category, 'Product_brief_category'),
    (df_multi_2_Product_Club_Manual, 'Product_Club_Manual'),
    (df_multi_2_CUST_prod_cat, 'CUST_prod_cat'),
    (df_multi_2_Par_NonPar, 'Par_NonPar'),
]
for second_policies, attribute in second_policy_frames:
    top_share = second_policies[attribute].value_counts(normalize=True).head(1)
    print(top_share)

GURANTEED INCOME    0.63143
Name: Product_Description, dtype: float64
TRADITIONAL    0.886289
Name: Product_brief_category, dtype: float64
GUARANTEED INCOME    0.63143
Name: Product_Club_Manual, dtype: float64
TRADITIONAL    1.0
Name: CUST_prod_cat, dtype: float64
PAR    0.686773
Name: Par_NonPar, dtype: float64


In [None]:
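# Summary of the results above.
# Row 1: most common value among first policies (rank 1) and its share.
# Row 2: most common value among second policies (rank 2) of the customers
#        whose first policy had the row-1 value, and its share.
# NOTE(review): CUST_prod_cat shows 100% in row 2, which suggests it is a
# customer-level attribute repeated on every policy of a customer rather than
# a per-policy one - confirm before interpreting that column.
#    Product_Description       Product_brief_category   Product_Club_Manual        CUST_prod_cat       Par_NonPar
#1   GURANTEED INCOME(23.17%)  TRADITIONAL(87.44%)      GUARANTEED INCOME(23.17%)  TRADITIONAL(69%)    PAR(55.05%)
#2   GURANTEED INCOME(63.14%)  TRADITIONAL(88.63%)      GUARANTEED INCOME(63.14%)  TRADITIONAL(100%)   PAR(68.68%)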