In [1]:
%pylab inline
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity
from sklearn.cross_validation import KFold

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the pipe-delimited input sample and index it by customer number
spend_path = 'C:\\notbackedup\\recommender\\IR\\20160724 sample 100k 3plus.txt'

# columns treated as irrelevant for the analysis and removed straight after loading
cols_to_drop = ['AGE_IN_YRS','GNDR_CD_SRC','A_SPEND_CLOTHING','A_SPEND_DIGITAL','A_SPEND_GROCERIES',
                'A_SPEND_MUSIC','A_SPEND_TRANSPORT','A_SPEND_TRAVEL']

df_spend = pd.read_csv(spend_path, delimiter='|').set_index('CUST_ID_SRC_CUST_NO')
df_spend = df_spend.drop(cols_to_drop, axis=1)

## remaining column names and how many of them there are
l_cols_merchants = df_spend.columns
n_merchants = len(l_cols_merchants)

# binarize the spend frequencies
## Prepare data
# convert merchant counts to binary flags
def binarise(x):
    """Return 1 when x is strictly greater than 0, otherwise 0."""
    return 1 if x > 0 else 0
    
# Vectorised form of binarise(): values strictly greater than 0 become 1,
# everything else (including missing values) becomes 0. This avoids one Python
# function call per cell.
# NOTE(review): assumes every remaining column is numeric -- confirm.
df_spend = (df_spend > 0).astype('int64')


In [None]:
# Display the first rows of df_spend as a quick sanity check
df_spend.head()

## Implementation 

In [None]:
## Define helper function(s)

# function to return index of top N values from a Numpy array
def topindexes(a,N):
    """Return the positions of the N largest values of a 1-D array, largest first."""
    ascending = np.argsort(a)
    descending = ascending[::-1]
    return descending[:N]


### Benchmarks

1. Recommend merchants at random
2. Recommend merchants in descending order of popularity

In [None]:
# define function that generates recommendations using random approach
def f_benchmark_random(ay_test):
    """Random benchmark: one uniform random score in [0, 1) per test row and column.

    Only the shape of `ay_test` is used; the result is a float array of that shape.
    """
    n_rows, n_cols = ay_test.shape[0], ay_test.shape[1]
    scores = np.random.rand(n_rows, n_cols)
    return scores.astype(float)


In [None]:
# define function that generates recommendations using popularity approach
def f_benchmark_popularity(ay_train, ay_test):
    """Popularity benchmark: score each merchant by its column total in the training data.

    Parameters
    ----------
    ay_train : 2-D array, one row per training record, one column per merchant.
    ay_test : 2-D array; only its number of rows is used.

    Returns
    -------
    Float array with one row per test record and one column per training
    column. Every row holds the same per-merchant training totals.
    """
    # frequency count of each merchant in the train dataset
    k_freq_merchants = np.asarray(ay_train).sum(axis=0)

    # repeat the frequency vector once per test record; the width comes from the
    # training data itself rather than from a notebook-level global
    recmatrix = np.tile(k_freq_merchants, (ay_test.shape[0], 1))

    return recmatrix.astype(float)


In [None]:
#### Run 5-fold cross-validation for the benchmarks and measure accuracy
# For every fold: hide one randomly chosen merchant per test record, score all
# merchants with each benchmark, and record how often the hidden merchant is
# among the top 3 recommendations.

# prepare empty lists to save results (one top-3 proportion per fold)
l_results_benchmark_random = []
l_results_benchmark_popularity = []

# run 5-fold xvalidation and collate test statistics
# NOTE(review): shuffle is not requested here, so random_state presumably has
# no effect on the folds -- confirm against the installed scikit-learn version.
k_fold = KFold(len(df_spend), 5, random_state = 1)

for k, (train, test) in enumerate(k_fold):
    print("Processing fold %s ..." % k)

    # convert dataframes to numpy arrays (for speed); np.array copies, so the
    # masking below does not touch df_spend
    ay_train = np.array(df_spend.iloc[train])
    ay_test = np.array(df_spend.iloc[test])

    # mask one randomly selected merchant from each record in test.
    # A generator seeded per fold makes the masked merchants repeatable between runs.
    # Assumes every test record has at least one merchant flagged; choice()
    # raises on an empty candidate list.
    rng_mask = np.random.RandomState(k)
    masklist = [rng_mask.choice(np.flatnonzero(row_flags)) for row_flags in ay_test]

    # mask those merchant entries with 0's
    ay_test[np.arange(ay_test.shape[0]), masklist] = 0

    # generate recommendations for both benchmarks on the same masked test set
    fold_runs = [
        (l_results_benchmark_random, f_benchmark_random(ay_test)),
        (l_results_benchmark_popularity, f_benchmark_popularity(ay_train, ay_test)),
    ]

    for l_results, recmatrix in fold_runs:
        # 1. drop recommendations for merchants they already shop at
        #    (NaN sorts last in argsort, so these can never rank in the top 3
        #    ahead of a real score)
        recmatrix[ay_test == 1] = np.nan

        # 2. In what % of cases is the masked merchant in the top 3 recommendations?
        top3recs = np.argsort(-recmatrix)[:, :3]   # select top 3 recs for each record
        hit = (top3recs == np.asarray(masklist)[:, None]).any(axis=1)
        prop_correct_top3 = np.true_divide(hit.sum(), len(ay_test))
        l_results.append(prop_correct_top3)


## Print results
print("Overall results: prop. masked merchants in top 3 recommendations:")
print("Benchmark random:  %s" % np.mean(l_results_benchmark_random))
print("Benchmark popularity:  %s" % np.mean(l_results_benchmark_popularity))


### Collaborative filter 

In [3]:
# Collaborative filter function
# This calculates recommendations using both summed AND averaged similarities
# and returns a matrix for each one

def f_recommender(ay_train, ay_test, n_neighbours):
    """Score every merchant for every test customer with a neighbourhood-based filter.

    Similarity between a training and a test customer is 1 minus their Jaccard
    distance. For each test customer only the `n_neighbours` most similar
    training customers keep their similarity; all others are set to 0.

    Parameters
    ----------
    ay_train : 2-D array, one row per training customer, one column per merchant
        (presumably 0/1 flags -- confirm with the caller).
    ay_test : 2-D array with the same columns, one row per test customer.
    n_neighbours : int
        Neighbourhood size. When it is at least the number of training rows,
        every training customer is kept as a neighbour (np.argpartition would
        otherwise raise because the requested position is out of bounds).

    Returns
    -------
    (recmatrix, recmatrix_avg) : two float arrays, each with one row per test
        customer and one column per merchant.
        recmatrix     -- similarity-weighted sum of the neighbours' merchant values.
        recmatrix_avg -- recmatrix divided by the matching sum with every
                         neighbour weighted 1; NaN where that denominator and
                         the numerator are both 0.
    """

    ## calculate similarity matrix using Jaccard distance
    # rows = training customers, columns = test customers
    simmatrix = 1 - pairwise_distances(ay_train, ay_test, metric='jaccard')

    # add one tiny random offset (the same value for every entry). It does not
    # change the ordering, but it makes zero similarities positive (unless the
    # draw is exactly 0), so the `> 0` test further down also counts selected
    # neighbours whose Jaccard similarity is 0.
    simmatrix = simmatrix+np.random.random()/10000

    ## limit to recommendations from N nearest neighbours
    n_train = simmatrix.shape[0]
    if n_neighbours < n_train:
        for i in range(simmatrix.shape[1]):
            recs = simmatrix[:, i]   # view: writes go straight into simmatrix
            indices = np.argpartition(-recs, n_neighbours)[:n_neighbours]
            mask = np.ones(recs.size,dtype=bool)
            mask[indices] = False
            recs[mask] = 0

    ## generate recommendations for SUM
    # weighted sum of merchant presence/absence * similarity score
    recmatrix = np.dot(simmatrix.T, ay_train)

    # average ONLY across neighbourhood
    # to get count of merchants in neighbourhood
    # 1. convert simmatrix to boolean flags = flags only their k neighbours
    # 2. take dot product with ay_train
    simmatrix_bool = simmatrix.copy()
    simmatrix_bool[simmatrix>0]=1
    denominator = np.dot(simmatrix_bool.T, ay_train)

    # a zero denominator is an expected case here, so silence the
    # divide-by-zero / invalid-value warnings; the resulting values are unchanged
    with np.errstate(divide='ignore', invalid='ignore'):
        recmatrix_avg = np.true_divide(recmatrix, denominator)

    # return recommendation scores for all test customers, for all merchants
    return recmatrix.astype(float), recmatrix_avg.astype(float)


In [7]:
# train-test implementation: 5-fold cross-validation of the collaborative filter
# For every fold: hide one randomly chosen merchant per test record, score all
# merchants with f_recommender, and record how often the hidden merchant is
# among the top 3 recommendations (separately for the sum and average scores).

# set size of neighbourhood for collaborative filter
n_neighbourhood = 500

# prepare empty lists to save results (one top-3 proportion per fold)
l_results_sum = []
l_results_avg = []

# run 5-fold xvalidation and collate test statistics
# NOTE(review): shuffle is not requested here, so random_state presumably has
# no effect on the folds -- confirm against the installed scikit-learn version.
k_fold = KFold(len(df_spend), 5, random_state = 1)

for k, (train, test) in enumerate(k_fold):
    print("Processing fold %s ..." % k)

    # convert dataframes to numpy arrays (for speed); np.array copies, so the
    # masking below does not touch df_spend
    ay_train = np.array(df_spend.iloc[train])
    ay_test = np.array(df_spend.iloc[test])

    # mask one randomly selected merchant from each record in test.
    # A generator seeded per fold makes the masked merchants repeatable between runs.
    # Assumes every test record has at least one merchant flagged; choice()
    # raises on an empty candidate list.
    rng_mask = np.random.RandomState(k)
    masklist = [rng_mask.choice(np.flatnonzero(row_flags)) for row_flags in ay_test]

    # mask those merchant entries with 0's
    ay_test[np.arange(ay_test.shape[0]), masklist] = 0

    # generate recommendations for SUM and AVERAGE
    recmatrix_sum, recmatrix_avg = f_recommender(ay_train, ay_test, n_neighbourhood)

    ## Assess accuracy of both sets of recommendations
    for l_results, recmatrix in [(l_results_sum, recmatrix_sum), (l_results_avg, recmatrix_avg)]:
        # 1. drop recommendations for merchants they already shop at
        #    (NaN sorts last in argsort, so these can never rank in the top 3
        #    ahead of a real score)
        recmatrix[ay_test == 1] = np.nan

        # 2. In what % of cases is the masked merchant in the top 3 recommendations?
        top3recs = np.argsort(-recmatrix)[:, :3]   # select top 3 recs for each record
        hit = (top3recs == np.asarray(masklist)[:, None]).any(axis=1)
        prop_correct_top3 = np.true_divide(hit.sum(), len(ay_test))
        l_results.append(prop_correct_top3)

## Print results
# The labels name the two collaborative-filter scores; they were previously
# printed under the benchmark names.
print("Overall results: prop. masked merchants in top 3 recommendations:")
print("Collaborative filter (sum):  %s" % np.mean(l_results_sum))
print("Collaborative filter (average):  %s" % np.mean(l_results_avg))


Processing fold 0 ...
Processing fold 1 ...
Processing fold 2 ...
Processing fold 3 ...
Processing fold 4 ...
Overall results: prop. masked merchants in top 3 recommendations:
Benchmark random:  0.33301
Benchmark popularity:  0.00341


500

Benchmark random:  0.33301
Benchmark popularity:  0.00341



## Charts and statistics

In [None]:
# % of customers at each merchant
# (the column mean of df_spend; this is a share of customers if the columns
# hold 0/1 flags)
pd.set_option('display.max_rows', 500)
# describe() labels its rows by statistic; pick the 'mean' row by name instead
# of relying on its position
print(df_spend.describe().loc['mean'])  # just means

In [None]:
#### Distributions of merchants and of recommendations
# Compare distributions of
# [1] merchant penetration of overall sample
# [2] merchants in sum method recommendations (k=100)
# [3] merchants in average method recommendations (k=20000)
#
# NOTE: `numpy` and `array` below are not imported in this cell; they come from
# the namespace populated by `%pylab inline` at the top of the notebook.

# randomly split 80-20% into two datasets
# NOTE(review): df_spend is passed as a DataFrame, so ay_train / ay_test are
# presumably DataFrames here rather than numpy arrays -- confirm.
from sklearn.cross_validation import train_test_split
ay_train, ay_test = train_test_split(df_spend, test_size=0.2, random_state=1)

## raw frequency of merchants in overall sample
# column sums of df_spend, one row per merchant column
merch_count = pd.DataFrame(df_spend.sum())    

## sum, k100
# generate recommendations (the second return value, the average scores, is not used)
recmatrix_sum, ignore = f_recommender(ay_train, ay_test, 100)
# blank out merchants the test record already has, so they cannot be recommended
recmatrix_sum[numpy.where(ay_test==1)]=np.nan
top3recs_sum = np.argsort(-recmatrix_sum)[:,:3]   # select top 3 recs for each record

## avg, k=20k
# second call with a larger neighbourhood; only the average scores are kept
ignore, recmatrix_avg = f_recommender(ay_train, ay_test, 20000)
recmatrix_avg[numpy.where(ay_test==1)]=np.nan
top3recs_avg = np.argsort(-recmatrix_avg)[:,:3]   # select top 3 recs for each record

### Combine all into a pandas dataframe
# translate recommended column positions into merchant labels, then count how
# often each label appears among the top-3 recommendations (sum method)
a = array(merch_count.index[top3recs_sum.flatten()])
unique, counts = numpy.unique(a, return_counts=True)
b = dict(zip(unique, counts))
c = pd.DataFrame.from_dict(b,orient='index')

# same counting for the average method
a = array(merch_count.index[top3recs_avg.flatten()])
unique, counts = numpy.unique(a, return_counts=True)
b = dict(zip(unique, counts))
c2 = pd.DataFrame.from_dict(b,orient='index')

# tag each table with the series number used in the list at the top of this cell
merch_count['series'] = 1
c['series'] = 2
c2['series'] = 3

# stack the three tables; 'k' is the count column, 'series' identifies the source
dfmerge = pd.concat([merch_count,c,c2], axis=0)
dfmerge.columns = ['k','series']


In [None]:
# export for Tableau: write the combined table to a CSV file
plot_csv_path = 'C:\\notbackedup\\recommender\\IR\\plot.csv'
dfmerge.to_csv(plot_csv_path)

## Generate bespoke recommendations

In [21]:
# Specify merchants which hypothetical customer shops at
# (alternative inputs kept for quick experiments)
#list_merchant_input = ['F_WOTIF','F_QANTAS','F_AIRBNB'] 
#list_merchant_input = ['F_NETFLIX'] 
#list_merchant_input = ['F_RED_ROOSTER']
#list_merchant_input = ['F_BABY_BUNTING','F_PUMPKIN_PATCH'] 
list_merchant_input = ['F_TICKETMASTER']

print("Input merchants:  %s" % (list_merchant_input,))


## Create a dummy record to use as test dataset
tdf = df_spend.iloc[0]
tdf = tdf * 0  # reset all values to 0 (this also yields a new Series, so df_spend is untouched)
# input some values for selected merchants
tdf[list_merchant_input] = 1
# duplicate so we get a matrix of two identical customers
tdf = pd.concat([tdf, tdf], axis=1)
ay_train = np.array(df_spend)
ay_test = np.array(tdf).T   # two rows, one column per merchant

# merchants the hypothetical customer already shops at; never recommended
already_shops = (ay_test == 1)

## generate recommendations for three different settings:
## (score type, neighbourhood size, label used in the printout)
for approach, n_neighbourhood, k_label in [('sum', 500, 'k=500'),
                                           ('average', 10, 'k=10'),
                                           ('average', 20000, 'k=20k')]:
    recmatrix_sum, recmatrix_avg = f_recommender(ay_train, ay_test, n_neighbourhood)

    # drop recommendations for merchants they already shop at
    recmatrix_sum[already_shops] = np.nan
    recmatrix_avg[already_shops] = np.nan

    # get top 10 & print merchants (both test rows are identical, so show the first)
    recmatrix = recmatrix_sum if approach == 'sum' else recmatrix_avg
    top10recs = np.argsort(-recmatrix)[:, :10]
    top10names = df_spend.columns.values[top10recs[0]]
    print("Top recommendations from %s approach, %s:  %s" % (approach, k_label, top10names))





Input merchants:  ['F_TICKETMASTER']
Top recommendations from sum approach, k=500:  ['F_BIG_W' 'F_TELSTRA' 'F_BUNNINGS' 'F_KMART' 'F_CALTEX' 'F_PAYPAL'
 'F_MYER' 'F_TARGET' 'F_MCDONALDS' 'F_DAN_MURPHY']
Top recommendations from average approach, k=10:  ['F_TAB' 'F_TICKETEK' 'F_QANTAS' 'F_BIG_W' 'F_JETSTAR' 'F_VILLAGE_CINEMAS'
 'F_BUNNINGS' 'F_TELSTRA' 'F_OPTUS' 'F_DAN_MURPHY']
Top recommendations from average approach, k=20k:  ['F_TICKETEK' 'F_AGODA_COM' 'F_WOTIF' 'F_KOBO' 'F_LASTMINUTE_COM'
 'F_LITENEASY' 'F_BOOKDEPOSITORY' 'F_ARBONNE' 'F_GROUPON' 'F_SPOTIFY']


### Merchant correlation matrix


In [None]:
## Merchant-by-merchant similarity: 1 minus the Jaccard distance
# transpose so that every merchant column becomes one observation
merchant_flags = df_spend.T
simmatrix = 1 - pairwise_distances(merchant_flags, merchant_flags, metric='jaccard')

# label rows and columns with the merchant names; missing similarities become 0
sdf = pd.DataFrame(simmatrix, index=df_spend.columns, columns=df_spend.columns).fillna(0)

# plotdf is a second name for the same frame (no copy is made)
plotdf = sdf


In [None]:
# Zero the diagonal (row number == column number, i.e. a merchant paired with
# itself) so that it does not show up in the plot
for idx in range(len(plotdf)):
    plotdf.iat[idx, idx] = 0
corrdf_plot = plotdf

# Draw the matrix as a heatmap
fig, ax = plt.subplots()
heatmap = ax.pcolor(corrdf_plot, cmap=plt.cm.Blues, alpha=0.8)

# Figure size, and no frame around the axes
fig = plt.gcf()
fig.set_size_inches(30, 30)
ax.set_frame_on(False)

# Major ticks in the middle of each cell
n_rows, n_cols = corrdf_plot.shape[0], corrdf_plot.shape[1]
ax.set_yticks(0.5 + np.arange(n_rows), minor=False)
ax.set_xticks(0.5 + np.arange(n_cols), minor=False)

# Table-like layout: first row at the top, column labels above the plot
ax.invert_yaxis()
ax.xaxis.tick_top()

# Tick labels: x labels come from sdf's index, y labels from the plotted frame's index
labels = sdf.index
ax.set_xticklabels(labels, minor=False)
ax.set_yticklabels(corrdf_plot.index, minor=False)

# Vertical x labels, no grid
plt.xticks(rotation=90)
ax.grid(False)

# Hide the tick marks on both axes (the labels stay)
ax = plt.gca()
for tick_axis in (ax.xaxis, ax.yaxis):
    for t in tick_axis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False

In [None]:
# Same heatmap, drawn with a logarithmic colour scale

from matplotlib.colors import LogNorm

# Zero the diagonal (row number == column number, i.e. a merchant paired with
# itself) so that it does not show up in the plot
for idx in range(len(plotdf)):
    plotdf.iat[idx, idx] = 0

# shift every value by 0.1 so that the log scale has no zeros to deal with
corrdf_plot = plotdf + 0.1

# Colour normalisation spanning the smallest to the largest plotted value
plot_values = corrdf_plot.values
log_norm = LogNorm(vmin=plot_values.min(), vmax=plot_values.max())

# Draw the matrix as a heatmap
fig, ax = plt.subplots()
heatmap = ax.pcolor(corrdf_plot, norm=log_norm, cmap=plt.cm.Blues, alpha=0.8)

# Figure size, and no frame around the axes
fig = plt.gcf()
fig.set_size_inches(30, 30)
ax.set_frame_on(False)

# Major ticks in the middle of each cell
n_rows, n_cols = corrdf_plot.shape[0], corrdf_plot.shape[1]
ax.set_yticks(0.5 + np.arange(n_rows), minor=False)
ax.set_xticks(0.5 + np.arange(n_cols), minor=False)

# Table-like layout: first row at the top, column labels above the plot
ax.invert_yaxis()
ax.xaxis.tick_top()

# Tick labels: x labels come from sdf's index, y labels from the plotted frame's index
labels = sdf.index
ax.set_xticklabels(labels, minor=False)
ax.set_yticklabels(corrdf_plot.index, minor=False)

# Vertical x labels, no grid
plt.xticks(rotation=90)
ax.grid(False)

# Hide the tick marks on both axes (the labels stay)
ax = plt.gca()
for tick_axis in (ax.xaxis, ax.yaxis):
    for t in tick_axis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False
    
    


In [None]:
## THIS VERSION SCALED BY ROW


# set same-merchant intersections to 0 so not shown
# this is all intersections where rownum = column num
for i in range(len(plotdf)):
    plotdf.iat[i,i]=0

# Adjust so that each row is proportional to its biggest value.
# max(axis=1) gives one maximum per row and axis=0 aligns those maxima with the
# row labels. (Dividing by max(axis=0) with the default alignment scales each
# COLUMN by its own maximum instead.)
# Rows whose maximum is 0 produce 0/0 = NaN, which fillna turns back into 0.
corrdf_plot = plotdf.divide(plotdf.max(axis=1), axis=0).fillna(0)

# Plot it out
fig, ax = plt.subplots()
heatmap = ax.pcolor(corrdf_plot, cmap=plt.cm.Blues, alpha=0.8)

# Format
fig = plt.gcf()
fig.set_size_inches(30, 30)

# turn off the frame
ax.set_frame_on(False)

# put the major ticks at the middle of each cell
ax.set_yticks(np.arange(corrdf_plot.shape[0]) + 0.5, minor=False)
ax.set_xticks(np.arange(corrdf_plot.shape[1]) + 0.5, minor=False)

# want a more natural, table-like display
ax.invert_yaxis()
ax.xaxis.tick_top()

# Set the labels: x labels from sdf's index, y labels from the plotted frame's index
labels = sdf.index
ax.set_xticklabels(labels, minor=False)
ax.set_yticklabels(corrdf_plot.index, minor=False)

# rotate the x labels so they are readable
plt.xticks(rotation=90)

ax.grid(False)

# Turn off all the tick marks (the labels stay)
ax = plt.gca()

for t in ax.xaxis.get_major_ticks():
    t.tick1On = False
    t.tick2On = False
for t in ax.yaxis.get_major_ticks():
    t.tick1On = False
    t.tick2On = False