In [17]:
#############################
## Initialise
#############################

%pylab inline
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity
from sklearn.cross_validation import KFold


#############################
## Parameters
#############################

# Neighbourhood size for the collaborative filter: the number of most-similar
# training customers kept per test customer (passed to f_recommender as n_neighbours)
n_neighbourhood = 100

# Number of folds used for each k-fold cross-validation run
n_folds = 5

# Number of times the whole k-fold cross-validation is repeated
n_iter_kfold = 5


#############################
## Prepare data
#############################

# Load the pipe-delimited input file and use the customer number as row index
df_spend = pd.read_csv(
    'C:\\notbackedup\\recommender\\IR\\20160724 sample 100k 3plus.txt',
    delimiter='|'
).set_index('CUST_ID_SRC_CUST_NO')

# Remove the columns that are not used by the recommender
# (age, gender and the aggregated spend categories)
df_spend = df_spend.drop(
    ['AGE_IN_YRS', 'GNDR_CD_SRC', 'A_SPEND_CLOTHING', 'A_SPEND_DIGITAL',
     'A_SPEND_GROCERIES', 'A_SPEND_MUSIC', 'A_SPEND_TRANSPORT', 'A_SPEND_TRAVEL'],
    axis=1
)

## Keep the remaining column names (treated as the merchants) and their count
l_cols_merchants = df_spend.columns

n_merchants = len(l_cols_merchants)

# Binarise the spend frequencies:
# convert merchant counts to binary flags (1 = at least one visit, 0 = none)
def binarise(x):
    """Return 1 when x is strictly greater than zero, otherwise 0."""
    return 1 if x > 0 else 0
    
# Apply binarise to every cell, so df_spend holds only 0/1 values from here on
df_spend = df_spend.applymap(binarise)

#############################
## Helper functions
#############################

# function to return index of top N values from a Numpy array
def topindexes(a, N):
    """Return the indexes of the N largest values of a, largest value first."""
    # argsort is ascending, so reverse it before taking the first N entries
    descending = np.argsort(a)[::-1]
    return descending[:N]

#################################################################
## Functions for different methods of making recommendations
#################################################################

# Recommendations made at random
def f_benchmark_random(ay_test):
    """Benchmark recommender: give every (customer, merchant) cell a random score.

    Returns a float array with the same two dimensions as ay_test, filled
    with draws from np.random.rand (uniform on [0, 1)).
    """
    n_rows, n_cols = ay_test.shape[0], ay_test.shape[1]
    scores = np.random.rand(n_rows, n_cols)
    return scores.astype(float)


# Recommendations made in descending order of merchant popularity
def f_benchmark_popularity(ay_train, ay_test):
    """Benchmark recommender: score every merchant by its popularity in train.

    ay_train -- 2-D array of 0/1 merchant flags, one row per training customer
    ay_test  -- 2-D array of test customers; only its number of rows is used

    Returns a float array with one row per test customer and one column per
    merchant column of ay_train; every row holds the same column sums of
    ay_train.
    """
    # frequency count of each merchant in the train dataset
    k_freq_merchants = ay_train.sum(axis=0)

    # repeat the frequency vector once per test record; the shape comes from
    # the inputs themselves instead of the notebook-level n_merchants global
    recmatrix = np.tile(k_freq_merchants, (ay_test.shape[0], 1))

    return recmatrix.astype(float)


# Recommendations using collaborative filtering
# This calculates recommendations using both summed AND averaged similarities
# and returns a matrix for each one
def f_recommender(ay_train, ay_test, n_neighbours):
    """Score merchants for each test customer from its nearest train customers.

    ay_train     -- 2-D array of 0/1 merchant flags, one row per training customer
    ay_test      -- 2-D array of 0/1 merchant flags, one row per test customer
    n_neighbours -- number of most-similar training customers to keep per test
                    customer; values at or above the number of training rows
                    simply keep every training customer

    Returns a tuple (recmatrix, recmatrix_avg) of float arrays, each with one
    row per test customer and one column per merchant:
      recmatrix     -- sum of the neighbours' similarities over the neighbours
                       that have the merchant flag set
      recmatrix_avg -- the same sum divided by the number of neighbours that
                       have the flag set; 0 where no neighbour has it
    """

    ## calculate similarity matrix using Jaccard distance
    # rows are training customers, columns are test customers
    simmatrix = 1 - pairwise_distances(ay_train, ay_test, metric='jaccard')

    # add one tiny random constant to all values so that every selected
    # neighbour keeps a weight above zero (and is counted in the average),
    # even when its Jaccard similarity is 0; a single scalar is drawn, so the
    # ordering of the similarities does not change
    simmatrix = simmatrix + np.random.random()/10000

    ## limit to recommendations from N nearest neighbours
    # argpartition needs kth < number of rows, so clamp the neighbourhood and
    # skip the truncation entirely when every training customer is kept
    n_train = simmatrix.shape[0]
    n_keep = min(n_neighbours, n_train)
    if n_keep < n_train:
        for i in range(simmatrix.shape[1]):
            recs = simmatrix[:, i]      # view on the column of test customer i
            indices = np.argpartition(-recs, n_keep)[:n_keep]
            mask = np.ones(recs.size, dtype=bool)
            mask[indices] = False
            recs[mask] = 0              # zero everything outside the neighbourhood

    ## generate recommendations for SUM
    # weighted sum of merchant presence/absence * similarity score
    recmatrix = np.dot(simmatrix.T, ay_train)

    # average ONLY across neighbourhood
    # to get count of merchants in neighbourhood
    # 1. convert simmatrix to boolean flags = flags only their k neighbours
    # 2. take dot product with ay_train
    simmatrix_bool = simmatrix.copy()
    simmatrix_bool[simmatrix > 0] = 1
    denominator = np.dot(simmatrix_bool.T, ay_train)

    # divide only where at least one neighbour has the merchant; the other
    # cells stay 0 instead of becoming 0/0 = NaN with a RuntimeWarning
    recmatrix_avg = np.zeros(recmatrix.shape, dtype=float)
    np.true_divide(recmatrix, denominator, out=recmatrix_avg, where=denominator > 0)

    # return recommendation scores for all test customers, for all merchants
    return recmatrix.astype(float), recmatrix_avg.astype(float)



Populating the interactive namespace from numpy and matplotlib


In [None]:
#################################################################
## Run k-Fold validation to assess accuracy of recommendations
#################################################################

# prepare empty lists to save results 
l_results_benchmark_random = []
l_results_benchmark_popularity = []
l_results_sum = []
l_results_avg = []

# For the parameterised # of iterations...
for n_iter in range(1, n_iter_kfold+1):
    print "Processing iteration", n_iter, "of", n_iter_kfold, "..."

    # ...run k-fold xvalidation and collate test statistics
    k_fold = KFold(len(df_spend), n_folds)     # random_state = 1

    for k, (train, test) in enumerate(k_fold):
        print "                            ... fold", k+1, "of", n_folds

        # convert dataframes to numpy arrays (for speed)
        ay_train = np.array(df_spend.iloc[train]) #df_train)
        ay_test = np.array(df_spend.iloc[test])  # df_test)

        # mask one randomly selected merchant from each record in test
        masklist = []
        for i in ay_test:
            indexnonzero = np.flatnonzero(i)
            # select one at random and add to a list of indices
            indexselect = np.random.choice(indexnonzero)
            masklist.append(indexselect)

        # mask those merchant entries with 0's
        ay_test = ay_test.copy()
        ay_test[range(ay_test.shape[0]),masklist]=0  


        ##### Benchmark: random #####
        # generate recommendations
        recmatrix = f_benchmark_random(ay_test)

        ## assess accuracy of benchmark recommendations
        # 1. drop recommendations for merchants they already shop at
        recmatrix[numpy.where(ay_test==1)]=np.nan

        # 2. In what % of cases is the masked merchant in the top 3 recommendations?
        top3recs = np.argsort(-recmatrix)[:,:3]   # select top 3 recs for each record
        n_correct_top3 = sum([masklist[i] in top3recs[i] for i in range(len(masklist))])
        prop_correct_top3 = np.true_divide(n_correct_top3, len(ay_test))
        l_results_benchmark_random.append(prop_correct_top3)

        ##### Benchmark: popularity #####
        # generate recommendations
        recmatrix = f_benchmark_popularity(ay_train, ay_test)

        ## assess accuracy of benchmark recommendations
        # 1. drop recommendations for merchants they already shop at
        recmatrix[numpy.where(ay_test==1)]=np.nan

        # 2. In what % of cases is the masked merchant in the top 3 recommendations?
        top3recs = np.argsort(-recmatrix)[:,:3]   # select top 3 recs for each record
        n_correct_top3 = sum([masklist[i] in top3recs[i] for i in range(len(masklist))])
        prop_correct_top3 = np.true_divide(n_correct_top3, len(ay_test))
        l_results_benchmark_popularity.append(prop_correct_top3)

        ##### COLLABORATIVE FILTER #####
        # generate recommendations for SUM and AVERAGE
        recmatrix_sum, recmatrix_avg = f_recommender(ay_train, ay_test, n_neighbourhood)

        ## assess accuracy of filter recommendations
        # 1. drop recommendations for merchants they already shop at
        recmatrix_sum[numpy.where(ay_test==1)]=np.nan
        recmatrix_avg[numpy.where(ay_test==1)]=np.nan

        ## Assess accuracy of both sets of recommendations
        # 1. for SUM: In what % of cases is the masked merchant in the top 3 recommendations?
        top3recs = np.argsort(-recmatrix_sum)[:,:3]   # select top 3 recs for each record
        n_correct_top3 = sum([masklist[i] in top3recs[i] for i in range(len(masklist))])
        prop_correct_top3 = np.true_divide(n_correct_top3, len(ay_test))
        l_results_sum.append(prop_correct_top3)

        # 2. for AVG: In what % of cases is the masked merchant in the top 3 recommendations?
        top3recs = np.argsort(-recmatrix_avg)[:,:3]   # select top 3 recs for each record
        n_correct_top3 = sum([masklist[i] in top3recs[i] for i in range(len(masklist))])
        prop_correct_top3 = np.true_divide(n_correct_top3, len(ay_test))
        l_results_avg.append(prop_correct_top3)

## Print results    
print "Overall mean results: prop. masked merchants in top 3 recommendations:"
print "Benchmark random: ", mean(l_results_benchmark_random)
print "Benchmark popularity: ", mean(l_results_benchmark_popularity)
print "Collaborative filter (sum method): ", mean(l_results_sum)
print "Collaborative filter (avg method): ", mean(l_results_avg)


Processing iteration 1 of 5 ...
                            ... fold 1 of 5
                            ... fold 2 of 5
                            ... fold 3 of 5
                            ... fold 4 of 5
                            ... fold 5 of 5
Processing iteration 2 of 5 ...
                            ... fold 1 of 5
                            ... fold 2 of 5
                            ... fold 3 of 5
                            ... fold 4 of 5
                            ... fold 5 of 5
Processing iteration 3 of 5 ...
                            ... fold 1 of 5
                            ... fold 2 of 5
                            ... fold 3 of 5
                            ... fold 4 of 5
                            ... fold 5 of 5
Processing iteration

In [18]:

## SPECIAL OVERNIGHT CODE

# BACKUP DF_SPEND
df_spend_master = df_spend.copy()

list_neighbourhoods = [10,20,35,50,100,500,1000,5000,10000]

for neighbourhood_value in list_neighbourhoods:
    n_neighbourhood = neighbourhood_value
    
    print "Processing neigbourhood", neighbourhood_value
    
    # prepare empty lists to save results 
    l_results_benchmark_random = []
    l_results_benchmark_popularity = []
    l_results_sum = []
    l_results_avg = []

    # For the parameterised # of iterations...
    for n_iter in range(1, n_iter_kfold+1):
        print "Processing iteration", n_iter, "of", n_iter_kfold, "..."

        df_spend = df_spend_master.sample(25000)

        # ...run k-fold xvalidation and collate test statistics
        k_fold = KFold(len(df_spend), n_folds)     # random_state = 1

        for k, (train, test) in enumerate(k_fold):
#            print "                            ... fold", k+1, "of", n_folds

            # convert dataframes to numpy arrays (for speed)
            ay_train = np.array(df_spend.iloc[train]) #df_train)
            ay_test = np.array(df_spend.iloc[test])  # df_test)

            # mask one randomly selected merchant from each record in test
            masklist = []
            for i in ay_test:
                indexnonzero = np.flatnonzero(i)
                # select one at random and add to a list of indices
                indexselect = np.random.choice(indexnonzero)
                masklist.append(indexselect)

            # mask those merchant entries with 0's
            ay_test = ay_test.copy()
            ay_test[range(ay_test.shape[0]),masklist]=0  


            ##### Benchmark: random #####
            # generate recommendations
            recmatrix = f_benchmark_random(ay_test)

            ## assess accuracy of benchmark recommendations
            # 1. drop recommendations for merchants they already shop at
            recmatrix[numpy.where(ay_test==1)]=np.nan

            # 2. In what % of cases is the masked merchant in the top 3 recommendations?
            top3recs = np.argsort(-recmatrix)[:,:3]   # select top 3 recs for each record
            n_correct_top3 = sum([masklist[i] in top3recs[i] for i in range(len(masklist))])
            prop_correct_top3 = np.true_divide(n_correct_top3, len(ay_test))
            l_results_benchmark_random.append(prop_correct_top3)

            ##### Benchmark: popularity #####
            # generate recommendations
            recmatrix = f_benchmark_popularity(ay_train, ay_test)

            ## assess accuracy of benchmark recommendations
            # 1. drop recommendations for merchants they already shop at
            recmatrix[numpy.where(ay_test==1)]=np.nan

            # 2. In what % of cases is the masked merchant in the top 3 recommendations?
            top3recs = np.argsort(-recmatrix)[:,:3]   # select top 3 recs for each record
            n_correct_top3 = sum([masklist[i] in top3recs[i] for i in range(len(masklist))])
            prop_correct_top3 = np.true_divide(n_correct_top3, len(ay_test))
            l_results_benchmark_popularity.append(prop_correct_top3)

            ##### COLLABORATIVE FILTER #####
            # generate recommendations for SUM and AVERAGE
            recmatrix_sum, recmatrix_avg = f_recommender(ay_train, ay_test, n_neighbourhood)

            ## assess accuracy of filter recommendations
            # 1. drop recommendations for merchants they already shop at
            recmatrix_sum[numpy.where(ay_test==1)]=np.nan
            recmatrix_avg[numpy.where(ay_test==1)]=np.nan

            ## Assess accuracy of both sets of recommendations
            # 1. for SUM: In what % of cases is the masked merchant in the top 3 recommendations?
            top3recs = np.argsort(-recmatrix_sum)[:,:3]   # select top 3 recs for each record
            n_correct_top3 = sum([masklist[i] in top3recs[i] for i in range(len(masklist))])
            prop_correct_top3 = np.true_divide(n_correct_top3, len(ay_test))
            l_results_sum.append(prop_correct_top3)

            # 2. for AVG: In what % of cases is the masked merchant in the top 3 recommendations?
            top3recs = np.argsort(-recmatrix_avg)[:,:3]   # select top 3 recs for each record
            n_correct_top3 = sum([masklist[i] in top3recs[i] for i in range(len(masklist))])
            prop_correct_top3 = np.true_divide(n_correct_top3, len(ay_test))
            l_results_avg.append(prop_correct_top3)

    ## Print results    
    print "Overall mean results: prop. masked merchants in top 3 recommendations:"
    print "Benchmark random: ", mean(l_results_benchmark_random)
    print "Benchmark popularity: ", mean(l_results_benchmark_popularity)
    print "Collaborative filter (sum method): ", mean(l_results_sum)
    print "Collaborative filter (avg method): ", mean(l_results_avg)

    text_file = open("Output.txt", "a")

    text_file.write("neighourhood: ")
    text_file.write(str(n_neighbourhood))
    text_file.write("\n")

    text_file.write("random: ")
    text_file.write(str(mean(l_results_benchmark_random)))
    text_file.write("  -- SD:  ")
    text_file.write(str(std(l_results_benchmark_random)))
    text_file.write("\n")

    text_file.write("popularity: ")
    text_file.write(str(mean(l_results_benchmark_popularity)))
    text_file.write("  -- SD:  ")
    text_file.write(str(std(l_results_benchmark_popularity)))
    text_file.write("\n")

    text_file.write("Collaborative filter (sum method): ")
    text_file.write(str(mean(l_results_sum)))
    text_file.write("  -- SD:  ")
    text_file.write(str(std(l_results_sum)))
    text_file.write("\n")

    text_file.write("Collaborative filter (avg method): ")
    text_file.write(str(mean(l_results_avg)))
    text_file.write("  -- SD:  ")
    text_file.write(str(std(l_results_avg)))
    text_file.write("\n")
    text_file.write("\n")
    text_file.write("\n")

    text_file.close()


Processing neigbourhood 10
Processing iteration 1 of 5 ...
Processing iteration 2 of 5 ...
Processing iteration 3 of 5 ...
Processing iteration 4 of 5 ...
Processing iteration 5 of 5 ...
Overall mean results: prop. masked merchants in top 3 recommendations:
Benchmark random:  0.024056
Benchmark popularity:  0.240616
Collaborative filter (sum method):  0.259168
Collaborative filter (avg method):  0.099784
Processing neigbourhood 20
Processing iteration 1 of 5 ...
Processing iteration 2 of 5 ...
Processing iteration 3 of 5 ...
Processing iteration 4 of 5 ...
Processing iteration 5 of 5 ...
Overall mean results: prop. masked merchants in top 3 recommendations:
Benchmark random:  0.02396
Benchmark popularity:  0.237632
Collaborative filter (sum method):  0.292624
Collaborative filter (avg method):  0.067448
Processing neigbourhood 35
Processing iteration 1 of 5 ...
Processing iteration 2 of 5 ...
Processing iteration 3 of 5 ...
Processing iteration 4 of 5 ...
Processing iteration 5 of 5 ..

In [None]:

## SPECIAL OVERNIGHT CODE

# BACKUP DF_SPEND

list_neighbourhoods = [20000]

for neighbourhood_value in list_neighbourhoods:
    n_neighbourhood = neighbourhood_value
    
    print "Processing neigbourhood", neighbourhood_value
    
    # prepare empty lists to save results 
    l_results_benchmark_random = []
    l_results_benchmark_popularity = []
    l_results_sum = []
    l_results_avg = []

    # For the parameterised # of iterations...
    for n_iter in range(1, n_iter_kfold+1):
        print "Processing iteration", n_iter, "of", n_iter_kfold, "..."

        df_spend = df_spend_master.sample(40000)

        # ...run k-fold xvalidation and collate test statistics
        k_fold = KFold(len(df_spend), n_folds)     # random_state = 1

        for k, (train, test) in enumerate(k_fold):
#            print "                            ... fold", k+1, "of", n_folds

            # convert dataframes to numpy arrays (for speed)
            ay_train = np.array(df_spend.iloc[train]) #df_train)
            ay_test = np.array(df_spend.iloc[test])  # df_test)

            # mask one randomly selected merchant from each record in test
            masklist = []
            for i in ay_test:
                indexnonzero = np.flatnonzero(i)
                # select one at random and add to a list of indices
                indexselect = np.random.choice(indexnonzero)
                masklist.append(indexselect)

            # mask those merchant entries with 0's
            ay_test = ay_test.copy()
            ay_test[range(ay_test.shape[0]),masklist]=0  


            ##### Benchmark: random #####
            # generate recommendations
            recmatrix = f_benchmark_random(ay_test)

            ## assess accuracy of benchmark recommendations
            # 1. drop recommendations for merchants they already shop at
            recmatrix[numpy.where(ay_test==1)]=np.nan

            # 2. In what % of cases is the masked merchant in the top 3 recommendations?
            top3recs = np.argsort(-recmatrix)[:,:3]   # select top 3 recs for each record
            n_correct_top3 = sum([masklist[i] in top3recs[i] for i in range(len(masklist))])
            prop_correct_top3 = np.true_divide(n_correct_top3, len(ay_test))
            l_results_benchmark_random.append(prop_correct_top3)

            ##### Benchmark: popularity #####
            # generate recommendations
            recmatrix = f_benchmark_popularity(ay_train, ay_test)

            ## assess accuracy of benchmark recommendations
            # 1. drop recommendations for merchants they already shop at
            recmatrix[numpy.where(ay_test==1)]=np.nan

            # 2. In what % of cases is the masked merchant in the top 3 recommendations?
            top3recs = np.argsort(-recmatrix)[:,:3]   # select top 3 recs for each record
            n_correct_top3 = sum([masklist[i] in top3recs[i] for i in range(len(masklist))])
            prop_correct_top3 = np.true_divide(n_correct_top3, len(ay_test))
            l_results_benchmark_popularity.append(prop_correct_top3)

            ##### COLLABORATIVE FILTER #####
            # generate recommendations for SUM and AVERAGE
            recmatrix_sum, recmatrix_avg = f_recommender(ay_train, ay_test, n_neighbourhood)

            ## assess accuracy of filter recommendations
            # 1. drop recommendations for merchants they already shop at
            recmatrix_sum[numpy.where(ay_test==1)]=np.nan
            recmatrix_avg[numpy.where(ay_test==1)]=np.nan

            ## Assess accuracy of both sets of recommendations
            # 1. for SUM: In what % of cases is the masked merchant in the top 3 recommendations?
            top3recs = np.argsort(-recmatrix_sum)[:,:3]   # select top 3 recs for each record
            n_correct_top3 = sum([masklist[i] in top3recs[i] for i in range(len(masklist))])
            prop_correct_top3 = np.true_divide(n_correct_top3, len(ay_test))
            l_results_sum.append(prop_correct_top3)

            # 2. for AVG: In what % of cases is the masked merchant in the top 3 recommendations?
            top3recs = np.argsort(-recmatrix_avg)[:,:3]   # select top 3 recs for each record
            n_correct_top3 = sum([masklist[i] in top3recs[i] for i in range(len(masklist))])
            prop_correct_top3 = np.true_divide(n_correct_top3, len(ay_test))
            l_results_avg.append(prop_correct_top3)

    ## Print results    
    print "Overall mean results: prop. masked merchants in top 3 recommendations:"
    print "Benchmark random: ", mean(l_results_benchmark_random)
    print "Benchmark popularity: ", mean(l_results_benchmark_popularity)
    print "Collaborative filter (sum method): ", mean(l_results_sum)
    print "Collaborative filter (avg method): ", mean(l_results_avg)

    text_file = open("Output.txt", "a")

    text_file.write("neighourhood: ")
    text_file.write(str(n_neighbourhood))
    text_file.write("\n")

    text_file.write("random: ")
    text_file.write(str(mean(l_results_benchmark_random)))
    text_file.write("  -- SD:  ")
    text_file.write(str(std(l_results_benchmark_random)))
    text_file.write("\n")

    text_file.write("popularity: ")
    text_file.write(str(mean(l_results_benchmark_popularity)))
    text_file.write("  -- SD:  ")
    text_file.write(str(std(l_results_benchmark_popularity)))
    text_file.write("\n")

    text_file.write("Collaborative filter (sum method): ")
    text_file.write(str(mean(l_results_sum)))
    text_file.write("  -- SD:  ")
    text_file.write(str(std(l_results_sum)))
    text_file.write("\n")

    text_file.write("Collaborative filter (avg method): ")
    text_file.write(str(mean(l_results_avg)))
    text_file.write("  -- SD:  ")
    text_file.write(str(std(l_results_avg)))
    text_file.write("\n")
    text_file.write("\n")
    text_file.write("\n")

    text_file.close()


In [None]:

# set number of iterations of k-Fold to perform
n_iter_kfold = 2

list_neighbourhoods = [50000]

for neighbourhood_value in list_neighbourhoods:
    n_neighbourhood = neighbourhood_value
    
    print "Processing neigbourhood", neighbourhood_value
    
    # prepare empty lists to save results 
    l_results_benchmark_random = []
    l_results_benchmark_popularity = []
    l_results_sum = []
    l_results_avg = []

    # For the parameterised # of iterations...
    for n_iter in range(1, n_iter_kfold+1):
        print "Processing iteration", n_iter, "of", n_iter_kfold, "..."

        df_spend = df_spend_master.sample(80000)

        # ...run k-fold xvalidation and collate test statistics
        k_fold = KFold(len(df_spend), n_folds)     # random_state = 1

        for k, (train, test) in enumerate(k_fold):
#            print "                            ... fold", k+1, "of", n_folds

            # convert dataframes to numpy arrays (for speed)
            ay_train = np.array(df_spend.iloc[train]) #df_train)
            ay_test = np.array(df_spend.iloc[test])  # df_test)

            # mask one randomly selected merchant from each record in test
            masklist = []
            for i in ay_test:
                indexnonzero = np.flatnonzero(i)
                # select one at random and add to a list of indices
                indexselect = np.random.choice(indexnonzero)
                masklist.append(indexselect)

            # mask those merchant entries with 0's
            ay_test = ay_test.copy()
            ay_test[range(ay_test.shape[0]),masklist]=0  


            ##### Benchmark: random #####
            # generate recommendations
            recmatrix = f_benchmark_random(ay_test)

            ## assess accuracy of benchmark recommendations
            # 1. drop recommendations for merchants they already shop at
            recmatrix[numpy.where(ay_test==1)]=np.nan

            # 2. In what % of cases is the masked merchant in the top 3 recommendations?
            top3recs = np.argsort(-recmatrix)[:,:3]   # select top 3 recs for each record
            n_correct_top3 = sum([masklist[i] in top3recs[i] for i in range(len(masklist))])
            prop_correct_top3 = np.true_divide(n_correct_top3, len(ay_test))
            l_results_benchmark_random.append(prop_correct_top3)

            ##### Benchmark: popularity #####
            # generate recommendations
            recmatrix = f_benchmark_popularity(ay_train, ay_test)

            ## assess accuracy of benchmark recommendations
            # 1. drop recommendations for merchants they already shop at
            recmatrix[numpy.where(ay_test==1)]=np.nan

            # 2. In what % of cases is the masked merchant in the top 3 recommendations?
            top3recs = np.argsort(-recmatrix)[:,:3]   # select top 3 recs for each record
            n_correct_top3 = sum([masklist[i] in top3recs[i] for i in range(len(masklist))])
            prop_correct_top3 = np.true_divide(n_correct_top3, len(ay_test))
            l_results_benchmark_popularity.append(prop_correct_top3)

            ##### COLLABORATIVE FILTER #####
            # generate recommendations for SUM and AVERAGE
            recmatrix_sum, recmatrix_avg = f_recommender(ay_train, ay_test, n_neighbourhood)

            ## assess accuracy of filter recommendations
            # 1. drop recommendations for merchants they already shop at
            recmatrix_sum[numpy.where(ay_test==1)]=np.nan
            recmatrix_avg[numpy.where(ay_test==1)]=np.nan

            ## Assess accuracy of both sets of recommendations
            # 1. for SUM: In what % of cases is the masked merchant in the top 3 recommendations?
            top3recs = np.argsort(-recmatrix_sum)[:,:3]   # select top 3 recs for each record
            n_correct_top3 = sum([masklist[i] in top3recs[i] for i in range(len(masklist))])
            prop_correct_top3 = np.true_divide(n_correct_top3, len(ay_test))
            l_results_sum.append(prop_correct_top3)

            # 2. for AVG: In what % of cases is the masked merchant in the top 3 recommendations?
            top3recs = np.argsort(-recmatrix_avg)[:,:3]   # select top 3 recs for each record
            n_correct_top3 = sum([masklist[i] in top3recs[i] for i in range(len(masklist))])
            prop_correct_top3 = np.true_divide(n_correct_top3, len(ay_test))
            l_results_avg.append(prop_correct_top3)

    ## Print results    
    print "Overall mean results: prop. masked merchants in top 3 recommendations:"
    print "Benchmark random: ", mean(l_results_benchmark_random)
    print "Benchmark popularity: ", mean(l_results_benchmark_popularity)
    print "Collaborative filter (sum method): ", mean(l_results_sum)
    print "Collaborative filter (avg method): ", mean(l_results_avg)

    text_file = open("Output.txt", "a")

    text_file.write("neighourhood: ")
    text_file.write(str(n_neighbourhood))
    text_file.write("\n")

    text_file.write("random: ")
    text_file.write(str(mean(l_results_benchmark_random)))
    text_file.write("  -- SD:  ")
    text_file.write(str(std(l_results_benchmark_random)))
    text_file.write("\n")

    text_file.write("popularity: ")
    text_file.write(str(mean(l_results_benchmark_popularity)))
    text_file.write("  -- SD:  ")
    text_file.write(str(std(l_results_benchmark_popularity)))
    text_file.write("\n")

    text_file.write("Collaborative filter (sum method): ")
    text_file.write(str(mean(l_results_sum)))
    text_file.write("  -- SD:  ")
    text_file.write(str(std(l_results_sum)))
    text_file.write("\n")

    text_file.write("Collaborative filter (avg method): ")
    text_file.write(str(mean(l_results_avg)))
    text_file.write("  -- SD:  ")
    text_file.write(str(std(l_results_avg)))
    text_file.write("\n")
    text_file.write("\n")
    text_file.write("\n")

    text_file.close()
