#Preprocessing

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("DataScienceChallenge_Data/DataScienceChallenge_Training.csv", sep = ",")

In [3]:
data.head()

Unnamed: 0,Cust_map,Merch_Map_final,TXN_MTH,SPND_CATGY,NumTrans
0,1,1,201309,HOUSEHOLD & UTILITIES,2
1,1,1,201310,HOUSEHOLD & UTILITIES,2
2,1,1,201311,HOUSEHOLD & UTILITIES,3
3,1,1,201312,HOUSEHOLD & UTILITIES,2
4,1,1,201401,HOUSEHOLD & UTILITIES,2


##Looking at one customer

In [3]:
data_1 = data[data['Cust_map']==1]
data_1.head()

Unnamed: 0,Cust_map,Merch_Map_final,TXN_MTH,SPND_CATGY,NumTrans
0,1,1,201309,HOUSEHOLD & UTILITIES,2
1,1,1,201310,HOUSEHOLD & UTILITIES,2
2,1,1,201311,HOUSEHOLD & UTILITIES,3
3,1,1,201312,HOUSEHOLD & UTILITIES,2
4,1,1,201401,HOUSEHOLD & UTILITIES,2


Only 12 months are included for each customer and the task is to predict the top 10 merchants a customer hasn't seen in the past 12 months that he probably will see.
For now, I'm combining the transactions across all 12 months because I think it won't make that much of a difference. If we change this, then we'll need to have 12 columns for each merchant in the dataset when we pivot. May increase computation by a lot.

In [4]:
data_1['TXN_MTH'].unique()

array([201309, 201310, 201311, 201312, 201401, 201402, 201403, 201404,
       201405, 201406, 201407, 201408])

groupby groups all the unique values in a column. sum() gives the sum of all values in other columns for each of the unique groups

In [5]:
data_1_groups = data_1.groupby('Merch_Map_final')
print data_1_groups.sum().head()

                 Cust_map  TXN_MTH  NumTrans
Merch_Map_final                             
1                      12  2416478        29
2                      11  2215070        85
3                      12  2416478        33
5                      10  2013759        42
9                      12  2416478       138


We sum all the transactions in each group (i.e. for each merchant) to get the number of transactions for each merchant across all 12 months for that customer

In [6]:
nTransactions= list(data_1_groups.sum()['NumTrans'])
print nTransactions

[29, 85, 33, 42, 138, 5, 8, 6, 7, 154, 5, 10, 9, 54, 8, 3, 5, 16, 5, 2, 3, 2, 22, 6, 3, 9, 10, 3, 3, 3, 11, 12, 3, 3, 2, 2, 3, 2, 3, 5, 2, 2, 3, 6, 3, 5, 5, 22, 2, 2, 3, 3, 14, 2, 2, 2, 3, 5, 7, 2, 15, 3, 3, 3, 2, 14, 3, 2, 2, 8, 18, 2, 5, 2, 3, 2, 34, 13, 3, 4, 4, 2, 5, 3, 3, 7, 2, 2, 3, 2, 2, 6, 2, 2, 2, 5, 8, 11, 2, 2, 2, 2, 3, 2, 3, 3, 2]


We get a list of the merchants the customer's seen

In [7]:
merchants = data_1['Merch_Map_final'].unique()
merchants

array([   1,    2,    3,    5,    9,   11,   13,   14,   17,   26,   27,
         28,   30,   31,   32,   33,   43,   47,   69,   79,  100,  101,
        103,  105,  110,  114,  116,  125,  131,  136,  145,  151,  161,
        181,  210,  225,  260,  271,  289,  448,  473,  496,  518,  531,
        586,  592,  649,  655,  660,  764,  774,  779,  783,  798,  833,
        934,  998, 1078, 1101, 1217, 1285, 1286, 1336, 1389, 1470, 1493,
       1527, 1553, 1625, 1778, 1800, 1808, 1942, 2118, 2188, 2207, 2321,
       2412, 2449, 2503, 2752, 2761, 2926, 2965, 3109, 3146, 3296, 3500,
       3620, 3790, 4076, 4688, 4727, 4871, 5230, 5399, 6010, 6023, 6265,
       6416, 7191, 7224, 7924, 8294, 8584, 8634, 8676])

In [54]:
print len(merchants)
print len(nTransactions)

107
107


In [8]:
customer = 1

Then, new dataframe

In [9]:
new_data = pd.DataFrame(columns=['Customer', 'Merchants', 'nTransactions'])

In [35]:
new_data['Merchants'] = merchants
new_data['nTransactions'] = nTransactions
new_data['Customer'] = customer
new_data

Unnamed: 0,Customer,Merchants,nTransactions
0,1,1,29
1,1,2,85
2,1,3,33
3,1,5,42
4,1,9,138
5,1,11,5
6,1,13,8
7,1,14,6
8,1,17,7
9,1,26,154


So, yeah, we'd need to do this for all customers.

In [24]:
customers = data['Cust_map'].astype(int).unique()
print len(customers)
customers = customers[:1000]
len(customers)

374328


1000

In [13]:
import sys
from multiprocessing.dummy import Pool as ThreadPool
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

def mapfunc(customer):
    data_customer = data[data['Cust_map']==customer]
    df_customer = pd.DataFrame(columns = ['Customer', 'Merchants', 'nTransactions'])
    df_customer['Merchants'] = data_customer['Merch_Map_final'].unique()
    df_customer['nTransactions'] = list(data_customer.groupby('Merch_Map_final').sum()['NumTrans'])
    df_customer['Customer'] = customer
    print customer
    sys.stdout.flush()
    return df_customer
'''
if __name__ == '__main__':
    jobs = []
    for i in range(5):
        p = multiprocessing.Process(target=worker, args=(i,))
        jobs.append(p)
        p.start()
per_customer = []'''

def reducefunc(per_customer):
    new_data = pd.concat(per_customer)

#per_customer = map(mapfunc, customers)
#with ThreadPoolExecutor(max_workers = 10) as pool:
#    futures = pool.map(mapfunc, customers)
pool = ThreadPool(10) 
per_customer = pool.map(mapfunc, customers)

126
226
1
26
51
76
151
176
101
201
227
127
2
77
27
52
228
177
152
102
202
128
3
28
78
53
103
4
229
153
129
203
178
29
54
79
104
230
5
154
179
204
130
30
55
80
105
231
6
155
205
180
31
81
56
131
106
232
7
156
206
32
107
181
57
132
82
233
8
207
157
58
108
33
83
133
182
234
208
158
9
59
109
34
84
183
134
235
209
159
10
60
110
35
85
210
236
11
111
135
184
36
160
61
86
211
237
136
12
112
62
238
37
212161185


87
137
13
38
113
213
88
63
162
239
186
138
14
39
114
163
214
187
89
240
64
139
15
115
40
164
215
90
188
241
65
140
16
165
116
41
216
189
91
141
117
242
166
17
42
66
217
190
92
142
118
243
43
167
218
67
18
93
143
191
119
219
44
244
68
168
19
94
192
144
220
120
45
245
69
169
95
20
193
145
121
221
46
246
70
96
170
194
21
146
122
47
247
222
97
71
171
22
195
147
123
48
248
98
172
223
72
124
23
196
148
49
224
173
249
24
99
73
125
197
50
149
225
174
250
251
25
100
74
276
301
150
198
326
175
252
376
351
75
302
277
199
401
377
253
327
426
451
352
303
200
378
278
254
402
427
328
452
353
304
379


In [16]:
new_data = pd.concat(per_customer)
new_data.sort('Customer')

Unnamed: 0,Customer,Merchants,nTransactions
0,1,1,29
79,1,2503,4
78,1,2449,3
77,1,2412,13
76,1,2321,34
75,1,2207,2
74,1,2188,3
73,1,2118,2
72,1,1942,5
71,1,1808,2


Making the user-item matrix to calculate distances from

In [18]:
customer_merchant_matrix = new_data.pivot(index='Customer', columns='Merchants', values = 'nTransactions')

In [20]:
customer_merchant_matrix.fillna(-1)

Merchants,1,2,3,4,5,6,7,8,9,10,...,9788,9791,9796,9803,9808,9809,9813,9816,9817,9822
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,29,85,33,-1,42,-1,-1,-1,138,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,-1,-1,-1,-1,-1,31,-1,-1,3,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,27,-1,31,-1,-1,-1,-1,-1,20,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
5,11,-1,-1,5,-1,27,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
6,-1,-1,-1,-1,-1,-1,-1,65,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
7,53,28,-1,-1,79,-1,-1,-1,-1,3,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
8,-1,-1,-1,5,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
9,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
10,33,5,-1,5,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [21]:
from annoy import AnnoyIndex

In [93]:
max_trees = 3000
num_neighbors = 20

In [27]:
merchantIDs = customer_merchant_matrix.columns
print merchantIDs[3]

4


In [94]:
def find_nearest(matrix, customerIDs, merchantIDs):
    num_merchants = matrix.shape[1]
    print "Number of merchants " + str(num_merchants)
    num_customers = matrix.shape[0]
    print "Number of customers " + str(num_customers)
    ann = AnnoyIndex(num_merchants)
    for customer in customers:
        customer_vector = list(matrix.loc[[customer]])
        #print len(customer_vector)
        sys.stdout.flush()
        ann.add_item(customer, customer_vector)
        if customer%200 == 0:
            print 'Adding '+ str(customer)
    print "Building"
    #Accurate till max_trees number of items, approximates after that
    if num_merchants > max_trees:
        ann.build(max_trees)
    else:
        ann.build(num_merchants)
    print "...done"
    nearest = dict()
    for customer in customers:
        customerID = customer
        neighbors = ann.get_nns_by_item(customer, num_neighbors+1)
        if customer%200 == 0:
            print "Found neighbors for " + str(customer)   
        nearest[customerID] = []
        for i in neighbors:
                if i != customer:
                    nearest[customerID].append((i, ann.get_distance(i, customer)))
    return nearest

In [95]:
nearest = find_nearest(customer_merchant_matrix, customers, merchantIDs)
nearest

Number of merchants 6052
Number of customers 1000
Adding 200
Adding 400
Adding 600
Adding 800
Adding 1000
Building
...done
Found neighbors for 200
Found neighbors for 400
Found neighbors for 600
Found neighbors for 800
Found neighbors for 1000


{1: [(2, 1.2347651789923475e-08),
  (3, 1.6187991391802825e-08),
  (4, 1.6187991391802825e-08),
  (5, 1.2347651789923475e-08),
  (6, 1.2347651789923475e-08),
  (7, 1.2347651789923475e-08),
  (8, 1.2347651789923475e-08),
  (9, 1.2347651789923475e-08),
  (10, 1.2347651789923475e-08),
  (11, 1.6187991391802825e-08),
  (12, 1.6187991391802825e-08),
  (13, 1.2347651789923475e-08),
  (14, 1.2347651789923475e-08),
  (15, 1.2347651789923475e-08),
  (16, 1.2347651789923475e-08),
  (17, 1.2347651789923475e-08),
  (18, 1.2347651789923475e-08),
  (19, 1.6187991391802825e-08),
  (20, 1.6187991391802825e-08),
  (21, 1.2347651789923475e-08)],
 2: [(1, 1.2347651789923475e-08),
  (3, 1.6187991391802825e-08),
  (4, 1.6187991391802825e-08),
  (5, 1.2347651789923475e-08),
  (6, 1.2347651789923475e-08),
  (7, 1.2347651789923475e-08),
  (8, 1.2347651789923475e-08),
  (9, 1.2347651789923475e-08),
  (10, 1.2347651789923475e-08),
  (11, 1.6187991391802825e-08),
  (12, 1.6187991391802825e-08),
  (13, 1.23476517

In [77]:
customer_vector = customer_merchant_matrix.loc[1]
customer_vector[1].dtype

dtype('float64')

In [213]:
def recommend(matrix, customer, customerIDs, merchantIDs, nearest):
    recommendations = {}
    num_merchants = len(merchantIDs)
    #c1 = find_index(userIDs, person)
    #customer_vector = matrix.loc[[customer]]
    customer_vector = matrix.ix[customer]
    bool_customer_vector = pd.DataFrame(customer_vector)
    not_transacted = bool_customer_vector[bool_customer_vector[customer]==-1].index
    #bool_customer_vector = pd.DataFrame(customer_vector)
    #not_transacted = bool_customer_vector[bool_customer_vector[1]!=-1].index
    # determine the total distance
    totalDistance = 0.0
    for i in nearest[customer]:
        totalDistance += i[1]
    # Iterate through the k nearest neighbors
    # accumulating their ratings
    num=0
    for i in nearest[customer]:
        weight = i[1] / totalDistance
        neighbor = i[0]
        neighbor_vector = matrix.ix[neighbor]
        # now find items neighbor rated that customer didn't
        neighbor_table = pd.DataFrame(neighbor_vector)
        transacted = neighbor_table[neighbor_table[neighbor]>0].index
        for merchant in not_transacted:
            if (merchant in transacted):
                if merchant not in recommendations:
                    recommendations[merchant] = (neighbor_vector[merchant]
                                      * weight)
                else:
                    recommendations[merchant] = (recommendations[merchant]
                                      + neighbor_vector[merchant]
                                      * weight)
    #recommendations = list(recommendations.items())
    #Returns in decreasing order of score
    #recommendations.sort(key=lambda artistTuple: artistTuple[1],reverse = True)
    return recommendations

In [222]:

import operator
recos = recommend(customer_merchant_matrix, 1, customers, merchantIDs, nearest)
merch = [x[0] for x in sorted(recos.items(), key=operator.itemgetter(1), reverse=True)[:10]]
merch

0


[6, 4, 40, 16, 8, 34, 24, 64, 58, 15]

In [122]:
recos_1

{}