    Since purchase prediction can either be only one or zero, I've treated this as a classification problem. Simplest way was to populate train.json as bought and some other random pairs as not bought. For each such user and item, I took percentile value using logic similar to baseline code by enumerating most popular elements. I used regression/classification on these vectors formed by these [bought or not] v/s [user and item rank] and used predict() method to get which class testing data will belong to.
    
    Here I found SGD classifier to be sufficiently fast and enough to get a top 10 entry in Kaggle. I have used 500,000 more random pairs as not bought along with 1,000,000 bought training entires.
    
    REF http://scikit-learn.org/stable/modules/sgd.html

In [1]:
import numpy as np
from collections import defaultdict

import random
import math
import time
import warnings

from sklearn import linear_model
from __future__ import division
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import hamming_loss



In [2]:
#from baseline

def readJson(f):
  for l in open(f):
    yield eval(l)


In [3]:
#read training data - accessed multiple times

train = []

for l in readJson("train.json"):
    train.append(l)


In [31]:

#parse lines from training file into dictionaries
training = []
start = time.time()

userset = set([])
itemset = set([])
trainset = set([])

for l in train:
    """
    All bought items
    """
    training.append((l['reviewerID']+"-"+l['asin'], 1))
    
    userset.add(l['reviewerID'])
    itemset.add(l['asin'])
    trainset.add((l['reviewerID'],l['asin']))
    

userlist = list(userset)
itemlist = list(itemset)
"""
Get more unbought data
"""
resize = 1.5*len(training)
while len(training) < resize:
    u = userlist[random.randrange(0, len(userlist))]
    i = itemlist[random.randrange(0, len(itemlist))]
    
    if (u,i) not in trainset:
        trainset.add((u,i))
        training.append((u+"-"+i, 0))

print "Done creating", time.time() - start

Done creating 5.44251012802


    Populate users and items in reverse sorted order which can be later used to calculate percentiles for both and then append to X of regression model.

In [32]:
#same as baseline - popularity to get %le

itemCount = defaultdict(int)
userCount = defaultdict(int)
print training[0]
for (ui,bought) in training:
    userid,itemid = ui.strip().split('-')
    
    if itemid not in itemCount:
        itemCount[itemid]=0
    if userid not in userCount:
        userCount[userid]=0  
    
    if bought==1:
        userCount[userid] += 1
        itemCount[itemid] += 1


mostPopularUsers=[]
mostPopularUsers = [(userCount[x], x) for x in userCount]
mostPopularUsers.sort()
mostPopularUsers.reverse()
    
mostPopularItems=[]
mostPopularItems = [(itemCount[x], x) for x in itemCount]
mostPopularItems.sort()
mostPopularItems.reverse()

print "Done", len(mostPopularItems)
print len(mostPopularUsers), len(mostPopularItems)

('bc19970fff3383b2fe947cf9a3a5d7b13b6e57ef2cd53abc52bb2dfedf5fb1cd-a6ed402934e3c1138111dce09256538afb04c566edf37c16b9ba099d23afb764', 1)
Done 171185
509678 171185


In [33]:
#rankings wrt %le - use count and increment percentile by 0.05

ItemPerc = defaultdict(list)
UserPerc = defaultdict(list)

percentile = 0.05
for i in range(0,len(mostPopularItems)):
    (count, item)= mostPopularItems[i]
    if i>len(mostPopularItems)*percentile:
        percentile+=.05
    ItemPerc[item]=percentile
print percentile

percentile = 0.05
for i in range(0,len(mostPopularUsers)):
    (count, user)= mostPopularUsers[i]
    if i>len(mostPopularUsers)*percentile:
        percentile+=.05
    UserPerc[user]=percentile
    
print percentile
print "Done itemrank and userrank list", len(ItemPerc), len(UserPerc)

1.0
1.0
Done itemrank and userrank list 171185 509678


In [34]:
#logical reg vectors create X and y

X=[]
y=[]

print training[0]
for (ui, bought) in training:
    u,i = ui.strip().split('-')
    elem=[ItemPerc[i],UserPerc[u]]
    
    y.append(bought)
    X.append(elem)


('bc19970fff3383b2fe947cf9a3a5d7b13b6e57ef2cd53abc52bb2dfedf5fb1cd-a6ed402934e3c1138111dce09256538afb04c566edf37c16b9ba099d23afb764', 1)


In [11]:
#logical reg
logreg = linear_model.LogisticRegression()
logreg.fit(X, y) #UNCOMMENT ME
print logreg.coef_

[[-4.4219724  -1.95474662]]


    Use Huber loss with epsilon=0.5 gave best results

In [35]:
#USE ME

from sklearn import linear_model

sgd = linear_model.SGDClassifier(alpha=0.001, n_iter=5, epsilon=0.6, loss="huber")
sgd.fit(X, y)

SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.6,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='huber', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

Predict: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html#sklearn.linear_model.SGDRegressor.predict



In [36]:
#predict using regression 
warnings.filterwarnings("ignore", category=DeprecationWarning) 

predfile = open("CLFpredictions_Purchase.txt", 'w')
predfile.write("reviewerID-asin,prediction\n")

for l in open("pairs_Purchase.txt"):
    if l.startswith("reviewer"):
        continue
        
    u,i = l.strip().split('-')
    uP = UserPerc[u]
    iP = ItemPerc[i]
    
    if uP == []:
        uP=1
    if iP == []:
        iP = 1
                
    warnings.filterwarnings("ignore", category=DeprecationWarning) 

    if (sgd.predict([uP, iP])==[0]):
        predfile.write(u + '-' + i + ",0\n")
    else:
        predfile.write(u + '-' + i + ",1\n")

predfile.close()

In [None]:
#takes a lot of time
from sklearn.svm import SVC
clf = SVC()
#clf.fit(X, y)  #UNCOMMENT ME