In [4]:
import gzip
import random
import matplotlib.pyplot as plt
import string
import numpy
from sklearn import svm
from sklearn.model_selection import train_test_split
from collections import defaultdict

### Purchase

#### Assumptions
Users will buy items from the same category of items they already bought

Users will buy items with average rating bigger than the average of items in the category

Users will buy popular items in the category they already bought

Users will buy items that have smaller price than the average of the category


#### Splitting data

In [5]:
def readGz(f):
  for l in gzip.open(f):
    yield eval(l)

In [6]:
trainTotal = []
for l in readGz("train.json.gz"):
    trainTotal.append(l)

In [7]:
train = trainTotal[:100000]

In [36]:
train = trainTotal

#### Getting Users and Items features

In [8]:
items = defaultdict(list)
usersItems = defaultdict(list)
for v in train:
    user = v['reviewerID']
    itemID = v['itemID']
    usersItems[user].append(itemID)
    if(items[itemID] == []):
        if('price' in v):
            items[itemID] = [1, v['rating'], v['categoryID'], v['price']]
        else:
            items[itemID] = [1, v['rating'], v['categoryID'], -1]
    else:
        if(items[itemID][3] == -1 and 'price' in v):
            items[itemID][3] = v['price']
        items[itemID][0] += 1
        items[itemID][1] = (items[itemID][1]*(items[itemID][0]-1) + v['rating'] )/items[itemID][0]

In [9]:
usersFeatures = defaultdict(list)
for user in usersItems:
    avgPrice = []
    avgRating = []
    #getting number of items bought per category
    uCats = [0,0,0,0,0]
    for i in [items[x] for x in usersItems[user]]:
        itemRating = i[1]
        itemCat = i[2]
        itemPrice = i[3]
        
        avgRating.append(itemRating)
        uCats[itemCat] += 1
        if itemPrice != -1:
            avgPrice.append(itemPrice)
            
    avgRating = sum(avgRating)/len(avgRating)
    if(avgPrice != []):
        avgPrice = sum(avgPrice)/len(avgPrice)
    else:
        avgPrice = -1
    
    usersFeatures[user] = [len(usersItems[user]), avgRating, uCats, avgPrice]

In [10]:
# overall data
avgRating = sum([v['rating'] for v in train])/len(train)
avgPrice = sum([v['price'] for v in train if 'price' in v])/len([v['price'] for v in train if 'price' in v])

In [11]:
avgPurchases = 0
for i in items:
    avgPurchases += items[i][0]
avgPurchases /= len(items)

In [12]:
mostPopular = []
for item in items:
    itemPurchases = items[item][0]
    mostPopular.append([item, itemPurchases])
mostPopular = sorted(mostPopular, key=lambda x:-x[1])

In [13]:
catFeats = []
for i in range(0,5):
    feats = []
    for v in [x for x in train if x['categoryID'] == i]:
        price = v['price'] if 'price' in v else -1
        feats.append([v['rating'], price, v['itemID']])
    itemsInCat = list(set([v[2] for v in feats]))
    avgPrice = sum([v[1] for v in feats])/len(feats)
    avgRating = sum([v[0] for v in feats])/len(feats)
    purchases = len(feats)
    catFeats.append([purchases, avgRating, avgPrice, len(itemsInCat)])

In [14]:
multCatUsers = []
for v in usersFeatures:
    if(v != 0):
        countNon0 = 0
        for i in range(0,5):
            if usersFeatures[v][2][i] != 0:
                countNon0 += 1
        if countNon0 > 1:
            multCatUsers.append(v)

In [15]:
avgUserPurchases = sum([usersFeatures[u][0] for u in usersFeatures])/len([v[0] for v in usersFeatures])

In [16]:
max([usersFeatures[u][0] for u in usersFeatures])
purchases = [0]*108
for v in [usersFeatures[u][0] for u in usersFeatures]:
    purchases[v-1] += 1
    

#### Predicting

In [14]:
notPurchased = []
i = 0
while i < 200000:
    user = random.randint(0, 199999)
    product = random.randint(0, 199999)
    v = {'reviewerID' : 0,
        'itemID' : 0}
    v['reviewerID'] = trainTotal[user]['reviewerID']
    v['itemID'] = trainTotal[product]['itemID']
    if v['itemID'] not in usersItems[v['reviewerID']]:
        notPurchased.append(v)
        i += 1

In [15]:
validation = trainTotal + notPurchased

In [189]:
def pred(user, item):
    if items[item] == []:
        return 0
    else:
        numPurchases = items[item][0]
        itemRating = items[item][1]
        itemCat = items[item][2]
        itemPrice = items[item][3]
        
        if usersItems[user] == []:
            if itemRating > avgRating:
                return 1
            elif itemCat == 0 and numPurchases > avgPurchases:
                return 1
            else:
                return 0
        else:
            userData = usersFeatures[user]
            countCats = 0
            for j in range(0,5):
                if userData[2][j] != 0:
                    countCats += 1
                    
                    
            if numPurchases < 0.75*avgPurchases:
                return 0
            elif userData[2][itemCat] != 0:
                return 1
            #elif itemPrice > catFeats[itemCat][2]:
             #   return 0
            #elif itemRating < 0.75*catFeats[itemCat][1]:
             #   return 0
            #elif itemPrice > 2*avgPrice:
            #    return 0
            ##elif itemRating < 0.6*avgRating:
              #  return 0
            #elif userData[2][itemCat] != 0:
                #return 1
     #       elif itemPrice > 1.25*avgPrice:
    #            return 0
   #         elif numPurchases < 0.5*avgPurchases:
  #              return 0
 #           elif userData[2][itemCat] > 1.0*sum(userData[2])/len(userData[2]):
#                return 1
            #elif itemRating >= userData[1] and itemPrice <= userData[3]:
             #   return 1
            #elif mostPopular.index([item, items[item][0]]) < len(mostPopular)/20:
             #   return 1
            else:
                return 0
                

In [190]:
trainPreds = []
for v in train:
    user = v['reviewerID']
    item = v['itemID']
    trainPreds.append(pred(user, item))
    
print("Train Accuracy: " + str(sum(trainPreds)/len(train)))

Train Accuracy: 0.78943


In [191]:
validationPreds = []
validationY = []

for i in range(0, 200000):
    validationY.append(1 - i//100000)
for v in validation:
    user = v['reviewerID']
    item = v['itemID']
    validationPreds.append(pred(user, item))
    
accuracy = []
TP = 0
TN = 0
FP = 0
FN = 0
for i in range(0, len(validationY)):
    if validationPreds[i] == 1 and validationY[i] == 1:
        TP += 1
    elif validationPreds[i] == 1 and validationY[i] == 0:
        FP += 1
    elif validationPreds[i] == 0 and validationY[i] == 1:
        FN += 1
    elif validationPreds[i] == 0 and validationY[i] == 0:
        TN += 1
    accuracy.append(validationPreds[i] == validationY[i])
accuracy = sum(accuracy)/len(accuracy)
print("Validation Accuracy: " + str(accuracy))
print("TP = " + str(TP))
print("FP = " + str(FP))
print("FN = " + str(FN))
print("TN = " + str(TN))

Validation Accuracy: 0.52176
TP = 60259
FP = 55907
FN = 39741
TN = 44093


In [309]:
predictions = open("predictions_Purchase.txt", 'w')
for l in open("pairs_Purchase.txt"):
    if l.startswith("reviewerID"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')

    predictions.write(u + '-' + i + "," + str(pred(u, i)) + "\n")
    

predictions.close()

#### SVM

In [16]:
mostPopularItems = [v[0] for v in mostPopular[:len(mostPopular)//2]]

In [17]:
X = []
y = []
for v in validation:
    user = v['reviewerID']
    item = v['itemID']
    #userFeatures[user] = [userPurchases, avgRating, [userCats], avgPrice]
    userData = usersFeatures[user]
    isPopular = item in mostPopularItems
    #items[item] = [itemPurchases, avgRating, itemCat, itemPrice]
    if item in items and items[item] != []:
        nPurchs = items[item][0]
        itemCategory = items[item][2]
    else:
        nPurchs = 0
        itemCategory = 0
    X.append([itemCategory, isPopular, nPurchs])
    y.append(1 if len(v) != 2 else 0)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

In [30]:
clf = svm.LinearSVC(C=10)
clf.fit(X_train, y_train)
train_predictions = clf.predict(X_train) 
test_predictions = clf.predict(X_test)

In [31]:
trainAcc = []
testAcc = []
for i in range(0, len(train_predictions)):
    trainAcc.append(train_predictions[i] == y_train[i])
    testAcc.append(test_predictions[i] == y_test[i])
trainAcc = sum(trainAcc)/len(trainAcc)
testAcc = sum(testAcc)/len(testAcc)

In [32]:
print('Train Accuracy: ' + str(trainAcc))
print('Test Accuracy: ' + str(testAcc))

Train Accuracy: 0.498425
Test Accuracy: 0.501575


In [35]:
len([x for x in train_predictions if x == 1])

200000

In [57]:
X_test = []
for l in open("pairs_Purchase.txt"):
    if l.startswith("reviewerID"):
        continue
    u,i = l.strip().split('-')

    userData = usersFeatures[user]
    X_test.append(items[item] + [userData[0]] + [userData[1]] + [userData[3]])
    
y_test = clf.predict(X_test)

predictions = open("predictions_Purchase.txt", 'w')
index = 0
for l in open("pairs_Purchase.txt"):
    if l.startswith("reviewerID"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    
    predictions.write(u + '-' + i + "," + str(y_test[index]) + "\n")
    index += 1

predictions.close()

#### Linear Regression

In [None]:
theta,residuals,rank,s = numpy.linalg.lstsq(X_train, y_train)
train_predictions = X_train*theta
accuracy = []
for i in range(0, len(train_predictions)):
    accuracy.append(train_predictions[i] == y_train[i])
print(sum(accuracy)/len(accuracy))

test_predictions = X_test*theta
accuracy = []
for i in range(0, len(test_predictions)):
    accuracy.append(test_predictions[i] == y_test[i])
print(sum(accuracy)/len(accuracy))

#### Jaccard

In [17]:
itemUsers = defaultdict(list)
usersIndex = defaultdict(int)
itemsIndex = defaultdict(int)

i = 0
for u in usersItems:
    usersIndex[u] = i
    i += 1
    
i = 0
for item in items:
    itemsIndex[item] = i
    itemUsers[item] = [0] * len(usersIndex)
    i += 1

In [18]:
for v in train:
    item = v['itemID']
    u = v['reviewerID']
    rating = v['rating']
    itemUsers[item][usersIndex[u]] = rating

TypeError: 'int' object is not iterable

In [19]:
itemsJaccards = defaultdict(list)
for item in itemUsers:
    jaccard = []
    for i in itemUsers:
        if i != item:
            union = set()
            intersection = set()
            for j in range(0, len(itemUsers[item])):
                if(itemUsers[item][j] > 0 or itemUsers[i][j] > 0):
                    union.add(j)
                if(itemUsers[item][j] > 0 and itemUsers[i][j] > 0):
                    intersection.add(j)
            jaccard.append([i, len(intersection)/len(union)])
    jaccard = sorted(jaccard, key=lambda x:-x[1])
    itemsJaccards[item] = jaccard

KeyboardInterrupt: 

In [None]:
itemsJaccards

In [13]:
def pred(user, item):
    if items[item] == []:
        return 0
    else:
        numPurchases = items[item][0]
        itemRating = items[item][1]
        itemCat = items[item][2]
        itemPrice = items[item][3]
        
        if usersItems[user] == []:
            if itemRating > avgRating:
                return 1
            elif itemCat == 0 and numPurchases > avgPurchases:
                return 1
            else:
                return 0
        else:
            userPurchases = usersItems[user]
            
            if any(userPurchases in itemsJaccards[item][:10]):
                return 1
            else:
                return 0
                

In [134]:
trainPreds = []
for v in train:
    user = v['reviewerID']
    item = v['itemID']
    trainPreds.append(pred(user, item))
    
print("Train Accuracy: " + str(sum(trainPreds)/len(train)))

Train Accuracy: 0.465225


In [16]:
validationPreds = []
validationY = []

for i in range(0, 200000):
    validationY.append(1 - i//100000)
for v in validation:
    user = v['reviewerID']
    item = v['itemID']
    validationPreds.append(pred(user, item))
    
accuracy = []
TP = 0
TN = 0
FP = 0
FN = 0
for i in range(0, len(validationY)):
    if validationPreds[i] == 1 and validationY[i] == 1:
        TP += 1
    elif validationPreds[i] == 1 and validationY[i] == 0:
        FP += 1
    elif validationPreds[i] == 0 and validationY[i] == 1:
        FN += 1
    elif validationPreds[i] == 0 and validationY[i] == 0:
        TN += 1
    accuracy.append(validationPreds[i] == validationY[i])
accuracy = sum(accuracy)/len(accuracy)
print("Validation Accuracy: " + str(accuracy))
print("TP = " + str(TP))
print("FP = " + str(FP))
print("FN = " + str(FN))
print("TN = " + str(TN))

Validation Accuracy: 0.553015
TP = 50546
FP = 39943
FN = 49454
TN = 60057


In [108]:
predictions = open("predictions_Purchase.txt", 'w')
for l in open("pairs_Purchase.txt"):
    if l.startswith("reviewerID"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')

    predictions.write(u + '-' + i + "," + str(pred(u, i)) + "\n")
    

predictions.close()