In [40]:
import gzip
import random
import matplotlib.pyplot as plt
import string
import nltk
import numpy
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from collections import defaultdict

### Purchase

#### Splitting data

In [2]:
def readGz(f):
  for l in gzip.open(f):
    yield eval(l)

In [3]:
data = []
for l in readGz("train.json.gz"):
    data.append(l)

In [4]:
train = data[:100000]

#### Getting Users and Items features

In [5]:
items = defaultdict(list)
usersItems = defaultdict(list)
for v in train:
    user = v['reviewerID']
    itemID = v['itemID']
    usersItems[user].append(itemID)
    if(items[itemID] == []):
        if('price' in v):
            items[itemID] = [1, v['rating'], v['categoryID'], v['price']]
        else:
            items[itemID] = [1, v['rating'], v['categoryID'], -1]
    else:
        if(items[itemID][3] == -1 and 'price' in v):
            items[itemID][3] = v['price']
        items[itemID][0] += 1
        items[itemID][1] = (items[itemID][1]*(items[itemID][0]-1) + v['rating'] )/items[itemID][0]

In [6]:
usersFeatures = defaultdict(list)
for user in usersItems:
    avgPrice = []
    avgRating = []
    #getting number of items bought per category
    uCats = [0,0,0,0,0]
    for i in [items[x] for x in usersItems[user]]:
        itemRating = i[1]
        itemCat = i[2]
        itemPrice = i[3]
        
        avgRating.append(itemRating)
        uCats[itemCat] += 1
        if itemPrice != -1:
            avgPrice.append(itemPrice)
            
    avgRating = sum(avgRating)/len(avgRating)
    if(avgPrice != []):
        avgPrice = sum(avgPrice)/len(avgPrice)
    else:
        avgPrice = -1
    
    usersFeatures[user] = [len(usersItems[user]), avgRating, uCats, avgPrice]

In [7]:
# overall data
avgRating = sum([v['rating'] for v in train])/len(train)
avgPrice = sum([v['price'] for v in train if 'price' in v])/len([v['price'] for v in train if 'price' in v])

In [8]:
avgPurchases = 0
for i in items:
    avgPurchases += items[i][0]
avgPurchases /= len(items)

In [9]:
mostPopular = []
for item in items:
    itemPurchases = items[item][0]
    mostPopular.append([item, itemPurchases])
mostPopular = sorted(mostPopular, key=lambda x:-x[1])

### Question 6

In [20]:
#nltk.download('stopwords')
stops = set(nltk.corpus.stopwords.words("english"))

In [21]:
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for v in train:
    r = ''.join([c for c in v['reviewText'].lower() if not c in punctuation])
    for w in r.split():
        if w not in stops:
            wordCount[w] += 1

In [22]:
wordCount = [(wordCount[w], w) for w in wordCount]
wordCount.sort()
wordCount.reverse()

In [23]:
totalWords = sum([v[0] for v in wordCount])

In [24]:
frequency = [(v[0]/totalWords, v[1]) for v in wordCount]

In [26]:
train = data[:10000]

In [27]:
catWordCount = [defaultdict(int), defaultdict(int), defaultdict(int), defaultdict(int), defaultdict(int)]
catTotalWords = [0, 0, 0, 0, 0]
catFrequency = []
for i in range(0, 5):
    for v in [v for v in train if v['categoryID'] == i]:
        r = ''.join([c for c in v['reviewText'].lower() if not c in punctuation])
        for w in r.split():
            if w not in stops:
                catWordCount[i][w] += 1
    
    catWordCount[i] = [(catWordCount[i][w], w) for w in catWordCount[i]]
    catWordCount[i].sort()
    catWordCount[i].reverse()
    
    catTotalWords[i] = sum([v[0] for v in catWordCount[i]])
    catFrequency.append([(v[0]/catTotalWords[i], v[1]) for v in catWordCount[i]])

In [28]:
for i in range(0, 5):
    print('CategoryID = ' + str(i))
    for v in catFrequency[i][:10]:
        f = v[0] - [x[0] for x in frequency if x[1] == v[1]][0]
        print(v[1] + ' - ' + str(f))
    print()

CategoryID = 0
like - 0.0007435798517148318
size - 0.000987219988863773
wear - 0.0015023452869715407
fit - 0.00019422137192087914
love - 0.001970423975895151
great - -1.5671500178185416e-06
would - 0.0005144242690138393
comfortable - 0.000617626027785232
good - -0.0007634278315510291
well - -0.0007377992204788096

CategoryID = 1
great - 0.0009385949786824225
like - -0.0011315121688540743
fit - -0.00017393209593557016
good - 0.0023114784103487147
size - -0.0025232482613693555
well - 0.0009425016044954538
wear - -0.003049354818995994
comfortable - -0.00034845593192095545
shoes - 0.00017643356198081533
one - -0.00016550243465549008

CategoryID = 2
size - 0.002295690578360184
great - 0.002036195219520146
daughter - 0.010265789015847116
fit - 3.096993744747083e-05
like - -0.0012394986791288646
well - 0.0026628004880501228
old - 0.007435301628700777
would - 0.0005125431406813831
small - 0.0037091593080769945
bought - 0.002528429957027946

CategoryID = 3
son - 0.014237189969292501
old - 0.013

### Question 7

In [30]:
validation = subSet[10000:20000]

In [31]:
X_train = []
y_train = []
X_validation = []
y_validation = []
for v in train:
    if v['categoryID'] != 0:
        v['categoryID'] = 1
    features = []
    for i in range(0, 500):
        features.append(frequency[i][1] in v['reviewText'])
    features.append(v['rating'])
    y_train.append(v['categoryID'])
    X_train.append(features)
    
        
for v in validation:
    if v['categoryID'] != 0:
        v['categoryID'] = 1
    features = []
    for i in range(0, 500):
        features.append(frequency[i][1] in v['reviewText'])
    features.append(v['rating'])
    y_validation.append(v['categoryID'])
    X_validation.append(features)

In [46]:
kmeans = KMeans(n_clusters=5).fit(X_train, y_train)

In [47]:
trainPred = kmeans.predict(X_train)

In [32]:
thetas = []

theta,residuals,rank,s = numpy.linalg.lstsq(X_train, y_train)

  """Entry point for launching an IPython kernel.


In [34]:
trainPred = []
for v in X_train:
    trainPred.append(sum(v*theta))

In [36]:
for i in range(0, len(trainPred)):
    if trainPred[i] < 0.5:
        trainPred[i] = 0
    else:
        trainPred[i] = 1
    

In [48]:
accuracy_score(y_train, trainPred)

0.1733

In [126]:
test = []
for l in readGz("test_Category.json.gz"):
    test.append(l)

In [127]:
X_test=[]
for v in test:
    features = []
    for i in range(0, 500):
        features.append(frequency[i][1] in v['reviewText'])
    X_test.append(features)

In [50]:
preds = clf.predict(X_test)

In [53]:
predictions = open("predictions_Category.txt", 'w')
i = 0
for l in open("pairs_Category.txt"):
    if l.startswith("reviewerID"):
        #header
        predictions.write(l)
        continue
    u,r = l.strip().split('-')
    
    
    predictions.write(u + '-' + r + ',' + str(preds[i]) + '\n')
    i += 1
predictions.close()

The best performance was an accuracy of 0.7933 using C = 10

### Question 8

In [57]:
newTrain = train[:10000]
newValidation = validation[:10000]

In [61]:
allPredictions = []
allSvms = []
realValidationY = [v['categoryID'] for v in newValidation]
for c in [0.1]:#, 0.1, 1, 10, 100]:
    svms = []
    print(str(c))
    for i in range(0, 5):
        print('Category = ' + str(i))
        X_train = []
        y_train = []
        X_validation = []
        y_validation = []
        for v in newTrain:
            if v['categoryID'] != i:
                v['categoryID'] = (i+1)%5
            features = []
            for j in range(0, 500):
                features.append(frequency[j][1] in v['reviewText'])
            y_train.append(v['categoryID'])
            X_train.append(features)


        for v in newValidation:
            if v['categoryID'] != i:
                v['categoryID'] = (i+1)%5
            features = []
            for j in range(0, 500):
                features.append(frequency[j][1] in v['reviewText'])
            y_validation.append(v['categoryID'])
            X_validation.append(features)

        test_accuracies = []

        clf = svm.LinearSVC(C=c)
        clf.fit(X_train, y_train)
        print('Fit done')
        svms.append(clf)
        
    test_predictions = []
    print('Predicting')
    for v in X_validation:
        confidency = [svms[j].decision_function([v]) for j in range(0, 5)]
        posConf = [x for x in confidency if x > 0]
        posConf.sort()
        if posConf == []:
            test_predictions.append(0)
        else:
            test_predictions.append(confidency.index(posConf[0]))
    allPredictions.append([c, test_predictions])
    allSvms.append(svms)
    print('Finished')
    print()

0.1
Category = 0
Fit done
Category = 1
Fit done
Category = 2
Fit done
Category = 3
Fit done
Category = 4
Fit done
Predicting
Finished



In [62]:
test_accuracies = []
for v in allPredictions:
    predicted = v[1]
    accuracy = []
    for i in range(0, len(predicted)):
        accuracy.append(predicted[i] == realValidationY[i])
    test_accuracies.append((v[0], sum(accuracy)/len(accuracy)))

In [63]:
test_accuracies

[(0.1, 0.1528)]

In [15]:
test = []
for l in readGz("test_Category.json.gz"):
    test.append(l)

In [19]:
test[2]

{'reviewTime': '03 21, 2012',
 'reviewText': 'I wanted a formal watch that had a big face and this definitely was the coolest watch out there with a great price! I have gotten many compliments on this watch. You will have to go to a jeweler or watch store to get it sized once you get it. My only complaint is that the turn dial on the face makes squeaky noises when you move around the watch.',
 'helpful': {'nHelpful': 0, 'outOf': 0},
 'reviewerID': 'U433746872',
 'reviewHash': 'R750304163',
 'unixReviewTime': 1332288000,
 'rating': 5.0,
 'summary': 'Bold, Large-face Watch'}

In [56]:
X_test=[]
for v in test:
    features = []
    for i in range(0, 500):
        features.append(frequency[i][1] in v['reviewText'])
    X_test.append(features)

In [60]:
predictions = open("predictions_Category.txt", 'w')
i = 0
for l in open("pairs_Category.txt"):
    if l.startswith("reviewerID"):
        #header
        predictions.write(l)
        continue
    u,r = l.strip().split('-')
    
    confidency = [allSvms[3][j].decision_function([X_test[i]]) for j in range(0, 5)]
    posConf = [x for x in confidency if x > 0]
    posConf.sort()
    if posConf == []:
        predictions.write(u + '-' + r + ',0\n')
    else:
        predictions.write(u + '-' + r + ',' + str(confidency.index(posConf[0])) + '\n')
    i += 1
predictions.close()

In [104]:
categories = [[],[],[],[],[]]
for v in train:
    cats = [y for x in v['categories'] for y in x]
    for x in cats:
        categories[v['categoryID']].append(x)

In [111]:
for i in range(0,5):
    categories[i] = list(set(categories[i]))

In [118]:
for i in range(0,5):
    for cat in categories[i]:
        for j in range(0,5):
            if i != j:
                if cat in categories[j]:
                    categories[j].remove(cat)
                if cat in categories[i]:
                    categories[i].remove(cat)

In [119]:
categories

[['Berets',
  'Plugs',
  'Babydolls',
  'Marc by Marc Jacobs',
  'Wedding',
  'LifeStride',
  'Shoulder Bags',
  'Brooches & Pins',
  'LeSportsac',
  'Rafters',
  'Multitools',
  'Miz Mooz',
  'Boy Meets Girl',
  'California Magdesians',
  'Blazers',
  'Easy Street',
  'E',
  'Luggage & Travel Gear',
  'Anne Klein',
  'Socks & Hosiery',
  'Food Service',
  'Bodysuits',
  'EMU Australia',
  'Drop & Dangle',
  'Trench & Rain',
  'Down & Parkas',
  'The SAK',
  'Shells',
  'Swatch Watches',
  'GUESS? Watches',
  'Bustiers & Corsets',
  'Anniversary Rings',
  'Bead',
  'Exotic Apparel',
  'Z',
  'Bomber Hats',
  'ECCO',
  'Capri Pants',
  'Pearl Strands',
  'Football',
  'Yoga',
  'BCBGeneration',
  'Tracksuits',
  'Wrap',
  'Night Out & Cocktail',
  'Barbells',
  'Tights',
  'Rocket Dog',
  'Betsey Johnson',
  'Dr. Martens',
  'Fine Jewelry',
  'N.Y.L.A.',
  'Suits & Blazers',
  'Tights & Hosiery',
  'Envirosax',
  'Other Sports',
  'Motorcycle',
  'Pendants',
  'STEVEN by Steve Madden',


In [128]:
predictions = open("predictions_Category.txt", 'w')
i = 0
for l in open("pairs_Category.txt"):
    if l.startswith("reviewerID"):
        #header
        predictions.write(l)
        continue
    u,r = l.strip().split('-')
    
    cats = test[i]['categories']
    cats = [y for x in cats for y in x]
    predCat = -1
    for cat in cats:
        if cat in categories[0]:
            predCat = 0
            break
        elif cat in categories[1]:
            predCat = 1
            break
        elif cat in categories[2]:
            predCat = 2
            break
        elif cat in categories[3]:
            predCat = 3
            break
        elif cat in categories[4]:
            predCat = 4
            break
    
    if predCat == -1:    
        if u in usersFeatures:
            predictions.write(u + '-' + r + "," + str(usersFeatures[u][2].index(max(usersFeatures[u][2]))) + "\n")
        else:
            predictions.write(u + '-' + r + ",0\n")
    else:
        predictions.write(u + '-' + r + ',' + str(predCat) + '\n')
    i += 1
predictions.close()

KeyError: 'categories'

In [129]:
test[i]

{'reviewTime': '07 26, 2013',
 'reviewText': 'I love this blouse, in fact I have it on right now....all of my friends like it and want one.',
 'helpful': {'nHelpful': 9, 'outOf': 9},
 'reviewerID': 'U281659737',
 'reviewHash': 'R934811302',
 'unixReviewTime': 1374796800,
 'rating': 5.0,
 'summary': 'love it'}

In [149]:
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)
trainPreds = knn.predict(X_train)

In [150]:
accuracy_score(y_train, trainPreds)

0.7353

In [151]:
testPreds = knn.predict(X_test)

In [152]:
predictions = open("predictions_Category.txt", 'w')
i = 0
for l in open("pairs_Category.txt"):
    if l.startswith("reviewerID"):
        #header
        predictions.write(l)
        continue
    u,r = l.strip().split('-')
    
    
    predictions.write(u + '-' + r + ',' + str(testPreds[i]) + '\n')
    i += 1
predictions.close()