In [437]:
import gzip
import math
import numpy as np
import random
import sklearn
import string
from collections import defaultdict
from nltk.stem.porter import *
from sklearn import linear_model
from gensim.models import Word2Vec
import dateutil
from scipy.sparse import lil_matrix # To build sparse feature matrices, if you like
import re

import warnings
warnings.filterwarnings('ignore')

In [293]:
answers = {}

In [294]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [295]:
### Question 1

In [296]:
dataset = []

f = gzip.open("data/steam_category.json.gz")
for l in f:
    d = eval(l)
    dataset.append(d)
    if len(dataset) >= 20000:
        break
        
f.close()

In [297]:
Ntrain = 10000
Ntest = 10000

dataTrain = dataset[:Ntrain]
dataTest = dataset[Ntrain:Ntrain + Ntest]

In [298]:
sp = set(string.punctuation)

In [299]:
dataTrain[0]

{'userID': 'u74382925',
 'genre': 'Adventure',
 'early_access': False,
 'reviewID': 'r75487422',
 'hours': 4.1,
 'text': 'Short Review:\nA good starting chapter for this series, despite the main character being annoying (for now) and a short length. The story is good and actually gets more interesting. Worth the try.\nLong Review:\nBlackwell Legacy is the first on the series of (supposedly) 5 games that talks about the main protagonist, Rosangela Blackwell, as being a so called Medium, and in this first chapter we get to know how her story will start and how she will meet her adventure companion Joey...and really, that\'s really all for for now and that\'s not a bad thing, because in a way this game wants to show how hard her new job is, and that she cannot escape her destiny as a medium.\nMy biggest complain for this chapter, except the short length, it\'s the main protagonist being a "bit" too annoying to be likeable, and most of her dialogues will always be about complaining or just

In [300]:
allTrainText = [datum['text'] for datum in dataTrain]

In [301]:
# Add \ before certain punctuations as they represent special characters in regex

specialChars = ['.', '*', '?', '+', '^', '$', '[', ']', '(', ')']
spGrouped = '|'.join(sp)

saw = []
for _ in range(len(spGrouped)):
    if (spGrouped[_] in specialChars) and (spGrouped[_] not in saw):
        saw.append(spGrouped[_])
        spGrouped = spGrouped[:_]+'\\'+spGrouped[_:]

In [302]:
pattern = rf'{spGrouped}'

In [303]:
cleanTrainText = []

for _ in range(len(allTrainText)):
    newString = re.sub(pattern, '', allTrainText[_])
    cleanTrainText.append(newString.lower())

In [304]:
# Find most common words in train text corpus

commonWords = defaultdict(int)

for t in cleanTrainText:
    words = t.split()
    for w in words:
        if w == 'i':
            allIs.append(w)
        commonWords[w] += 1

In [305]:
commonWords = sorted(commonWords.items(), key=lambda x:x[1], reverse=True)

In [306]:
commonWords[:10]

[('the', 34211),
 ('and', 19392),
 ('a', 18791),
 ('to', 18077),
 ('game', 15043),
 ('of', 14095),
 ('is', 13000),
 ('you', 12735),
 ('i', 12201),
 ('it', 11824)]

In [307]:
counts = commonWords[:10]

In [308]:
counts

[('the', 34211),
 ('and', 19392),
 ('a', 18791),
 ('to', 18077),
 ('game', 15043),
 ('of', 14095),
 ('is', 13000),
 ('you', 12735),
 ('i', 12201),
 ('it', 11824)]

In [356]:
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in dataTrain:
  r = ''.join([c for c in d['text'].lower() if not c in punctuation])
  for w in r.split():
    wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

In [357]:
answers['Q1'] = counts[:10]

In [311]:
assertFloatList([x[0] for x in answers['Q1']], 10)

In [312]:
### Question 2

In [313]:
NW = 1000 # dictionary size

In [314]:
words = [_[1] for _ in countsClassCode[:1000]]

In [316]:
# Build X...

wordID = dict(zip(words, range(len(words))))

def feat_q2(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in words:
            feat[wordID[w]] += 1
    return feat

X = [feat_q2(datum) for datum in dataset]

In [317]:
y = [datum['genreID'] for datum in dataset]

In [318]:
Xtrain = X[:Ntrain]
ytrain = y[:Ntrain]
Xtest = X[Ntrain:]
ytest = y[Ntrain:]

In [319]:
mod = linear_model.LogisticRegression(C=1)

In [320]:
mod.fit(Xtrain, ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [321]:
preds = mod.predict(Xtest)

In [322]:
correct = preds == ytest

In [326]:
answers['Q2'] = sum(correct) / len(correct)

In [327]:
assertFloat(answers['Q2'])

In [328]:
answers

{'Q1': [(34211, 'the'),
  (19392, 'and'),
  (18791, 'a'),
  (18077, 'to'),
  (15043, 'game'),
  (14095, 'of'),
  (13000, 'is'),
  (12735, 'you'),
  (12204, 'i'),
  (11824, 'it')],
 'Q2': 0.6374}

In [329]:
### Question 3

In [330]:
targetWords = ['character', 'game', 'length', 'a', 'it']

In [331]:
docFrequency = defaultdict(int)
for datum in dataTrain:
    r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
    for w in set(r.split()):
        docFrequency[w] += 1

In [346]:
def clean(text):
    r = ''.join([c for c in text.lower() if not c in punctuation])
    return r

In [347]:
# Term frequency for words in the first text corpus in train set

termFreq = defaultdict(int)
t = clean(dataTrain[0]['text'])
for w in t.split():
    termFreq[w] += 1

In [348]:
q3Answer = []
for w in targetWords:
    idf = math.log10(len(dataTrain) / docFrequency[w])
    tfidf = termFreq[w] * math.log10(len(dataTrain) / docFrequency[w])
    q3Answer.append((idf, tfidf))

In [350]:
answers['Q3'] = q3Answer

In [351]:
assertFloatList([x[0] for x in answers['Q3']], 5)
assertFloatList([x[1] for x in answers['Q3']], 5)

In [352]:
answers

{'Q1': [(34211, 'the'),
  (19392, 'and'),
  (18791, 'a'),
  (18077, 'to'),
  (15043, 'game'),
  (14095, 'of'),
  (13000, 'is'),
  (12735, 'you'),
  (12204, 'i'),
  (11824, 'it')],
 'Q2': 0.6374,
 'Q3': [(1.453457336521869, 1.453457336521869),
  (0.22951619056889208, 0.45903238113778416),
  (2.2441251443275085, 4.488250288655017),
  (0.3047810810948491, 2.4382486487587927),
  (0.376647318462008, 1.129941955386024)]}

In [None]:
### Question 4

In [361]:
topWords = [_[1] for _ in counts[:1000]]

In [364]:
wordID = dict(zip(topWords, range(len(topWords))))

In [368]:
df = defaultdict(int)

for datum in dataTrain:
    text = clean(datum['text'])
    for w in set(text.split()):
        if w in topWords:
            df[w] += 1

In [401]:
def feat_q4(text):
    feat = []
    text = clean(text)
    tf = defaultdict(int)

    for w in text.split():
        tf[w] += 1
    for w in topWords:
        tfidf = tf[w] * math.log10(len(dataTrain) / df[w])
        feat.append(tfidf)

    return feat

In [402]:
# Build X and y...

X = [feat_q4(datum['text']) for datum in dataset]
y = [datum['genreID'] for datum in dataset]

In [403]:
Xtrain = X[:Ntrain]
ytrain = y[:Ntrain]
Xtest = X[Ntrain:]
ytest = y[Ntrain:]

In [404]:
mod = linear_model.LogisticRegression(C=1)

In [405]:
mod.fit(Xtrain, ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [406]:
preds = mod.predict(Xtest)
correct = preds == ytest

In [407]:
answers['Q4'] = sum(correct) / len(correct)

In [408]:
assertFloat(answers['Q4'])

In [409]:
answers

{'Q1': [(34211, 'the'),
  (19392, 'and'),
  (18791, 'a'),
  (18077, 'to'),
  (15043, 'game'),
  (14095, 'of'),
  (13000, 'is'),
  (12735, 'you'),
  (12204, 'i'),
  (11824, 'it')],
 'Q2': 0.6374,
 'Q3': [(1.453457336521869, 1.453457336521869),
  (0.22951619056889208, 0.45903238113778416),
  (2.2441251443275085, 4.488250288655017),
  (0.3047810810948491, 2.4382486487587927),
  (0.376647318462008, 1.129941955386024)],
 'Q4': 0.6217}

In [410]:
### Question 5

In [414]:
def Cosine(x1,x2):
    numer, norm1, norm2 = 0, 0, 0
    for a1,a2 in zip(x1,x2):
        numer += a1*a2
        norm1 += a1**2
        norm2 += a2**2
    if norm1*norm2:
        return numer / math.sqrt(norm1*norm2)
    return 0

In [418]:
similarities = {}
firstInTest = feat_q4(dataTrain[0]['text'])

for datum in dataTest:
    feat = feat_q4(datum['text'])
    sim = Cosine(firstInTest, feat)
    similarities[datum['reviewID']] = sim

In [420]:
# similarities.sort(reverse=True)

In [422]:
similarities = sorted(similarities.items(), key=lambda x:x[1], reverse=True)

In [425]:
answers['Q5'] = (similarities[0][1], similarities[0][0])

In [426]:
assertFloat(answers['Q5'][0])

In [427]:
answers

{'Q1': [(34211, 'the'),
  (19392, 'and'),
  (18791, 'a'),
  (18077, 'to'),
  (15043, 'game'),
  (14095, 'of'),
  (13000, 'is'),
  (12735, 'you'),
  (12204, 'i'),
  (11824, 'it')],
 'Q2': 0.6374,
 'Q3': [(1.453457336521869, 1.453457336521869),
  (0.22951619056889208, 0.45903238113778416),
  (2.2441251443275085, 4.488250288655017),
  (0.3047810810948491, 2.4382486487587927),
  (0.376647318462008, 1.129941955386024)],
 'Q4': 0.6217,
 'Q5': (0.48266773388727985, 'r65720011')}

In [None]:
### Question 6

In [442]:
cValues = np.linspace(0.1, 1, 5)
bestAcc = None

for c in cValues:
    mod = linear_model.LogisticRegression(C=c)
    mod.fit(Xtrain, ytrain)
    preds = mod.predict(Xtest)
    correct = preds == ytest
    acc = sum(correct) / len(correct)
    print(acc)
    if acc > max(answers['Q2'], answers['Q4']):
        print(f"Found better c: c = {c}, acc = {acc}")
        if bestAcc is None: 
            bestAcc = acc
        elif bestAcc < acc:
            bestAcc = acc

0.6455
Found better c: c = 0.1, acc = 0.6455
0.6318
0.6267
0.6235
0.6217


In [443]:
answers['Q6'] = bestAcc

In [444]:
assertFloat(answers['Q6'])

In [445]:
answers

{'Q1': [(34211, 'the'),
  (19392, 'and'),
  (18791, 'a'),
  (18077, 'to'),
  (15043, 'game'),
  (14095, 'of'),
  (13000, 'is'),
  (12735, 'you'),
  (12204, 'i'),
  (11824, 'it')],
 'Q2': 0.6374,
 'Q3': [(1.453457336521869, 1.453457336521869),
  (0.22951619056889208, 0.45903238113778416),
  (2.2441251443275085, 4.488250288655017),
  (0.3047810810948491, 2.4382486487587927),
  (0.376647318462008, 1.129941955386024)],
 'Q4': 0.6217,
 'Q5': (0.48266773388727985, 'r65720011'),
 'Q6': 0.6455}

In [446]:
### Question 7

In [447]:
import dateutil.parser

In [454]:
dataset = []

f = gzip.open("data/young_adult_20000.json.gz")
for l in f:
    d = eval(l)
    # print(d['date_added'])
    # print(type(d['date_added']))
    d['datetime'] = dateutil.parser.parse(d['date_added'])
    # print(d['datetime'])
    # print(type(d['datetime']))
    dataset.append(d)
    if len(dataset) >= 20000:
        break
        
f.close()

In [455]:
dataset[0]

{'user_id': 'dc3763cdb9b2cae805882878eebb6a32',
 'book_id': '18471619',
 'review_id': '66b2ba840f9bd36d6d27f46136fe4772',
 'rating': 3,
 'review_text': 'Sherlock Holmes and the Vampires of London \n Release Date: April 2014 \n Publisher: Darkhorse Comics \n Story by: Sylvain Cordurie \n Art by: Laci \n Colors by: Axel Gonzabo \n Cover by: Jean Sebastien Rossbach \n ISDN: 9781616552664 \n MSRP: $17.99 Hardcover \n "Sherlock Holmes died fighting Professor Moriarty in the Reichenbach Falls. \n At least, that\'s what the press claims. \n However, Holmes is alive and well and taking advantage of his presumed death to travel the globe. \n Unfortunately, Holmes\'s plans are thwarted when a plague of vampirism haunts Britain. \n This book collects Sherlock Holmes and the Vampires of London Volumes 1 and 2, originally created by French publisher Soleil." - Darkhorse Comics \n When I received this copy of "Sherlock Holmes and the Vampires of London" I was Ecstatic! The cover art was awesome and 

In [459]:
reviewsPerUser = defaultdict(list)
for r in dataset:
    uid, bid = r['user_id'], r['book_id']
    reviewsPerUser[uid].append(bid)

In [462]:
reviewLists = []
for u in reviewsPerUser:
    rl = list(reviewsPerUser[u])
    rl.sort()
    reviewLists.append(rl)

In [464]:
model5 = Word2Vec(reviewLists,
                  min_count=1, # Words/items with fewer instances are discarded
                  vector_size=5, # Model dimensionality
                  window=3, # Window size
                  sg=1) # Skip-gram model

In [466]:
firstRev = dataset[0]['book_id']

In [467]:
firstRev

'18471619'

In [468]:
res = model5.wv.similar_by_word(firstRev)

In [470]:
answers['Q7'] = res[:5]

In [472]:
assertFloatList([x[1] for x in answers['Q7']], 5)

In [473]:
f = open("answers_hw4.txt", 'w')
f.write(str(answers) + '\n')
f.close()