In [1]:
import math
import numpy as np
import pandas as pd
import random
import urllib
import nltk
import string
from nltk.stem.porter import *
from collections import defaultdict
from sklearn import linear_model
from collections import Counter

In [2]:
def parseData(filename):
    for l in urllib.urlopen(filename):
        yield eval(l)
        
print("Reading data...")
data = list(parseData("http://jmcauley.ucsd.edu/cse190/data/beer/beer_50000.json"))[:5000]
print("Done...")

Reading data...
Done...


In [3]:
punctuation = set(string.punctuation)
stemmer = PorterStemmer()

def feat(datum):
    r = ''.join([c for c in d['review/text'].lower() if c not in punctuation])
    return r

processed_data = [feat(d) for d in data]

### Question 1

In [4]:
bigrams = [b for l in processed_data for b in zip(l.split(" ")[:-1], l.split(" ")[1:])]
print("Number of unique bigrams in all revies: ")+str(len(set(bigrams)))

Number of unique bigrams in all revies: 195795


In [5]:
bigramCount = defaultdict(int)
for i in bigrams:
    bigramCount[i] += 1

freq = [(bigramCount[i],i) for i in bigramCount]
freq.sort(reverse = True)

print "Most frequent 5 bigrams:"
freq[:6]

Most frequent 5 bigrams:


[(4582, ('with', 'a')),
 (2576, ('in', 'the')),
 (2242, ('of', 'the')),
 (2053, ('is', 'a')),
 (2022, ('on', 'the')),
 (1878, ('a', 'bit'))]

### Question 2

In [6]:
bigrams_sets = [x[1] for x in freq[:1000]]
bigramID = dict(zip(bigrams_sets, range(len(bigrams_sets))))
uniqueBigrams = set(bigrams_sets)

In [7]:
def feature(datum):
    feat = [0]*len(uniqueBigrams)
    r = [b for b in zip(datum.split(" ")[:-1], datum.split(" ")[1:])]
    for bg in r:
        if bg in bigrams_sets:
            feat[bigramID[bg]] += 1
    feat.append(1) #offset
    return feat

X = [feature(d) for d in processed_data]
y = [d['review/overall'] for d in data]        

In [8]:
clf = linear_model.Ridge(1.0, fit_intercept = False)
clf.fit(X,y)
theta = clf.coef_
predictions = clf.predict(X)
diff = np.array(y) - np.array(predictions)
MSE = float(sum(diff*diff.T))/len(predictions)
print("MSE = ")+str(MSE)

MSE = 0.346067568549


### Question 3

In [13]:
d_foam, d_smell, d_banana, d_lactic, d_tart = 0, 0, 0, 0, 0
for r in processed_data:
    if 'foam' in r.split():
        d_foam += 1
    if 'smell' in r.split():
        d_smell += 1
    if 'banana' in r.split():
        d_banana += 1
    if 'lactic' in r.split():
        d_lactic += 1
    if 'tart' in r.split():
        d_tart += 1

tf = defaultdict(int)
for w in processed_data[0].split():
    tf[w] += 1
    
N = float(len(processed_data))

print('IDF foam = ') + str(math.log10(N/d_foam))
print('IDF smell = ') + str(math.log10(N/d_smell))
print('IDF banana = ') + str(math.log10(N/d_banana))
print('IDF lactic = ') + str(math.log10(N/d_lactic))
print('IDF tart = ') + str(math.log10(N/d_tart))

print('TF-IDS score for foam: ') + str(tf['foam']*math.log10(N/d_foam))
print('TF-IDS score for smell: ') + str(tf['smell']*math.log10(N/d_smell))
print('TF-IDS score for banana: ') + str(tf['banana']*math.log10(N/d_banana))
print('TF-IDS score for lactic: ') + str(tf['lactic']*math.log10(N/d_lactic))
print('TF-IDS score for tart: ') + str(tf['tart']*math.log10(N/d_tart))

IDF foam = 1.13786862069
IDF smell = 0.537901618865
IDF banana = 1.67778070527
IDF lactic = 2.92081875395
IDF tart = 1.80687540165
TF-IDS score for foam: 2.27573724137
TF-IDS score for smell: 0.537901618865
TF-IDS score for banana: 3.35556141053
TF-IDS score for lactic: 5.8416375079
TF-IDS score for tart: 1.80687540165


### Question 4

In [80]:
wordCount = defaultdict(int)
for d in processed_data:
    for w in d.split():
        wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
words = [x[1] for x in counts]

wordId = dict(zip(words, range(len(words))))
wordSet = set(words)


tf1 = Counter(processed_data[0].split())
tf2 = Counter(processed_data[1].split())

In [81]:
df = defaultdict(int)
N = float(len(processed_data))

for l in processed_data:
    for w in wordSet:
        if w in l.split():
            df[w] += 1

In [83]:
tfidf1, tfidf2 = [], []

for w in wordSet:
    tfidf1.append(tf1[w]*np.log10(N/df[w]))
    tfidf2.append(tf2[w]*np.log10(N/df[w]))

from scipy import spatial
cosine_similarity = 1 - spatial.distance.cosine(tfidf1, tfidf2)
print("Cosine similarity between review 1 and 2 is ")+str(cosine_similarity)

Cosine similarity between review 1 and 2 is 0.06588193974744383


### Question 5

In [87]:
i = 1
best_cosine_similarity = cosine_similarity

for l in processed_data[1:]:
    tf = Counter(l.split())
    tfidf = []
    for w in wordSet:
        tfidf.append(tf[w]*np.log10(N/df[w]))
    cosine_similarity_new = 1 - spatial.distance.cosine(tfidf1, tfidf)
    if cosine_similarity_new > best_cosine_similarity:
        best_cosine_similarity = cosine_similarity_new
        rev_num = i
    i += 1

In [98]:
print("Review one is most similar to ")+str(rev_num)+(" having cosine similarity ")+ str(best_cosine_similarity)
print("Beer ID: ") + data[rev_num]['beer/beerId']
print("Profile Name: ") + data[rev_num]['user/profileName']
print("Review: ") + data[rev_num]['review/text']


Review one is most similar to 2343 having cosine similarity 0.2968679537499197
Beer ID: 72146
Profile Name: spicelab
Review: 750mL bottle thanks to Chris@Slowbeer. Poured into a Lost Abbey stemmed tulip.		Golden orange, close to translucent (on the first pour at least), capped by a sizable white, typically Belgian-looking head. Good lacing.		Quite strong lactic notes and a sharp organic funk. Pungent stuff. Underneath is bitter citrus pith, floral spice and a hint of sweet esters. In your face with a lot going on. Only issue is the lactic character verges on turning my stomach.		More citric sourness and a bit less lactic character. Grapefruit and lemon rind are prominent, as is the Nelson Sauvin vegetative character, which kind of adheres to the yeast and barnyard funk. Tropical melons and honey provide some sweetness. Decent peppery tang.		Medium, lightly syrupy body with lowish carbonation and a moderately tart, dry finish that has some length to it.		Incomparable to anything I've tr

### Question 6

In [104]:
counts.sort(reverse = True)
words = [x[1] for x in counts[:1000]]
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

def tfidf(datum):
    feat = []
    tf = Counter(datum.split())
    for w in wordSet:
        feat.append(tf[w]*np.log10(N/df[w]))
    feat.append(1)
    return feat

X = [tfidf(d) for d in processed_data]
y = [d['review/overall'] for d in data]  

In [105]:
clf = linear_model.Ridge(1.0, fit_intercept = False)
clf.fit(X,y)
theta = clf.coef_
predictions = clf.predict(X)
diff = np.array(y) - np.array(predictions)
MSE = float(sum(diff*diff.T))/len(predictions)
print("MSE = ")+str(MSE)

MSE = 0.278759560078


### Question 7