In [1]:
import psycopg2
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
import nltk.tokenize as tk
import nltk
import string
import gensim
from gensim import corpora, models, similarities
from langdetect import DetectorFactory
from langdetect import detect
from datetime import datetime

DetectorFactory.seed = 0



In [2]:
## Cleaning and Preprocessing

stop = set(stopwords.words('english'))
additional_exclude_words = [u'beer', u'one', u'like', u'well', u'really', u'very', u'it\x92s', 
                            u'it', u'beers',u'would',u'taste',u'tastes',u'get',u'i\x92m',u'quite', u'i\x92ve',
                            u'bit',u'much',u'good',u'better',u'think',u'first',u'new',u'try', u'updated']
stop |= set(additional_exclude_words)
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(doc, name_tokens):
    #stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = " ".join([ch for ch in doc.lower().split() if ch not in exclude])
    normalized = " ".join([lemma.lemmatize(word) for word in punc_free.split()])
    tokens = tk.word_tokenize(normalized)
    tokens_with_pos = [x[0] for x in nltk.pos_tag(tokens) if x[1] in ('NN','JJ') and 
                       x[0] not in name_tokens and len(x[0])>2 and x[0] not in stop]
    return tokens_with_pos

In [3]:
## Establishing Connection
try:
    conn = psycopg2.connect('''
    dbname='brew' user='rthakur' 
    host='capstone-brewing.cyol5m7lekm6.us-east-1.rds.amazonaws.com' 
    password='brew'
    ''')
except:
    print "Unable to connect to the database"

In [2]:
dictionary = corpora.Dictionary.load('../../data/beerwords_v3.dict')
corpus = corpora.MmCorpus('../../data/beercorpus_v3.mm')

In [15]:
print str(datetime.now())
beer_model_multi =  models.LdaMulticore([corpus[2885]], id2word=dictionary, num_topics=200, workers=3)
print str(datetime.now())

2017-08-10 00:42:34.379000
2017-08-10 00:45:13.032000


In [5]:
print str(datetime.now())
beer_model =  models.LdaModel([corpus[2885]], id2word=dictionary, num_topics=200)
print str(datetime.now())

2017-08-10 00:25:55.753000
2017-08-10 00:30:38.539000


In [16]:
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

vis = pyLDAvis.gensim.prepare(beer_model_multi,[corpus[2885]], dictionary)

vis.display()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


AttributeError: 'PreparedData' object has no attribute 'display'

In [9]:
cur = conn.cursor()
corpus = []

for x in range(4611):
    if x%500==0:
        print x
        print str(datetime.now())
    cur.execute("""
    select b.id,
       b.name,
       r.text
    from wrk.beer b left join wrk.review r on b.id = r.beer_id
    where b.id={0}
    """.format(x+1))
    rows = cur.fetchall()
    reviews_clean = []
    for row in rows:
        if not row[2]:
            continue
        review = row[2].decode('utf-8')
        name = row[1]
        name_tokens = nltk.word_tokenize(name.lower())
        try:
            if detect(review)=='en':
                rev_clean = clean(review.lower(), name_tokens)
                reviews_clean.extend(rev_clean)
        except:
            continue
    corpus += [dictionary.doc2bow(reviews_clean)]
    
corpora.MmCorpus.serialize('../../data/beercorpus_v3.mm', corpus)

0
2017-08-08 00:34:24.676000
500
2017-08-08 01:22:38.817000
1000
2017-08-08 02:06:29.189000
1500
2017-08-08 02:58:29.450000
2000
2017-08-08 03:30:55.395000
2500
2017-08-08 04:21:36.843000
3000
2017-08-08 05:00:33.062000
3500
2017-08-08 05:55:52.559000
4000
2017-08-08 06:30:15.661000
4500
2017-08-08 07:01:50.972000


In [10]:
#corpus = corpora.MmCorpus('../../data/beercorpus_v3.mm')

print str(datetime.now())
beer_model =  models.LdaModel(corpus, id2word=dictionary, num_topics=200)
print str(datetime.now())

beer_model.save('../../data/beer_model_v2')

2017-08-08 08:50:33.090000
2017-08-08 08:57:00.064000


In [11]:
#Similarity
index = similarities.MatrixSimilarity(beer_model[corpus])
kernel_estout = beer_model[corpus[2853]]
like_kernel = sorted(list(enumerate(index[kernel_estout])), key = lambda x:x[1], reverse=True)

In [13]:
print like_kernel

AttributeError: 'list' object has no attribute 'show_topics'

In [11]:
print corpus[20]

[(0, 171.0), (1, 91.0), (2, 105.0), (3, 37.0), (4, 2.0), (5, 3.0), (6, 642.0), (7, 7.0), (8, 423.0), (9, 19.0), (10, 25.0), (11, 39.0), (12, 2.0), (13, 103.0), (14, 748.0), (15, 116.0), (16, 843.0), (17, 1183.0), (18, 118.0), (20, 162.0), (21, 226.0), (22, 3.0), (23, 1.0), (24, 53.0), (25, 98.0), (26, 93.0), (27, 5.0), (29, 17.0), (30, 237.0), (31, 2.0), (32, 4.0), (34, 12.0), (35, 77.0), (36, 299.0), (38, 5.0), (40, 254.0), (41, 104.0), (43, 17.0), (45, 307.0), (46, 60.0), (47, 7.0), (49, 31.0), (50, 1144.0), (51, 8.0), (53, 23.0), (54, 3.0), (55, 77.0), (56, 3.0), (57, 635.0), (58, 302.0), (59, 9.0), (60, 872.0), (61, 410.0), (62, 21.0), (63, 91.0), (64, 23.0), (65, 4.0), (68, 5.0), (69, 16.0), (70, 161.0), (71, 20.0), (73, 12.0), (74, 177.0), (75, 24.0), (76, 45.0), (78, 137.0), (79, 8.0), (80, 4.0), (82, 68.0), (83, 7.0), (84, 107.0), (85, 5.0), (86, 586.0), (87, 10.0), (88, 72.0), (89, 15.0), (91, 14.0), (92, 8.0), (93, 10.0), (94, 249.0), (96, 24.0), (97, 2.0), (99, 227.0), (100,