In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import string
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import os
#print(os.listdir("../input"))

# Plotly based imports for visualization
from plotly import tools
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

In [2]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

In [3]:
!python -m spacy download en_core_web_lg


[93m    Linking successful[0m
    /Users/matthewrallison/labenv/lib/python3.6/site-packages/en_core_web_lg
    -->
    /Users/matthewrallison/labenv/lib/python3.6/site-packages/spacy/data/en_core_web_lg

    You can now load the model via spacy.load('en_core_web_lg')



In [4]:
wines = pd.read_csv('/Users/matthewrallison/SAGE/prodigy_test2/wine-reviews/wine1.csv')

In [5]:
nlp = spacy.load('en_core_web_lg')

In [6]:
punctuations = string.punctuation
stopwords = list(STOP_WORDS)

In [8]:
doc = nlp(wines["description"][0])
spacy.displacy.render(doc, style='ent',jupyter=True)

In [9]:
review = str(" ".join([i.lemma_ for i in doc]))

In [10]:
doc = nlp(review)
spacy.displacy.render(doc, style='ent',jupyter=True)

In [27]:
parser = English()
def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    mytokens = " ".join([i for i in mytokens])
    return mytokens

In [12]:
tqdm.pandas()
wines["processed_description"] = wines["description"].progress_apply(spacy_tokenizer)

100%|██████████| 129971/129971 [06:40<00:00, 324.82it/s]


In [13]:
wines['description'][0]

"Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity."

In [14]:
wines['processed_description'][0]

'aromas include tropical fruit broom brimstone dry herb palate overly expressive offer unripened apple citrus dry sage alongside brisk acidity'

In [21]:
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(wines["processed_description"])

In [32]:
data_vectorized[0]

<1x9710 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [22]:
NUM_TOPICS = 3

In [23]:
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=1, learning_method='online',verbose=True)
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 1


In [24]:
def selected_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]]) 

In [25]:
print("LDA Model:")
selected_topics(lda, vectorizer)

LDA Model:
Topic 0:
[('flavor', 21044.852454989836), ('finish', 18010.231058813675), ('palate', 17510.486804853554), ('apple', 14118.11293596146), ('aroma', 13464.744024165831), ('white', 12538.492861058587), ('acidity', 11877.260183590422), ('fruit', 11331.514358097931), ('note', 11075.089283966603), ('citrus', 10130.53477476043)]
Topic 1:
[('wine', 62469.090974977946), ('fruit', 33851.21062905819), ('drink', 23233.578714212905), ('acidity', 19930.20145272179), ('flavor', 17430.55543722532), ('ripe', 17303.22503705121), ('texture', 15328.626858260126), ('rich', 14925.870937593612), ('age', 14038.407488125755), ('good', 13983.031974615924)]
Topic 2:
[('cherry', 33691.75710633292), ('flavor', 30026.088506855904), ('black', 24497.002046341473), ('aroma', 23484.704775947048), ('finish', 21468.94344757453), ('tannin', 21185.227014077107), ('palate', 20931.83644255391), ('fruit', 18894.905330146867), ('spice', 16387.665069280996), ('berry', 16093.338527009775)]


In [26]:
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
dash

In [28]:
text = spacy_tokenizer("red")
print(lda.transform(vectorizer.transform([text])))

[[0.16670595 0.20759528 0.62569877]]


In [29]:
text = spacy_tokenizer("citrus")
print(lda.transform(vectorizer.transform([text])))


[[0.66318448 0.17014878 0.16666674]]
