In [1]:
import time
load_start_time = time.time()

import csv
import nltk
import re
import numpy as np
import datetime

import gensim, logging
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
#from sklearn.cluster import KMeans
#from sklearn.decomposition import PCA

from scipy.spatial.distance import cdist, cosine
#from scipy.spatial import cKDTree

from gensim.models import Word2Vec

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from bokeh.io import curdoc
from bokeh.layouts import row, column
from bokeh.models import ColumnDataSource
from bokeh.models.widgets import Slider, TextInput, PreText, Select
from bokeh.plotting import figure





###load data
with open('./android/ebay/total_info.txt') as f:
    reader = csv.reader(f, delimiter="\t")
    d = list(reader)

ratings = []
reviews = []
titles = []
dates = []
versions = []

for line in d:
    vals = line[0].split("******")
    ratings.append(float(vals[0]))
    reviews.append(vals[1])
    dates.append(vals[2])
    versions.append(vals[3])
    
    
stop_words = stopwords.words('english')
def custom_preprocessor(text):
    porter = PorterStemmer()

    #split into sentences
    sentences = sent_tokenize(text)
    
    final_sentences = []
    
    for sentence in sentences:
        sentence_split = sentence.split(" ")
        
        #remove words in not in stop words, and make lowercase
        words = [word.lower() for word in sentence_split if word.lower() not in stop_words]
        #get rid of words with non alphanumeric characters in it
        #(should we replace these with a token?)
        words = [word for word in words if word.isalpha()]
        #stem words
        words = [porter.stem(word) for word in words]

        final_sentences.append(" ".join(words))
        
        #consider joining sentences with a stop token
    return " ".join(final_sentences), final_sentences, sentences


processed_reviews = []
processed_sentences = []
raw_sentences = []
for review in reviews:
    processed_review = custom_preprocessor(review) 
    processed_reviews.append(processed_review[0])
    processed_sentences.append(processed_review[1])
    raw_sentences.append(processed_review[2])
    
    
#get rid of reviews that are empty after preprocessing
#(not that many)

processed_review_lens = np.array([len(review) for review in [r.split(" ") for r in processed_reviews]])
#if using stop tokens "<END>" then empty reviews have a length of 6
nonzero_indeces = np.where(processed_review_lens > 0)


final_reviews =  [review.split(" ") for review in np.array(processed_reviews)[nonzero_indeces]]
final_reviews_unprocessed =  np.array(reviews)[nonzero_indeces]
final_ratings = np.array(ratings)[nonzero_indeces]
#final_titles = np.array(titles)[nonzero_indeces]
final_dates = np.array(dates)[nonzero_indeces]
unique_dates = np.unique(np.array(final_dates))
unique_date_indices = []
for date in unique_dates:
        date_indices = np.where(np.array(final_dates)==date)[0]
        unique_date_indices.append(date_indices)
final_versions = np.array(versions)[nonzero_indeces]




model = Word2Vec.load("../../large files/ebay.model")
#model = Word2Vec(final_reviews, min_count=1)
#model.save("../../large files/youtube_w2v_stoptokens.model")








#here we create a vector for each review,
#which will be the simple average of all word vectors in that review.
#these vectors will then be used for clustering, data reduction, etc.
avg_vectors = []
for review in final_reviews:
    avg_vectors.append(np.mean([model.wv[word] for word in review], axis=0))
    
avg_vectors = np.array(avg_vectors)  


final_review_lengths = [len(review.split(" ")) for review in final_reviews_unprocessed]

#scaling 
scaler = StandardScaler()
avg_vectors_scaled = scaler.fit_transform(avg_vectors)
print(avg_vectors.shape)
print(avg_vectors_scaled.shape)

def find_relevant_reviews(key, avg_vectors, raw_text, n=10, scaled=False):
    
    indices = None
    distances = None
    
    key_processed = custom_preprocessor(key)[0]
    key_list = key_processed.split(' ')
    #filter to only those in the w2v model's covacbulary
    vocab = model.wv.vocab.keys()
    key_list_vocab_words = []
    key_list_nonvocab_words = []
    for word in key_list:
        if word in vocab:
            key_list_vocab_words.append(word)
        else:
            key_list_nonvocab_words.append(word)
    
    #only move on if the list isn't empty (i.e. if the key had no words in the vocab)
    if len(key_list_vocab_words) > 0:
        key_vector = np.mean([model.wv[word] for word in key_list_vocab_words if word in model.wv.vocab.keys()], axis=0)
        
        if scaled:
            key_vector = scaler.transform(np.array(key_vector).reshape(1,-1))

        distances = [1-cosine(key_vector, vector) for vector in avg_vectors]
        indices = np.argsort(distances)[-n:]
    else:
        print("Warning: none of the words in the query were in the model's vocabulary.")
    
    if len(key_list_nonvocab_words) > 0:
        print("Excluded words:", key_list_nonvocab_words)
        
    return indices, distances
    
def print_relevant_reviews(indices, distances):
    text = "Top reviews in selected timeframe:\n\n"
    for i in range(len(indices)):
        index = indices[-(i+1)]
        text += "'"+str(final_reviews_unprocessed[index])+"' (Rating: "+str(final_ratings[index])+", Cosine similarity: )"+str(round(distances[index], 4))+"\n"
        
    return text
    

    
def get_topic_evolution_data(distances, query, relevance_threshold = 0.8):
    
    
    mean_distances = []
    n_relevant_reviews = []
    percent_relevant_reviews= []
    mean_ratings = []
    
    for date_indices in unique_date_indices:
        date_distances = np.array(distances)[date_indices]
        mean_distances.append(np.mean(date_distances))
        
        relevant_indeces = np.where(date_distances > relevance_threshold)[0]
        n = len(relevant_indeces)
        percent = 100*(n/len(date_indices))

        n_relevant_reviews.append(n)
        percent_relevant_reviews.append(percent)
        
        date_ratings = np.array(final_ratings)[date_indices][relevant_indeces]
        mean_ratings.append(np.mean(date_ratings))
       
    return mean_distances

    

    
    
    
    
#initialize variables for gui
#interesting ones: 'fast forward button', 'crash', 'freeze', 'update'
query = 'i love this app'

indices, distances = find_relevant_reviews(query, avg_vectors_scaled, final_reviews_unprocessed, n=10, scaled=True)
mean_distances = get_topic_evolution_data(distances, query)



#indices, distances = find_relevant_reviews(query, avg_vectors, final_reviews_unprocessed, n=10, scaled=False)
#for i in range(len(indices)):
#    index = indices[-(i+1)]
#    print(final_reviews_unprocessed[index])
#    print("Rating:", final_ratings[index])
#    print("Cosine similarity:", round(distances[index], 4))
#    print('*******\n')
    
    
    



(35483, 100)
(35483, 100)


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [7]:
from bokeh.io import output_notebook
from bokeh.plotting import show

In [62]:
# Set up data

source = ColumnDataSource(data=dict(x=range(len(unique_dates)), y=mean_distances))



tools = 'pan,wheel_zoom,xbox_select,reset'

# Set up plot
plot = figure(plot_height=400, plot_width=1000, title="", tools=tools, active_drag="xbox_select")

plot.line('x', 'y', source=source, line_width=3, line_alpha=1, selection_color="red",)


# Set up widgets
query = TextInput(title="query", value='i love this app')
relevance_threshold = Slider(title="relevance threshold", value=0.8, start=0, end=1, step=0.01)
reviews = PreText(text='help', width=1000)














def update_data(attrname, old, new):
    

    # Get the current slider values
    q = query.value
    plot.title.text = q
    r = relevance_threshold.value
    
    #note - n=10 argument is dumb
    indices, distances = find_relevant_reviews(q, avg_vectors_scaled, final_reviews_unprocessed, n=10, scaled=True)
    mean_distances = get_topic_evolution_data(distances, q, r)

    source.data = dict(x=range(len(unique_dates)), y=mean_distances)
    
    

for w in [query, relevance_threshold]:
    w.on_change('value', update_data)


# Set up layouts and add to document

col1 = column(query, relevance_threshold, width=300)
col2 = column(plot, reviews, width=1000)
layout = row(col1, col2)

curdoc().add_root(layout)
curdoc().title = "Sliders"

In [64]:
def find_relevant_reviews(key, avg_vectors, raw_text, scaled=False):
    
    indices = None
    distances = None
    
    key_processed = custom_preprocessor(key)[0]
    key_list = key_processed.split(' ')
    #filter to only those in the w2v model's covacbulary
    vocab = model.wv.vocab.keys()
    key_list_vocab_words = []
    key_list_nonvocab_words = []
    for word in key_list:
        if word in vocab:
            key_list_vocab_words.append(word)
        else:
            key_list_nonvocab_words.append(word)
    
    #only move on if the list isn't empty (i.e. if the key had no words in the vocab)
    if len(key_list_vocab_words) > 0:
        key_vector = np.mean([model.wv[word] for word in key_list_vocab_words if word in model.wv.vocab.keys()], axis=0)
        
        if scaled:
            key_vector = scaler.transform(np.array(key_vector).reshape(1,-1))

        distances = [1-cosine(key_vector, vector) for vector in avg_vectors]
        indices = np.argsort(distances)
    else:
        print("Warning: none of the words in the query were in the model's vocabulary.")
    
    if len(key_list_nonvocab_words) > 0:
        print("Excluded words:", key_list_nonvocab_words)
        
    return indices, distances

In [100]:
def print_relevant_reviews(indices, distances, reviews, n=10):
    text = "Most relevant reviews in selected timeframe:\n\n"
    top_indices = indices[-n:]
    for i in range(len(top_indices)):
        index = top_indices[-i-1]
        distance = distances[index]
        text += "'"+str(reviews[index])+", Cosine similarity: "+str(round(distance, 4))+")\n"
        #text += "'"+str(final_reviews_unprocessed[index])+"' (Rating: "+str(final_ratings[index])+")\n"
        
    return text

In [30]:
distances

[0.3997596502304077,
 -0.5186205506324768,
 -0.09348415583372116,
 -0.17506615817546844,
 0.30920809507369995,
 -0.2802387773990631,
 0.6849067807197571,
 -0.020746588706970215,
 0.07899312674999237,
 0.37586653232574463,
 -0.24536308646202087,
 -0.02668139524757862,
 -0.10839027911424637,
 -0.4573737680912018,
 -0.2937600612640381,
 -0.2553093731403351,
 -0.23711340129375458,
 -0.46547257900238037,
 0.6306943297386169,
 0.3091528117656708,
 0.10335270315408707,
 0.31408900022506714,
 -0.24246886372566223,
 -0.046748433262109756,
 -0.3544546663761139,
 0.08533256500959396,
 -0.32829374074935913,
 0.19372153282165527,
 0.01108125876635313,
 0.02688611112535,
 -0.30430975556373596,
 -0.00948191899806261,
 0.21912455558776855,
 0.2758767008781433,
 -0.10417129844427109,
 0.03815525770187378,
 -0.2679691016674042,
 -0.23753537237644196,
 -0.21146878600120544,
 -0.6017494201660156,
 -0.3197554647922516,
 -0.009255565702915192,
 -0.037626560777425766,
 -0.12498345226049423,
 -0.4225340783596

In [88]:
indices[-n:]

array([ 9862,  6363,  6347, 19973, 19982, 18500, 11283,  6015,  8116,
       11075], dtype=int64)

In [None]:
0, -1
1, -2
2, -3

In [99]:
top_indices = indices[-n:]
for i in range(len(top_indices)):
    index = top_indices[-i-1]
    distance = distances[index]
    print(index, distance)

11075 0.8187554478645325
8116 0.8139020800590515
6015 0.8119716048240662
11283 0.801028311252594
18500 0.7989060878753662
19982 0.7701683640480042
19973 0.7701683640480042
6347 0.7630482316017151
6363 0.7630482316017151
9862 0.7590345740318298


In [89]:
print(final_reviews_unprocessed[x].shape)
print(len(distances))
print(len(indices))

(24923,)
24923
24923


In [101]:
n = 10
x = [item for sublist in unique_date_indices[2:200] for item in sublist]
indices, distances = find_relevant_reviews('freeze', avg_vectors_scaled[x], final_reviews_unprocessed[x], scaled=True)
print(print_relevant_reviews(indices, distances, final_reviews_unprocessed[x], n=10))

Most relevant reviews in selected timeframe:

'keep crash second after start up . it do run fine a long time ago . that update garbage kick me out of business . suck ., Cosine similarity: 0.8188)
'have far more problem since update then before . slow load and force close ., Cosine similarity: 0.8139)
'constantly sign out crash every 2 second basically you fuck it with the update today absolutely ruin it uninstalling it now . on galaxy note 3 ., Cosine similarity: 0.812)
'freeze up n force close often . but good app !, Cosine similarity: 0.801)
'everytime i try to load something , the darn app freeze and then crash my phone . oh joy ., Cosine similarity: 0.7989)
'run quite slow , have kick me out a few time for no reason ., Cosine similarity: 0.7702)
'run quite slow , have kick me out a few time for no reason ., Cosine similarity: 0.7702)
'bad connection every time i use it at sign in error ., Cosine similarity: 0.763)
'bad connection every time i use it at sign in error ., Cosine simil

In [8]:
show(layout)

You are generating standalone HTML/JS output, but trying to use real Python
callbacks (i.e. with on_change or on_event). This combination cannot work.

Only JavaScript callbacks may be used with standalone output. For more
information on JavaScript callbacks with Bokeh, see:

    http://bokeh.pydata.org/en/latest/docs/user_guide/interaction/callbacks.html

Alternatively, to use real Python callbacks, a Bokeh server application may
be used. For more information on building and running Bokeh applications, see:

    http://bokeh.pydata.org/en/latest/docs/user_guide/server.html

