In [1]:
import pandas as pd
import spacy
import ast
import re
from pprint import pprint
import json
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from sklearn import linear_model
from sklearn.feature_extraction.text import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, r2_score, mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from scipy import sparse
import nltk; nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yubozhang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
reviews = []
with open('scraped_reviews_spotify.txt', 'r') as f: #, encoding='utf-8'
    reviews = f.read()

reviews = re.sub('\]', '', reviews)
reviews = re.sub('\[', '', reviews)
reviews = re.sub('\n', '', reviews)
reviews = re.sub("'", '"', reviews)
reviews = re.sub('    ', ' ', reviews)
reviews = re.sub(' {', '{', reviews)
reviews = re.sub(' }', '}', reviews)
reviews = re.sub('{ ', '{', reviews)
reviews = re.sub('} ', '}', reviews)
reviews = re.sub('}  {', '}, {', reviews)
reviews = re.sub('}{', '}, {', reviews)
reviews = re.sub('id:', '"id":', reviews)
reviews = re.sub('userName:', '"userName":', reviews)
reviews = re.sub('userImage:', '"userImage":', reviews)
reviews = re.sub('date:', '"date":', reviews)
reviews = re.sub('url:', '"url":', reviews)
reviews = re.sub('score:', '"score":', reviews)
reviews = re.sub('title:', '"title":', reviews)
reviews = re.sub('text:', '"text":', reviews)
reviews = re.sub('replyDate:', '"replyDate":', reviews)
reviews = re.sub('replyText:', '"replyText":', reviews)
reviews = re.sub(': undefined', ': "undefined"', reviews)

In [3]:
review_strings = [r+'}' for r in reviews.split('},')]
review_strings[-1] = review_strings[-1][:-1]

In [4]:
review_list = []
for r in review_strings:
    try:
        review_list.append(json.loads(r))
    except:
        pass

In [5]:
raw_corpus = []
for i in range(1000): # change to larger number
    raw_corpus.append(review_list[i][u'text'])

In [6]:
data = raw_corpus

In [7]:
# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]
data = [re.sub("\"", "", sent) for sent in data]
data = [re.sub("_", "", sent) for sent in data]

data = [re.sub("good'", "", sent) for sent in data]
data = [re.sub("app", "", sent) for sent in data]
data = [re.sub("nice", "", sent) for sent in data]
pprint(data[1:10])


[u'Superb!',
 u'Too many ads! Blips of music with numerous ad interruptions, not even worth the effort!',
 u'I enjoyed it, but it doesnt actually let me listen to what I want to hear. Trying to get Gohans Anger theme, and giving me random songs, also, I dont want what Spotify gives me it, it freaking sucks.',
 u'It used to be an excellent . Now its constantly crashing down on my Note 8. Please fix the bug.',
 u'I like the  but it needs some dude Jupiter (Please)',
 u'Great , Ive been using it for 3 years, I highly recommend it\U0001f601',
 u'I have used spotify for years & when i use premium, no complaints but right now i do not have premium & when i try to listen to specific groups, it no longer stays just in the selected group, instead it pulls songs from any/all groups. I dont like that at all.. Whats the point in creating groups if u cant play from a specified group? It wasnt always like this.. Is this an error/bug? Will it be fixed? PLEASE!',
 u'Love the  think it could be cheaper

In [8]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str([sentence]), deacc=True)) 


In [9]:
data_words = list(sent_to_words(data))

# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) 
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
#print(trigram_mod[bigram_mod[data_words[0][:30]]])
#print(list(sent_to_words(data)))



In [10]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        if sent != []:
            doc = nlp(" ".join(sent)) 

            texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
 
    return texts_out

In [11]:
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
nlp = spacy.load('en', disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print(data_lemmatized)

[[u'good', u'iheartradio'], [u'superb'], [u'many', u'ad', u'blip', u'music', u'numerous', u'ad', u'interruption', u'even', u'worth', u'effort'], [u'enjoy', u'do', u'not', u'actually', u'let', u'listen', u'want', u'hear', u'try', u'get', u'gohan', u'anger', u'theme', u'give', u'random', u'song', u'also', u'do', u'not', u'want', u'spotify', u'give', u'freaking', u'suck'], [u'use', u'excellent', u'constantly', u'crash', u'note', u'please_fix', u'bug'], [u'need', u'dude', u'jupiter'], [u'great', u'have', u'use', u'year', u'highly', u'recommend'], [u'use', u'spotify', u'year', u'premium', u'complaint', u'right', u'premium', u'try', u'listen', u'specific', u'group', u'longer', u'stay', u'select', u'group', u'instead', u'pull', u'song', u'group', u'do', u'not', u'like', u'what', u's', u'point', u'create', u'group', u'can', u'not', u'play', u'specify', u'group', u'be', u'not', u'always', u'error', u'bug', u'fix'], [u'love', u'think', u'could', u'cheaper', u'still', u'great'], [u'good'], [u'go'

In [12]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus)

[[(0, 1), (1, 1)], [(2, 1)], [(3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1)], [(12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 2)], [(32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1)], [(39, 1), (40, 1), (41, 1)], [(38, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1)], [(15, 1), (23, 1), (24, 3), (26, 1), (27, 1), (30, 1), (32, 1), (38, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 5), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 2), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1)], [(42, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1)], [(0, 1)], [(9, 1), (74, 1), (75, 1)], [(23, 1), (76, 1), (77, 1), (78, 1)], [(26, 1), (71, 1), (79, 1), (80, 1), (81, 1)], [(0, 1), (82, 1)], [(26, 1), (42, 1), (73, 1), (79, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (8

In [13]:
id2word[1]

u'iheartradio'

In [14]:
# Human readable format of corpus (term-frequency)
#[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

In [27]:
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)


In [28]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  u'0.065*"love" + 0.053*"not" + 0.052*"song" + 0.052*"music" + 0.039*"spotify" + 0.035*"great" + 0.031*"do" + 0.026*"listen" + 0.022*"play" + 0.020*"playlist"'),
 (1,
  u'0.058*"crash" + 0.045*"update" + 0.039*"keep" + 0.029*"second" + 0.019*"new" + 0.017*"time" + 0.017*"say" + 0.016*"ever" + 0.015*"work" + 0.015*"day"'),
 (2,
  u'0.033*"amazing" + 0.030*"use" + 0.029*"awesome" + 0.027*"be" + 0.026*"pay" + 0.025*"have" + 0.018*"find" + 0.016*"thank" + 0.013*"download" + 0.012*"problem"'),
 (3,
  u'0.124*"good" + 0.071*"music" + 0.032*"would" + 0.026*"go" + 0.024*"really" + 0.018*"app" + 0.016*"free" + 0.014*"cool" + 0.012*"streaming" + 0.010*"worth"')]


In [29]:
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [30]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


('\nPerplexity: ', -6.413305381851651)
('\nCoherence Score: ', 0.4960519494995757)
