# Dynamic Topic Models Project on Amazon music data
# Soomin Cho, Blair Alexander


In [7]:
import logging
import os
from gensim import corpora, utils
from gensim.models.wrappers.dtmmodel import DtmModel
import numpy as np
import pandas as pd
import gzip




In [8]:
def parse(path):
    """Yield one parsed record per non-blank line of a gzip-compressed file.

    Each line is expected to hold a single Python-style literal
    (presumably one review dict per line -- confirm against the data file).

    Args:
        path: Path to the gzip-compressed file.

    Yields:
        The value obtained by parsing each non-blank line.

    Raises:
        ValueError, SyntaxError: If a line is not a plain literal.
    """
    # Function-scope import so the block does not rely on a new file-level import.
    import ast

    # The context manager guarantees the handle is closed, including when the
    # consumer stops iterating early or a line fails to parse.
    with gzip.open(path, 'rt', encoding='utf-8') as handle:
        for line in handle:
            text = line.strip()
            if not text:
                # A blank (e.g. trailing) line carries no record.
                continue
            # SECURITY: eval() would execute arbitrary expressions found in the
            # data file; literal_eval accepts literals only (dicts, lists,
            # strings, numbers, ...) and rejects everything else.
            yield ast.literal_eval(text)

def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the stream, so each
    record becomes one row whose index is that position.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')


# Relative path of the gzip-compressed review file to analyse.
review_file = "../../../data/amazon_reviews/dm/reviews_Digital_Music_h10k.json.gz"

# Read all records into a DataFrame, then display the review text column.
df = getDF(path=review_file)
print(df['reviewText'])

0       The anthemic title track begins &quot;The Memo...
1       Just when I thought Enya couldn't possibly get...
2       Nice CD for easy listening.  My husband and I ...
3       I really liked this CD, especially the differe...
4       Enya's richly chorded style has smitten me onc...
5       It's hard to believe "Memory of Trees" came ou...
6       I am an enya fan and like her cd's. on this cd...
7       I'm not a huge, know it all Enya fan.  But I d...
8       This is my very favorite Enya album to date. E...
9       I love, love love love love The Memory of Tree...
10      I only recently discovered this album by Enya,...
11      Ok, first off, Enya rocks.  If you don't know ...
12      A clasically-styled and introverted album, Mem...
13      Okay, so I'm spreading my #1 rating around.  &...
14      I never thought Enya would reach the sublime h...
15      Certainly of very good sound quality.  A coupl...
16      THE MEMORY OF TREES, combines esoteric grandeu...
17      I love

In [None]:
# Preview the first few rows of the review DataFrame to check its columns.
df.head()

## Data preprocessing (tokenize, remove stopwords, etc.)


In [9]:
import nltk
import string
from nltk.tree import Tree
stopwords_list = nltk.corpus.stopwords.words('english')
# Set copy of the stopwords: the membership test below runs once per token,
# and a set lookup avoids scanning the whole list every time.
stopwords_set = set(stopwords_list)
# Create a list for the tokenized sentences:
tok_sentences = list()
# Create a list for the tokenized reviews:
tok_reviews = list()
# Create a list for the sentence assigned POS tags:
pos_sentences = list()
# Create a translation table for removing the punctuation marks:
translator = str.maketrans('', '', string.punctuation)

# Flat list of every kept token across all reviews.
all_words = list()
r_count = 0
for r_count, review in enumerate(df['reviewText'], start=1):
    # Progress indicator: print the running count every 1000 reviews.
    if r_count % 1000 == 0:
        print(r_count)
    review_words = list()
    for sentence in nltk.sent_tokenize(review):
        sent_words = nltk.word_tokenize(sentence)
        # Keep purely alphabetic tokens that are not stopwords, lower-cased.
        sent_words_tok = [
            word.lower()
            for word in sent_words
            if word.isalpha() and word.lower() not in stopwords_set
        ]
        tok_sentences.append(sent_words_tok)
        review_words.extend(sent_words_tok)
    all_words.extend(review_words)
    tok_reviews.append(review_words)


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000


In [16]:
# Determine the span of years covered by the documents.
# The year is the third whitespace-separated token of each reviewTime value.
years = [int(review_time.split()[2]) for review_time in df['reviewTime']]
years_argsort = np.argsort(years)

earliest_year = np.min(years)
latest_year = np.max(years)
print(earliest_year)
print(latest_year)

1998
2014


In [22]:
# Group the tokenized reviews by year: year -> list of token lists.
from collections import defaultdict
review_year_dict = defaultdict(list)
for review_year, review_tokens in zip(years, tok_reviews):
    bucket = review_year_dict[review_year]
    bucket.append(review_tokens)




In [23]:
# Inspect the tokenized reviews grouped under 1998, the earliest year in the data.
print(review_year_dict[1998])

[['quot', 'offers', 'enya', 'heart', 'selections', 'special', 'meaning', 'due', 'loss', 'one', 'special', 'really', 'read', 'words', 'way', 'home', 'see', 'thing', 'heart', 'one', 'love', 'concerned', 'rather', 'different', 'type', 'composition', 'enya', 'offered', 'quot', 'anywhere', 'quot', 'obligatory', 'swing', 'choirs', 'future', 'la', 'sonadora', 'really', 'moving', 'amp', 'spiritual', 'great', 'tracks', 'cd'], ['enya', 'fan', 'sure', 'cd', 'offers', 'nice', 'quot', 'mix', 'quot', 'lack', 'better', 'word', 'styles', 'quot', 'pax', 'deorum', 'quot', 'chilling', 'latin', 'refrains', 'quot', 'la', 'sonadora', 'quot', 'spanish', 'torreador', 'ballad', 'feeling', 'absolutely', 'beautiful', 'refrains', 'quot', 'athair', 'ar', 'seamh', 'quot', 'display', 'vocal', 'writing', 'talents', 'deny', 'quot', 'china', 'roses', 'quot', 'one', 'smooth', 'sweet', 'vocal', 'pieces', 'grace', 'cd', 'memory', 'trees', 'stellar', 'enya', 'disc'], ['bad', 'cars', 'lumped', 'classic', 'rock', 'zeppelin',

In [28]:
# Number of documents in each yearly time slice, in chronological order.
# This is the time-slice parameter handed to DTM.
# The span is derived from the parsed years (1998-2014 for this dataset)
# instead of being hard-coded, so it cannot drift from the data.
year_span = range(min(years), max(years) + 1) if years else range(0)
time_seq = [len(review_year_dict[year]) for year in year_span]

In [33]:
# Sort reviews by year: concatenate the per-year groups in chronological
# order. The span is derived from the parsed years (1998-2014 for this
# dataset) instead of being hard-coded.
sorted_reviews = []
for y in (range(min(years), max(years) + 1) if years else range(0)):
    sorted_reviews.extend(review_year_dict[y])


['enya', 'fan', 'sure', 'cd', 'offers', 'nice', 'quot', 'mix', 'quot', 'lack', 'better', 'word', 'styles', 'quot', 'pax', 'deorum', 'quot', 'chilling', 'latin', 'refrains', 'quot', 'la', 'sonadora', 'quot', 'spanish', 'torreador', 'ballad', 'feeling', 'absolutely', 'beautiful', 'refrains', 'quot', 'athair', 'ar', 'seamh', 'quot', 'display', 'vocal', 'writing', 'talents', 'deny', 'quot', 'china', 'roses', 'quot', 'one', 'smooth', 'sweet', 'vocal', 'pieces', 'grace', 'cd', 'memory', 'trees', 'stellar', 'enya', 'disc']


In [34]:
# Use Blei's dynamic topic modeling (DTM) package through gensim's wrapper.
class DTMcorpus(corpora.textcorpus.TextCorpus):
    """Corpus backed directly by an in-memory collection of documents.

    NOTE(review): presumably the TextCorpus constructor stores its argument
    as ``self.input`` (here, the year-sorted token lists) -- confirm against
    the gensim version in use.
    """

    def get_texts(self):
        """Return the stored input unchanged."""
        return self.input

    def __len__(self):
        """Return the number of items in the stored input."""
        return len(self.input)

# Wrap the year-sorted tokenized reviews so they can be passed to the DTM model.
corpus = DTMcorpus(sorted_reviews)

In [None]:
# Location of the DTM executable on this machine.
dtm_path = "C:/Users/UserAcc/AnacondaProjects/dtm-win64.exe"
# Fit a 10-topic dynamic topic model; time_seq gives the documents per year.
model = DtmModel(
    dtm_path,
    corpus=corpus,
    time_slices=time_seq,
    num_topics=10,
    id2word=corpus.dictionary,
    initialize_lda=True,
)

In [None]:
# Train the model and show its top topic words.
# dtm_path is assigned on its own line so this cell is self-contained
# (the assignment had been fused into the leading comment).
num_topics = 10
dtm_path = "C:/Users/UserAcc/AnacondaProjects/dtm-win64.exe"
model = DtmModel(dtm_path, corpus, time_seq, num_topics=num_topics,
                 id2word=corpus.dictionary, initialize_lda=True)

# Show the top 10 words of topic #1 for time slice 0 (1998).
# print() is required: a bare expression that is not the last statement of
# a cell is evaluated and then discarded.
print(model.show_topic(topicid=1, time=0, topn=10))

# Make a dictionary of all topics over all years:
# list_words[topic][time slice] -> list holding that slice's top-10 result.
list_words = defaultdict(lambda: defaultdict(list))
for topicno in range(num_topics):  # topics
    for y in range(len(time_seq)):  # one time slice per year
        list_words[topicno][y].append(model.show_topic(topicid=topicno, time=y, topn=10))

list_words