In [1]:
#!jupyter nbconvert --to script review_subset_generator_20180305.ipynb

[NbConvertApp] Converting notebook review_subset_generator_20180305.ipynb to script
[NbConvertApp] Writing 16568 bytes to review_subset_generator_20180305.py


In [7]:
import codecs
import pandas as pd
import itertools as it

import re

import fileinput

from pandas import DataFrame
import os

import spacy
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

import en_core_web_sm
nlp = spacy.load('en')

In [8]:
#stars_filepath = 'review_stars_rest_subset.txt'
review_txt_filepath = 'review_text_rest_subset.txt'
business_filepath = 'review_business_rest_subset.txt'
#user_filepath = 'review_user_rest_subset.txt'

In [9]:
def line_review(filename):
    """
    Lazily stream the reviews stored in ``filename`` (one per line,
    decoded as UTF-8), turning every escaped ``\\n`` sequence back
    into a real line break before yielding the review.
    """

    with codecs.open(filename, encoding='utf_8') as review_file:
        yield from (raw_line.replace('\\n', '\n') for raw_line in review_file)

In [10]:
def get_data(filepath,review_number):
    """
    Return the single review found at zero-based position
    ``review_number`` in the reviews file at ``filepath``.

    Raises IndexError when the file has no review at that position.
    """

    following_index = review_number + 1
    selected = it.islice(line_review(filepath), review_number, following_index)
    return list(selected)[0]

Make collapsed business text file

In [11]:
all_bus_filepath = "collapsed_business_rest_subset.txt"

In [12]:
all_test_filepath = "collapsed_business_rest_subset_test.txt"

Set up and define helper functions for NLP

In [13]:
#helper functions from modern nlp in python
def punct_space(token):
    """
    Tell whether a token should be dropped because it consists
    only of punctuation or only of whitespace.
    """

    is_noise = token.is_punct or token.is_space
    return is_noise

def line_review(filename):
    """
    Generator over the reviews in ``filename`` (UTF-8, one review per
    line) with the escaped line breaks restored.

    This repeats the definition given earlier in the notebook.
    """

    with codecs.open(filename, encoding='utf_8') as review_file:
        for escaped in review_file:
            restored = escaped.replace('\\n', '\n')
            yield restored
            
def lemmatized_sentence_corpus(filename):
    """
    Parse every review in ``filename`` with spaCy and yield one string
    per sentence: the sentence's lemmas joined by single spaces, with
    punctuation-only and whitespace-only tokens left out.
    """

    reviews = line_review(filename)

    for doc in nlp.pipe(reviews, batch_size=10000, n_threads=4):
        for sentence in doc.sents:
            lemmas = [tok.lemma_ for tok in sentence if not punct_space(tok)]
            yield u' '.join(lemmas)

In [14]:
def get_sample_review(review_txt_filepath, review_number):
    """
    Fetch the review at zero-based position ``review_number`` from the
    reviews file ``review_txt_filepath``.

    Raises IndexError when the file has no review at that position.
    """

    all_reviews = line_review(review_txt_filepath)
    matching = list(it.islice(all_reviews, review_number, review_number + 1))
    return matching[0]

In [15]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
import _pickle as pickle

In [16]:
lda_model_filepath = 'lda_model_eat_30'
trigram_dictionary_filepath = 'trigram_dict_eat_30.dict'
trigram_model_filepath = 'trigram_model_all_eat_30'
bigram_model_filepath = 'bigram_model_all_eat_30'

In [17]:
lda = LdaMulticore.load(lda_model_filepath)
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)
trigram_model = Phrases.load(trigram_model_filepath)
bigram_model = Phrases.load(bigram_model_filepath)

In [18]:
# One row per topic number 0-49. Later cells left-merge the per-review topic
# weights onto this frame so that every topic is present in the output, with
# missing topics filled in as zero.
# Built with a single constructor call: growing the frame with
# DataFrame.append copies it on every iteration, and that method was removed
# in pandas 2.0.
all_numbers = list(range(0,50))
df_all_numbers = pd.DataFrame({"topic_number": all_numbers})


In [19]:
all_bus_lda_filepath = "review_bus_lda.txt"

In [21]:
import csv

In [37]:
fileinput.close()

In [40]:
%%time
#function to loop through all businesses and convert review to 
with open('output_file_test.csv', 'w') as file:
    wr = csv.writer(file, dialect = 'excel')
    
    for line in fileinput.input([all_test_filepath]):
        # parse the review text with spaCy
        parsed_review = nlp(line)
 
        # lemmatize the text and remove punctuation and whitespace
        unigram_review = [token.lemma_ for token in parsed_review
                      if not punct_space(token)]
    
        # apply the first-order and secord-order phrase models
        bigram_review = bigram_model[unigram_review]
        trigram_review = trigram_model[bigram_review]
    
        # remove any remaining stopwords
        trigram_review = [term for term in trigram_review
                      if not term in spacy.lang.en.stop_words.STOP_WORDS]
    
        # create a bag-of-words representation
        review_bow = trigram_dictionary.doc2bow(trigram_review)
    
         # create an LDA representation
        review_lda = lda[review_bow]
    
        # sort with the most highly related topics first
        review_lda = sorted(review_lda, key=lambda review_lda: -review_lda[1])
        
        df = pd.DataFrame(columns=["topic_number", "freq"])

        for topic_number, freq in review_lda:
            df = df.append({
            "topic_number": topic_number,
            "freq":  round(freq, 4)
            }, ignore_index=True)
            #merge with complete topic list and replace na with zero
        df_full = pd.merge(df_all_numbers, df, how='left', on=['topic_number'])
        df_full = df_full.fillna(0)

        
        one_row = df_full['freq']
        
        out = one_row.values.tolist()
        #out.insert(0, bus)
        #out2 = str(out)
        #out3 = re.sub(r"[\[ | \]]", "", out2)
            
            #print(busi)
        wr.writerow(out)    
        #output_file.write(out3)
        #output_file.write('\n')
fileinput.close()



CPU times: user 5min 48s, sys: 2min 4s, total: 7min 52s
Wall time: 1min 52s


In [42]:
test_read = pd.io.parsers.read_csv('output_file_test_minimal_codecs.txt',sep=",")
len(test_read)

751

In [44]:
test_read = pd.io.parsers.read_csv('output_file_test_minimal2.csv',sep=",")
len(test_read)

751

In [45]:
test_read = pd.io.parsers.read_csv('output_file_test_minimal.csv',sep=",")
len(test_read)

740

In [31]:
one_row = df_full['freq']
one_row

0     0.0000
1     0.0000
2     0.0430
3     0.0123
4     0.0000
5     0.0187
6     0.0000
7     0.0121
8     0.0000
9     0.0178
10    0.0000
11    0.0189
12    0.0000
13    0.0102
14    0.0560
15    0.0000
16    0.3007
17    0.0000
18    0.0000
19    0.0000
20    0.0281
21    0.0000
22    0.0000
23    0.0315
24    0.0349
25    0.0000
26    0.0000
27    0.0000
28    0.0000
29    0.0702
30    0.0000
31    0.0274
32    0.0000
33    0.0229
34    0.0000
35    0.0351
36    0.0000
37    0.0508
38    0.0253
39    0.0203
40    0.0000
41    0.0000
42    0.0284
43    0.0000
44    0.0347
45    0.0000
46    0.0000
47    0.0165
48    0.0000
49    0.0000
Name: freq, dtype: float64

In [32]:
out = one_row.values.tolist()
out

[0.0,
 0.0,
 0.0430000014603138,
 0.012299999594688416,
 0.0,
 0.018699999898672104,
 0.0,
 0.01209999993443489,
 0.0,
 0.017799999564886093,
 0.0,
 0.01889999955892563,
 0.0,
 0.010200000368058681,
 0.0560000017285347,
 0.0,
 0.30070000886917114,
 0.0,
 0.0,
 0.0,
 0.02810000069439411,
 0.0,
 0.0,
 0.03150000050663948,
 0.0348999984562397,
 0.0,
 0.0,
 0.0,
 0.0,
 0.07020000368356705,
 0.0,
 0.027400000020861626,
 0.0,
 0.02290000021457672,
 0.0,
 0.035100001841783524,
 0.0,
 0.05079999938607216,
 0.025299999862909317,
 0.0203000009059906,
 0.0,
 0.0,
 0.0284000001847744,
 0.0,
 0.034699998795986176,
 0.0,
 0.0,
 0.016499999910593033,
 0.0,
 0.0]

In [62]:
import csv
# Scratch check: write the current topic-weight list `out` twice to
# output.csv to confirm that csv.writer emits one row per list.
RESULT = out
# newline='' is what the csv module expects of the file object; without it the
# 'excel' dialect's \r\n row terminator is written as \r\r\n on Windows.
with open("output.csv",'w', newline='') as resultFile:
    wr = csv.writer(resultFile, dialect='excel')
    wr.writerow(RESULT)
    wr.writerow(RESULT)

In [61]:
isinstance(out, list)

True

In [56]:
pivoted

Unnamed: 0_level_0,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq
topic_number,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
bus,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
9yxHYyXMHjh7LpoolJiH9w,0.0,0.0673,0.0,0.0,0.0574,0.0,0.0,0.096,0.0,0.0572,...,0.0,0.0809,0.0243,0.0,0.0577,0.0,0.0,0.0,0.0,0.0


In [66]:
test_read = pd.io.parsers.read_csv('output_file_test.csv',sep=",")
len(test_read)

9

In [45]:
test_read

Unnamed: 0,[
0,'
1,M
2,a
3,I
4,_
5,E
6,e
7,v
8,M
9,b


In [None]:
%%time
# Draft of the per-business export: for the first 10 entries of all_bus,
# compute the LDA topic weights and write one comma-separated line
# (identifier from bus_set followed by the weights) to output_file.
# NOTE(review): output_file, all_bus and bus_set are not defined in the
# visible part of this notebook; this cell needs all three to exist already.
# NOTE(review): the cell has no execution count, so it may never have been
# run in this form.
if 1 == 1:
    for busi in list(range(0,10)):
        # parse the review text with spaCy
        parsed_review = nlp(all_bus[busi])
 
        # lemmatize the text and remove punctuation and whitespace
        unigram_review = [token.lemma_ for token in parsed_review
                      if not punct_space(token)]
    
        # apply the first-order and second-order phrase models
        bigram_review = bigram_model[unigram_review]
        trigram_review = trigram_model[bigram_review]
    
        # remove any remaining stopwords
        trigram_review = [term for term in trigram_review
                      if not term in spacy.lang.en.stop_words.STOP_WORDS]
    
        # create a bag-of-words representation
        review_bow = trigram_dictionary.doc2bow(trigram_review)
    
         # create an LDA representation
        review_lda = lda[review_bow]
    
        # sort with the most highly related topics first
        review_lda = sorted(review_lda, key=lambda review_lda: -review_lda[1])
        
        # pick the busi-th element of bus_set and join its parts into one string
        # (the list over bus_set is rebuilt on every iteration)
        bus = ''.join([list(x) for x in bus_set][busi])

        df = pd.DataFrame(columns=["topic_number", "freq"])

        # one row per (topic, weight) pair, weight rounded to 4 decimals
        for topic_number, freq in review_lda:
            df = df.append({
            "topic_number": topic_number,
            "freq":  round(freq, 4)
            }, ignore_index=True)
            #merge with complete topic list and replace na with zero
        df_full = pd.merge(df_all_numbers, df, how='left', on=['topic_number'])
        df_full = df_full.fillna(0)
        
        df_full['bus'] = bus

        # reshape to a single row indexed by bus with one column per topic
        pivoted = df_full.pivot('bus', 'topic_number')
        
        one_row = pivoted.iloc[0]
        
        out = one_row.values.tolist()
        out.insert(0, bus)
        out2 = str(out)
        # strip the list brackets, spaces and any '|' from the list repr,
        # leaving a comma-separated line
        out3 = re.sub(r"[\[ | \]]", "", out2)
            
            #print(busi)
           
        output_file.write(out3)
        output_file.write('\n')



In [33]:
list(range(0,1000))

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [29]:
test_read = pd.io.parsers.read_csv('output_file.csv',sep=",")
len(test_read)

365557

In [None]:
test_read = pd.io.parsers.read_csv(all_bus_lda_filepath,sep=",")
len(test_read)

In [31]:
test_read.shape

(365557, 1)

In [30]:
%%time
#function to loop through all businesses and convert review to 
if 1 == 1:
    with open(all_bus_lda_filepath, 'w') as f:
        for busi in list(range(0,500)):
            # parse the review text with spaCy
            parsed_review = nlp(all_bus[busi])
 
            # lemmatize the text and remove punctuation and whitespace
            unigram_review = [token.lemma_ for token in parsed_review
                      if not punct_space(token)]
    
            # apply the first-order and secord-order phrase models
            bigram_review = bigram_model[unigram_review]
            trigram_review = trigram_model[bigram_review]
    
            # remove any remaining stopwords
            trigram_review = [term for term in trigram_review
                      if not term in spacy.lang.en.stop_words.STOP_WORDS]
    
            # create a bag-of-words representation
            review_bow = trigram_dictionary.doc2bow(trigram_review)
    
            # create an LDA representation
            review_lda = lda[review_bow]
    
            # sort with the most highly related topics first
            review_lda = sorted(review_lda, key=lambda review_lda: -review_lda[1])
        
            bus = ''.join([list(x) for x in bus_set][busi])

            df = pd.DataFrame(columns=["topic_number", "freq"])

            for topic_number, freq in review_lda:
                df = df.append({
                "topic_number": topic_number,
                "freq":  round(freq, 4)
                }, ignore_index=True)
            #merge with complete topic list and replace na with zero
            df_full = pd.merge(df_all_numbers, df, how='left', on=['topic_number'])
            df_full = df_full.fillna(0)
        
            df_full['bus'] = bus

            pivoted = df_full.pivot('bus', 'topic_number')
        
            one_row = pivoted.iloc[0]
        
            out = one_row.values.tolist()
            out.insert(0, bus)
            out2 = str(out)
            out3 = re.sub(r"[\[ | \]]", "", out2)
            
            #print(busi)
           
            f.write(out3)
            f.write('\n')



CPU times: user 25min 33s, sys: 14min 32s, total: 40min 6s
Wall time: 7min 53s


In [None]:
test_read = pd.io.parsers.read_csv(all_bus_lda_filepath,sep=",")
len(test_read)

499

In [35]:
50000/500

100.0

In [None]:
# Append the remaining entries of all_bus to all_bus_lda_filepath in rounds of
# 500, using the same line format as the first-500 export: identifier from
# bus_set followed by the weight of every topic listed in df_all_numbers.
# The file is reopened in append mode for each round, so completed rounds are
# on disk before the next one starts; the round number is printed as progress.

# Materialise bus_set once; rebuilding the full list inside the loop made
# every iteration cost a pass over all of bus_set.
bus_rows = list(bus_set)
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# NOTE(review): range(1, 99) ends with round 98, i.e. index 49499. If all_bus
# holds 50000 entries (as the 50000/500 check in this notebook suggests), the
# final round of 500 is never processed - confirm the intended upper bound.
for rd in range(1, 99):
    st = rd * 500
    ed = (rd + 1) * 500
    with open(all_bus_lda_filepath, 'a') as f:
        for busi in range(st, ed):
            # parse the review text with spaCy
            parsed_review = nlp(all_bus[busi])

            # lemmatize the text and remove punctuation and whitespace
            unigram_review = [token.lemma_ for token in parsed_review
                              if not punct_space(token)]

            # apply the first-order and second-order phrase models
            bigram_review = bigram_model[unigram_review]
            trigram_review = trigram_model[bigram_review]

            # remove any remaining stopwords
            trigram_review = [term for term in trigram_review
                              if term not in stop_words]

            # create a bag-of-words representation
            review_bow = trigram_dictionary.doc2bow(trigram_review)

            # create an LDA representation
            review_lda = lda[review_bow]

            # sort with the most highly related topics first
            review_lda = sorted(review_lda, key=lambda topic_freq: -topic_freq[1])

            bus = ''.join(bus_rows[busi])

            # Build the per-business frame in one call instead of appending a
            # row per topic (DataFrame.append was removed in pandas 2.0).
            df = pd.DataFrame(
                [{"topic_number": int(topic_number),
                  "freq": float(round(freq, 4))}
                 for topic_number, freq in review_lda],
                columns=["topic_number", "freq"])

            # merge with complete topic list and replace na with zero
            df_full = pd.merge(df_all_numbers, df, how='left', on=['topic_number'])
            df_full = df_full.fillna(0)

            df_full['bus'] = bus

            # keyword arguments: positional pivot arguments are rejected by
            # pandas 2.0 and later
            pivoted = df_full.pivot(index='bus', columns='topic_number')

            one_row = pivoted.iloc[0]

            out = one_row.values.tolist()
            out.insert(0, bus)
            out2 = str(out)
            # strip the list brackets, spaces and any '|' from the list repr;
            # the identifier keeps its surrounding quotes
            out3 = re.sub(r"[\[ | \]]", "", out2)

            f.write(out3)
            f.write('\n')

    print(rd)



In [40]:
rd = 3
st = rd * 500
ed = (rd + 1) * 500
print(rd, st, ed)

3 1500 2000


In [124]:
import re
out3 = re.sub(r"[\[ | \]]", "", out2)

In [125]:
out3



"'ToNd6fEn_SvcQc1Fulsidg',0.09790000319480896,0.0,0.0,0.0,0.21660000085830688,0.0,0.10610000044107437,0.016300000250339508,0.0,0.011699999682605267,0.014100000262260437,0.0,0.0,0.0,0.014700000174343586,0.0,0.01209999993443489,0.032999999821186066,0.0,0.0,0.0,0.0,0.0,0.020999999716877937,0.010400000028312206,0.0,0.021900000050663948,0.0,0.0,0.052000001072883606,0.0,0.03269999846816063,0.0,0.013899999670684338,0.0,0.0,0.0,0.04740000143647194,0.0,0.0,0.0,0.15690000355243683,0.0,0.0,0.016100000590085983,0.0,0.0,0.019200000911951065,0.0,0.0"

In [96]:
out2 = str(out)
''.join(out2)

"['1ylA7yyrMMUX1zcu5EqO4Q', 0.0, 0.0, 0.0430000014603138, 0.012299999594688416, 0.0, 0.018699999898672104, 0.0, 0.01209999993443489, 0.0, 0.017799999564886093, 0.0, 0.01889999955892563, 0.0, 0.010200000368058681, 0.0560000017285347, 0.0, 0.30070000886917114, 0.0, 0.0, 0.0, 0.02810000069439411, 0.0, 0.0, 0.03150000050663948, 0.0348999984562397, 0.0, 0.0, 0.0, 0.0, 0.07020000368356705, 0.0, 0.027400000020861626, 0.0, 0.02290000021457672, 0.0, 0.035100001841783524, 0.0, 0.05079999938607216, 0.025299999862909317, 0.0203000009059906, 0.0, 0.0, 0.0284000001847744, 0.0, 0.034699998795986176, 0.0, 0.0, 0.016499999910593033, 0.0, 0.0]"

In [81]:
# Step-by-step walkthrough of the export pipeline for one entry of all_bus
# (index 0), kept as separate variables so each stage can be inspected.
busi = 0
parsed_review = nlp(all_bus[busi])
# lemmatize and drop punctuation / whitespace tokens
unigram_review = [token.lemma_ for token in parsed_review
                  if not punct_space(token)]
# apply the first-order and second-order phrase models
bigram_review = bigram_model[unigram_review]
trigram_review = trigram_model[bigram_review]
# remove any remaining stopwords
trigram_review = [term for term in trigram_review
                  if term not in spacy.lang.en.stop_words.STOP_WORDS]
review_bow = trigram_dictionary.doc2bow(trigram_review)
review_lda = lda[review_bow]

# pick the busi-th element of bus_set and join its parts into one string
bus = ''.join([list(x) for x in bus_set][busi])

# Build the frame in one call instead of appending a row per topic
# (DataFrame.append was removed in pandas 2.0).
df = pd.DataFrame(
    [{"topic_number": int(topic_number), "freq": float(round(freq, 4))}
     for topic_number, freq in review_lda],
    columns=["topic_number", "freq"])

#merge with complete topic list and replace na with zero
df_full = pd.merge(df_all_numbers, df, how='left', on=['topic_number'])
df_full = df_full.fillna(0)

df_full['bus'] = bus

# keyword arguments: positional pivot arguments are rejected by pandas 2.0
# and later
pivoted = df_full.pivot(index='bus', columns='topic_number')



In [82]:
pivoted

Unnamed: 0_level_0,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq
topic_number,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
bus,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1ylA7yyrMMUX1zcu5EqO4Q,0.0,0.0,0.0,0.0624,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0171,0.0269,0.0,0.0592,0.0886,0.0,0.1216


In [83]:
one_row = pivoted.iloc[0]

In [84]:
out = one_row.values.tolist()
out.insert(0, bus)
print(out)

['1ylA7yyrMMUX1zcu5EqO4Q', 0.0, 0.0, 0.0, 0.06239999830722809, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02630000002682209, 0.0674000009894371, 0.0, 0.023099999874830246, 0.0, 0.0, 0.0, 0.0, 0.0, 0.11969999969005585, 0.0, 0.21979999542236328, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.06069999933242798, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.022199999541044235, 0.07370000332593918, 0.0, 0.0, 0.0, 0.017100000753998756, 0.026900000870227814, 0.0, 0.05920000001788139, 0.08860000222921371, 0.0, 0.12160000205039978]


In [85]:
all_bus[1]

"This place is so good. I work nearby and come here often for lunch.  Their menu is 7.49 for one veg & one meat & rice or noodles, 7.99 for two veg & one meat & rice or noodles, 8.49 for one veg & two meat & rice or noodles. They also offer salads (7.49 I believe).  The rice is amazing. They have a few different kinds, but I have only tried the Jambalayan kind. It's sort of spicy but SO delicious. It is, in fact, so good, I am not willing to try the other kinds. Instead of rice, you can also get noodles.  Their vegetables include home fries, corn, mixed veggies, zucchini, mashed potatoes, and a few others. I have tried the home fries, mixed veggies, and zucchini and they are all good, but I personally like the home fries the most.  Their meats change on a daily basis, but they normally have bourbon chicken, blackened chicken, honey glazed chicken, spicy beef, rainbow shrimp, a type of fish (I forget), and a few other chicken flavours. The bourbon chicken is amazing. Ask for extra sauce

In [103]:
''.join([list(x) for x in bus_set][1])

'DGwDXazeFcD7DByweszpFA'