In [362]:
import numpy
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
import json
import random
import string
import math

Importing the dataset

In [363]:
with open('reviewSelected100.json') as f:
    data = json.loads("[" + 
        f.read().replace("}\n{", "},\n{") + 
    "]")

In [364]:
# Create a set of all businesses
business_set = set()
for review in data:
    business_set.add(review['business_id'])
print("Number of businesses: " + str(len(business_set)))

Number of businesses: 153


## Finding the Adjective Phrases of a Randomly Selected Business

Choose a random business

In [365]:
business_set = list(business_set)
selected_business_id = business_set[random.randrange(len(business_set))]

In [367]:
selected_business_id = "R4R7ttLXfKKWM0VEMoaW4w"
selected_business_id

'R4R7ttLXfKKWM0VEMoaW4w'

Extract out the adjective phrases from all the reviews belonging to this business

In [368]:
# Get all the reviews of the business
selected_business_reviews = []
for review in data:
    if review['business_id'] == selected_business_id:
        selected_business_reviews.append(review['text'])

Define the grammar rules for the structure of Adjective Phrases (ADJP), as well as other phrase types. 

In [369]:
# Final
grammar = ('''
    VP: { <TO> <VB> <PRP> <VB.?> | <TO> <VB> <VB.?>? <DT>? <JJ.?>? <NN.?>? }
    NC: { <IN>? ((<PRP.>|<DT>) <NN.?> <JJ.?>? <MD> <VB.?>) | <IN>? ((<DT>|<PRP.>) <JJ.?>? <NN.?> <VB.?> (<NN.?>|<VB.>)) | <IN>? <PRP> <VB.?> (<DT>|<PRP.?>) <NN.?> }
    PP: { (<IN> <DT>? <JJ.?>? <NN.?>) }
    ADJP: { (<IN> <DT>? <NN.?>) | (<PP> <IN> <NN.?>) | <RB.?>* <JJ.?> <PP> | <RB.?>* <JJ.?> <VP> | <RB.?>* <JJ.?> <NC> | (<RB.?>* <JJ.?> <,>? <CC>?)+ | (<JJ.?> <,>? <CC>?)+ | <JJ.?>* | <RB.?>+ <VB(G|N)> }
    ''')

# Preposition + noun | PP + noun | (adv) adj + PP | (adv) adj + VP | (adv) adj + NC | Adverb Adjective(s) | Adjective chain(s) | Adjective(s) | Some VBG/N

In [370]:
# preprocess review into pos tags for parsing
def preprocess_review_to_pos(text):
    # tokenize the review into sentences
    text = sent_tokenize(text)
    
    # tokenize the sentences into words
    tokens = []
    for sentence in text:
        tokens.append(word_tokenize(sentence))
    
    # generate the pos tags for each tokenized sentence
    results = []
    for lst in tokens:
        results.append(nltk.pos_tag(lst))
        
    return results # output is a list of lists of pos tags

In [371]:
chunkParser = nltk.RegexpParser(grammar)

Parse each review, analysing its phrase structure and extracting out only the adjective phrases

In [372]:
adjective_phrases = []

for review in selected_business_reviews:
    # preprocess the review text
    tagged_tokens = preprocess_review_to_pos(review)
    # parse the review
    for tags in tagged_tokens:
        tree = chunkParser.parse(tags)
        # go through the parse tree and append adjective phrases (ADJP)
        for subtree in tree.subtrees():
            if subtree.label() == "ADJP":
                adjective_phrase = []
                for leaf in subtree.leaves():  # each leaf is a tuple (token, pos_tag)
                    adjective_phrase.append(leaf[0]) # append the token
                adjective_phrase = " ".join(adjective_phrase) # merge the tokens of the phrase
                adjective_phrases.append(adjective_phrase)

In [373]:
adjective_phrases

['nicest',
 'very friendly',
 'various',
 'chill',
 'consistently good',
 'not rotated',
 'very clean',
 'friendly , and',
 'particular',
 'rewards',
 'Good',
 'frozen',
 'own',
 'officially yogurt',
 'fro-yo',
 'Last',
 'tasty first-time',
 'very friendly ,',
 'added',
 'inhale',
 'clean and tasty',
 'decent frozen',
 'helpful ,',
 'vast and plentiful',
 'First',
 'frozen',
 'very nice',
 'Clean and easy',
 'never otherwise thought',
 'first',
 'first',
 'Too exhausted',
 'great',
 'Very white and clean',
 'Enormous',
 'fresh',
 'so happy to be able',
 'friendly and',
 'short',
 'more to put',
 'neatest',
 'tiny',
 'own',
 'always great and most',
 'crazy busy and',
 'other',
 'really bad for the blonde',
 'great and',
 'such',
 'hard',
 'Best',
 'Friendly',
 'always clean and',
 'to-go ,',
 'full',
 'so unprofessional',
 'actually closed',
 'mediocre',
 'big',
 'friendly',
 'fresh',
 'high',
 'hopeful to find a great',
 'froyo',
 'few unfortunate',
 'great',
 'so many',
 'super frien

### Now, we need to determine which adjective phrases are indicative
We perform TF-IDF in order to determine which phrases are unique to the business
- Find the TF of each adjective phrase
- Create a biword/word index to store the Document Frequency of each biword/term
- Compute the TF-IDF of each adjective phrase

First, preprocess the entire dataset before performing TF-IDF

In [374]:
# Sentence tokenizer
def sentence_tokenize(text):
    return sent_tokenize(text)

In [375]:
# Tokenize the sentences -> words
def tokenize(sentences):
    result = []
    for sentence in sentences:
        result.append(word_tokenize(sentence))
    return result

In [376]:
# Lowercase all the text
def convert_to_lowercase(text):
    result = []
    for tokens in text:
        lowercase_tokens = []
        for token in tokens:
            lowercase_tokens.append(token.lower())
        result.append(lowercase_tokens)
    return result

In [377]:
# Remove punctuations except hyphens
def remove_punctuation(sentences):
    punctuation = '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'
    result = []
    for sentence in sentences:
        new_sentence = []
        for token in sentence:
            new_token = token.translate(str.maketrans('', '', punctuation))
            if len(new_token) > 0:
                new_sentence.append(new_token)
        if len(new_sentence) > 0:
            result.append(new_sentence)
    return result

In [378]:
# Remove stopwords
def remove_stopwords(sentences):
    result = []
    stopwords = nltk.corpus.stopwords.words('english')
    for sentence in sentences:
        filtered = []
        for word in sentence:
            if word not in stopwords:
                filtered.append(word)
        result.append(filtered)
    return result

In [379]:
# Stemming
def stemming(sentences):
    result = []
    ps = PorterStemmer()
    for sentence in sentences:
        result.append([ps.stem(word) for word in sentence])
    return result

In [380]:
# preprocessing
def preprocess(text):
    text = sentence_tokenize(text)
    text = tokenize(text)
    text = convert_to_lowercase(text)
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    text = stemming(text)
    return text
# Outputs a list of lists containing tokens

Create biword/uniword index: Dictionary with bigram/word as the key and the value being the set of businesses that contain that bigram/word

In [381]:
# Generate bigrams given an array of arrays of strings
def generate_bigrams(sentences):
    bigrams = []
    for sentence in sentences:
        for i in range(len(sentence) - 1):
            bigrams.append((sentence[i], sentence[i+1]))
    return bigrams

In [382]:
# Create the index
index = {}
for review in data:
    b_id = review['business_id']
    preprocessed_text = preprocess(review['text']) # Array of arrays of tokens
    bigrams = generate_bigrams(preprocessed_text)
    
    # index the unigrams
    for sentence in preprocessed_text:
        for word in sentence:
            try:
                index[word].add(b_id)
            except:
                index[word] = {b_id}
    # index the bigrams
    for bigram in bigrams:
        try:
            index[bigram].add(b_id)
        except:
            index[bigram] = {b_id}

In [383]:
# Since we don't need to list of business ids in the index, we convert it to the count of business ids
for key in index:
    index[key] = len(index[key])

In [384]:
index

{'mother': 56,
 'birthday': 72,
 'parti': 104,
 '102916': 1,
 'great': 153,
 'time': 153,
 'food': 136,
 'music': 90,
 'waiter': 64,
 'thank': 142,
 'lyle': 1,
 ('mother', 'birthday'): 1,
 ('birthday', 'parti'): 14,
 ('parti', '102916'): 1,
 ('great', 'time'): 44,
 ('food', 'music'): 3,
 ('music', 'waiter'): 1,
 ('waiter', 'great'): 5,
 ('thank', 'lyle'): 1,
 'good': 153,
 'korean': 17,
 'grill': 89,
 'near': 123,
 'eaton': 2,
 'centr': 10,
 'marin': 29,
 'got': 153,
 'beef': 89,
 'ox': 4,
 'liver': 8,
 'salmon': 41,
 'fish': 83,
 'fillet': 16,
 'chicken': 108,
 'pork': 56,
 'belli': 33,
 'bland': 86,
 'meh': 65,
 'realli': 153,
 'flavour': 39,
 'fun': 113,
 'place': 153,
 'eat': 136,
 'date': 93,
 'group': 111,
 'friend': 151,
 'even': 153,
 'alon': 85,
 'judgment': 11,
 'staff': 153,
 'attent': 126,
 'nice': 153,
 'consider': 37,
 'bigger': 91,
 'like': 153,
 'seat': 126,
 'second': 149,
 'floor': 88,
 'way': 153,
 'caution': 12,
 'smell': 97,
 'bbq': 54,
 ('good', 'korean'): 3,
 ('k

#### Calculating TF-IDF

###### Term Frequency
Term frequency is calculated by number of occurrences of the term divided by the number of terms in the entire document (in this case, the business)

In [385]:
# Find word/biword count stored in a dictionary { token : count }
token_count = {}
for review in data:
    if review['business_id'] == selected_business_id:
        # break up review text into sentences
        sentences = preprocess(review['text'])
        # find the count of each unigram
        for sentence in sentences:
            for word in sentence:
                try:
                    token_count[word] += 1
                except:
                    token_count[word] = 1
        # find the count of each bigram
        bigrams = generate_bigrams(sentences)
        for bigram in bigrams:
            try:
                token_count[bigram] += 1
            except:
                token_count[bigram] = 1

In [386]:
token_count

{'connor': 1,
 'nicest': 1,
 'guy': 3,
 'work': 19,
 'friendli': 38,
 'experi': 4,
 'explain': 4,
 'variou': 4,
 'yogurt': 127,
 'make': 20,
 'chill': 2,
 'environ': 2,
 ('connor', 'nicest'): 1,
 ('nicest', 'guy'): 1,
 ('guy', 'work'): 1,
 ('work', 'friendli'): 1,
 ('friendli', 'experi'): 1,
 ('experi', 'explain'): 1,
 ('explain', 'variou'): 1,
 ('variou', 'yogurt'): 1,
 ('make', 'chill'): 1,
 ('chill', 'environ'): 1,
 'great': 46,
 'locat': 8,
 'get': 42,
 'froyo': 14,
 'north': 4,
 'glendal': 6,
 '10': 4,
 'discount': 1,
 'midwestern': 1,
 'student': 1,
 'flavor': 66,
 'top': 85,
 'consist': 4,
 'good': 38,
 '-': 7,
 'problem': 2,
 'rotat': 1,
 'frequent': 5,
 'otherwis': 3,
 'clean': 44,
 'insid': 6,
 'employe': 20,
 'sampl': 30,
 'commit': 1,
 'particular': 1,
 'cup': 15,
 'also': 19,
 'reward': 3,
 'program': 4,
 'coupl': 5,
 'visit': 9,
 ('great', 'locat'): 1,
 ('locat', 'get'): 1,
 ('get', 'froyo'): 2,
 ('froyo', 'north'): 1,
 ('north', 'glendal'): 2,
 ('glendal', '10'): 1,
 ('1

In [387]:
# calculate the TF
def computeTF(ngram, count_dict):
    term_frequency = count_dict[ngram]
    # if ngram is a single word
    if type(ngram) == str:
        total_word_count = 0
        for i in count_dict:
            if type(i) == str:
                total_word_count += count_dict[i]
        return term_frequency / total_word_count
    # if ngram is a bigram
    elif type(ngram) == tuple:
        total_biword_count = 0
        for i in count_dict:
            if type(i) == tuple:
                total_biword_count += count_dict[i]
        return term_frequency / total_biword_count

###### Inverse Document Frequency
For the Inverse Document Frequency (IDF), we will be using a smoothed IDF, i.e. a constant value of 1 will be added to the numerator and denominator, so as to prevent zero divisions. IDF(t) = log ( (1+n) / (1+df(t)) ) + 1

In [388]:
# calculate IDF
def computeIDF(ngram, count_dict, N): # N is the total number of businesses
    df = count_dict[ngram]
    idf = math.log((1+N) / (1+df)) + 1
    return idf

###### TF-IDF
Now we are ready to calculate the TF-IDF. In order to calculate the TF-IDF of each adjective phrase, we first need to calculate the TF-IDF values of each individual unigram/bigram in the phrase, and sum up these values.

In [389]:
# compute TFIDF of a single token (word or biword)
def computeTFIDF(ngram, business_word_count_dict, total_word_count_dict, N):
    tf = computeTF(ngram, business_word_count_dict)
    idf = computeIDF(ngram, total_word_count_dict, N)
    return tf * idf

In [390]:
# compute TFIDF of an adjective phrase
def computePhraseTFIDF(adjective_phrase, business_word_count_dict, total_word_count_dict, N):
    sum_of_tfidf = 0
    tokens = []
    # preprocess the adjective phrase
    preprocessed_phrase = preprocess(adjective_phrase)
    
    # break down adjective phrase into bigrams if it is more than one word
    if len(preprocessed_phrase[0]) == 1:
        tokens = preprocessed_phrase[0]
    else:
        tokens = generate_bigrams(preprocessed_phrase)
    
    # Loop through tokens to calculate TFIDF and sum them up
    for token in tokens:
        sum_of_tfidf += computeTFIDF(token, business_word_count_dict, total_word_count_dict, N)
    
    return sum_of_tfidf

In [391]:
print("Total number of adjective phrases extracted: " + str(len(adjective_phrases)))
print("Total number of unique adjective phrases: " + str(len(set(adjective_phrases))))

Total number of adjective phrases extracted: 683
Total number of unique adjective phrases: 453


In [392]:
# take only the unique adjective phrases
adjective_phrases = list(set(adjective_phrases))

In [393]:
# calculate the tfidf of each adjective phrase
adjp_tfidf = {}

for adjp in adjective_phrases:
    adjp_tfidf[adjp] = computePhraseTFIDF(adjp, token_count, index, len(business_set))

In [394]:
adjp_tfidf

{'delicious': 0.003565643356855886,
 'always pleasant': 0.0009869535850585242,
 'Overall ,': 0.0007417885816762288,
 'constantly cleaning': 0.0014848028401927434,
 'always great and most': 0.00153984261771648,
 'Always good': 0.0005049860786989504,
 'so friendly and': 0.009095733145970359,
 'clean , but': 0.011093847762115198,
 'bad': 0.001939924405419653,
 'always clean and': 0.0022545830165921856,
 'very clean and modern': 0.0008901748425555467,
 'Somewhat annoying': 0.0012922084582644453,
 'other fro-yo': 0.014377778153821309,
 'not ashamed': 0.0009131338941920119,
 'back many': 0.0009647133217427319,
 "n't worth": 0.0007943592031302259,
 'caramel fro-yo & nutella': 0.002969605680385487,
 'clean , cool': 0.0011795479669868224,
 'next': 0.001426872770511296,
 'yummy ,': 0.001690644572618751,
 'mixed': 0.00029320607792636687,
 'always very pleased': 0.0012302069158042592,
 'few of the menu': 0.00028178890332794575,
 'friendly , and': 0.009095733145970359,
 'sure to sign': 0.0012922084

In [395]:
# Convert the adjective phrases' TFIDF values into a pandas dataframe
adjp_key = []
tfidf_value = []
for key in adjp_tfidf:
    adjp_key.append(key)
    tfidf_value.append(adjp_tfidf[key])

adjp_tfidf_dict = {
    "Adjective Phrase": adjp_key,
    "TF-IDF": tfidf_value
}
    
adjp_tfidf_df = pd.DataFrame(adjp_tfidf_dict)

In [396]:
adjp_tfidf_df

Unnamed: 0,Adjective Phrase,TF-IDF
0,delicious,0.003566
1,always pleasant,0.000987
2,"Overall ,",0.000742
3,constantly cleaning,0.001485
4,always great and most,0.001540
...,...,...
448,So many great + fresh,0.002054
449,sweet,0.003499
450,same as every other place,0.001067
451,"so bad ,",0.001940


In [397]:
import plotly.express as px

fig = px.box(adjp_tfidf_df, x="TF-IDF", hover_data=['Adjective Phrase'], points="all", title="TF-IDF values for Adjective Phrases of Business ID " + selected_business_id)
fig.show()

In [398]:
adjp_tfidf_df.describe()

Unnamed: 0,TF-IDF
count,453.0
mean,0.002587
std,0.003374
min,0.0
25%,0.000871
50%,0.001412
75%,0.002394
max,0.020346


##### Indicative Adjective Phrases
Let's consider the indicative adjective phrases of the business b1 to be the phrases that have a TF-IDF value that is the greater than or equal to the upper fence of all the TF-IDF value. The upper fence can be calculated as 1.5 * IQR + Q3 (3rd Quartile + 1.5 x Interquartile Range).

In [399]:
# Find the upper fence value
q3 = adjp_tfidf_df.describe()['TF-IDF']["75%"]
q1 = adjp_tfidf_df.describe()['TF-IDF']["25%"]
iqr = q3 - q1
upper_fence = q3 + 1.5 * iqr
print("Upper fence TF-IDF: " + str(upper_fence))

Upper fence TF-IDF: 0.004676834025317919


In [400]:
# Extract out the phrases that have tf-idf value >= upper fence
indicative_df = adjp_tfidf_df[adjp_tfidf_df["TF-IDF"] >= upper_fence]

In [401]:
# Sort in descending order
indicative_df = indicative_df.sort_values("TF-IDF", ascending=False)
indicative_df = indicative_df.reset_index(drop=True)

In [402]:
indicative_df

Unnamed: 0,Adjective Phrase,TF-IDF
0,topping,0.020346
1,oatmeal,0.019039
2,froyo but,0.017792
3,froyo,0.017792
4,other froyo,0.017792
5,only frozen,0.016389
6,frozen,0.016389
7,other fro-yo,0.014378
8,fro-yo,0.014378
9,sample,0.013288
