In [1]:
import pandas as pd
import json
import numpy as np
import random
import nltk
import math
import spacy
import re
# spaCy pipeline used further down to tokenise reviews and read each token's tag.
nlp = spacy.load("en_core_web_sm")

# NLTK resources requested by this notebook.
for _resource in ("punkt", "averaged_perceptron_tagger"):
  nltk.download(_resource)

# Lift the row limit so printed Series/DataFrames are shown in full.
pd.set_option("display.max_rows", None)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [2]:
# Mount Google Drive at /content/drive (Colab only); the review dataset is read
# from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# load data
# Each line of the file is parsed as its own JSON document and becomes one row.
# The `with` block guarantees the handle is closed even if a line fails to parse.
data = []
with open("/content/drive/MyDrive/yelp_academic_dataset_review.json", encoding="utf-8") as data_file:
  for line in data_file:
    if line.strip():  # tolerate blank lines, which json.loads would reject
      data.append(json.loads(line))
df = pd.DataFrame(data)

In [5]:
# extract business_id (one entry per distinct business)
series_id_total = df['business_id']
series_id_once = series_id_total.drop_duplicates()

# select a business b1 randomly
# r is a position within the de-duplicated ids, so it must index that series.
# Using it as a row label of df would only ever look at the first
# len(series_id_once) reviews and would favour businesses with many reviews there.
r = random.randint(0, len(series_id_once)-1)
b1 = series_id_once.iloc[r]
print("business id is " + b1)

# keep only the reviews written about b1
df_b1_related = df[df['business_id'] == b1]
reviews_selected = df_b1_related['text'].tolist()

business id is QxUVzln7g5ehHPtVy2hOng


In [6]:
def get_AP_from_tree(tree, tokens):
  """Collect phrase strings for every 'AP' chunk directly under `tree`.

  Args:
    tree: iterable of children; a child is either an `nltk.tree.Tree` chunk or
      a single (non-Tree) leaf.
    tokens: token strings, positionally aligned with the leaves of `tree`.

  Returns:
    A list of strings. A one-leaf AP chunk contributes its single token; a
    longer AP chunk contributes one "a b" string for every pair of adjacent
    tokens inside it.
  """
  # NOTE(review): multi-token chunks are emitted as overlapping bigrams, not as
  # one joined phrase -- confirm this is intended.
  phrases = []
  pos = 0  # index in `tokens` of the first leaf of the child being visited
  for node in tree:
    if not isinstance(node, nltk.tree.Tree):
      # plain leaf: occupies exactly one token
      pos += 1
      continue
    width = len(node.leaves())
    if node.label() == 'AP':
      if width > 1:
        phrases.extend(
          tokens[k] + " " + tokens[k + 1] for k in range(pos, pos + width - 1)
        )
      else:
        phrases.append(tokens[pos])
    pos += width
  return phrases

def extract_adj_phrase(review):
  """Extract adjective phrases from one review text.

  The text is run through the spaCy pipeline `nlp`; each token is lower-cased
  and paired with its `token.tag_`. The tagged sequence is chunked with an NLTK
  regular-expression grammar whose 'AP' rules match, in order:
    1. adverb + adjective + IN/TO + optional determiner + noun
    2. adjective + IN/TO + optional determiner + noun
    3. optional adverb + adjective
  The chunks are then turned into strings by `get_AP_from_tree`.

  Args:
    review: the review text.

  Returns:
    A list of phrase strings (empty when the review has no non-whitespace token).
  """
  doc = nlp(review)
  tagged = []
  tokens = []
  for token in doc:
    # Drop every whitespace-only token, not just a lone '\n': runs such as
    # "\n\n" would otherwise stay in the sequence and could split a chunk.
    if token.is_space:
      continue
    _token = token.text.lower()
    tagged.append((_token, token.tag_))
    tokens.append(_token)

  if not tagged:
    # nothing to chunk
    return []

  grammar = r"""
    AP: {<RB><JJ|JJR|JJS><IN|TO><DT>?<NN|NNS|NNP|NNPS>}
      {<JJ|JJR|JJS><IN|TO><DT>?<NN|NNS|NNP|NNPS>}
      {<RB>?<JJ|JJR|JJS>}
  """
  cp = nltk.RegexpParser(grammar)
  tree = cp.parse(tagged)
  return get_AP_from_tree(tree, tokens)

In [7]:
# extract adjective phrases from the reviews of business b1
adj_phrases = []
adj_selected = []
for review in reviews_selected:
  adj_phrases.extend(extract_adj_phrase(review))

# Occurrence count per distinct phrase; keep at most the first 50 entries.
# Series.value_counts() replaces the deprecated top-level pd.value_counts();
# the explicit object dtype keeps an empty phrase list from triggering dtype inference.
_count = pd.Series(adj_phrases, dtype="object").value_counts()
adj_selected_count = _count.head(50)
print(adj_selected_count)


good          75
other         69
great         44
more          36
personal      36
nice          31
free          31
few           28
new           27
front         25
old           24
friendly      22
hot           21
better        21
last          18
high          18
only          17
clean         17
many          15
same          15
several       15
indoor        14
first         14
bad           13
most          12
little        12
dirty         11
favorite      11
fine          11
different     10
disgusting    10
real          10
cheap         10
worst         10
hard           9
open           9
of the         9
able           9
sure           9
not sure       9
helpful        9
own            8
less           8
long           8
best           8
squat          8
amazing        8
close          7
negative       7
enough         7
dtype: int64


In [8]:
# Phrases under study, in the order of adj_selected_count's index.
adj_selected = adj_selected_count.index.tolist()
_n_selected = len(adj_selected)

# Per-phrase accumulators, both indexed by phrase: an integer occurrence
# counter and a float score, each starting at zero.
adj_sampled_count = pd.Series([0] * _n_selected, index=adj_selected)
adj_score = pd.Series([0.0] * _n_selected, index=adj_selected)


In [9]:
# count the frequency of adjective phrase in sampled reviews
reviews_total_series = df['text']

# Never request more reviews than exist (Series.sample would raise); the capped
# value is also the denominator used when the scores are computed.
sample_num = min(20000, len(reviews_total_series))
reviews_sampled = reviews_total_series.sample(sample_num).tolist()

# Tally into a fresh dict keyed by the selected phrases. Starting from zero
# here makes the cell safe to re-run (no accumulation on top of a previous
# run), and dict membership is constant-time, unlike scanning the list.
_sampled_tally = dict.fromkeys(adj_selected, 0)
adj_sampled = []
for review in reviews_sampled:
  for phrase in extract_adj_phrase(review):
    if phrase in _sampled_tally:
      _sampled_tally[phrase] += 1

# Same index order as adj_selected (dict insertion order).
adj_sampled_count = pd.Series(_sampled_tally, dtype="int64")

print(adj_sampled_count)

good          6140
other         3613
great         7759
more          2112
personal       224
nice          2430
free          1051
few           1952
new           1830
front          397
old            897
friendly      1687
hot           1099
better        1157
last          1373
high           662
only          1310
clean          695
many          1013
same          1146
several        731
indoor          89
first         2202
bad            910
most           756
little        2683
dirty          234
favorite      1129
fine           467
different     1127
disgusting      64
real           505
cheap          408
worst          458
hard           534
open           557
of the         409
able           801
sure          1179
not sure       405
helpful        486
own            686
less           454
long           808
best          2897
squat            2
amazing       2188
close          169
negative       162
enough         462
dtype: int64


In [10]:
# calculate the scores of adjective phrases
# score = p_b1 * ln(p_b1 / p_C), where p_b1 is the phrase count divided by the
# number of b1 reviews and p_C is its count divided by the sample size.
_n_b1_reviews = float(len(reviews_selected))
_n_sampled = float(sample_num)
for index in adj_selected:
  p_b1 = adj_selected_count[index] / _n_b1_reviews
  p_C = adj_sampled_count[index] / _n_sampled
  # A phrase absent from the sample has no finite ratio; it gets the sentinel -100.
  # NOTE(review): this ranks such phrases last -- confirm that is the intent.
  adj_score[index] = -100 if p_C == 0 else p_b1 * math.log(p_b1/p_C)

print("Indicative adjective phrases and their scores are as follows:")
# ten highest-scoring phrases
display(adj_score.sort_values(ascending = False).head(10))


Indicative adjective phrases and their scores are as follows:


personal      0.539736
other         0.256853
squat         0.256001
front         0.250884
indoor        0.208308
free          0.186694
disgusting    0.148436
old           0.132157
more          0.112356
good          0.101845
dtype: float64