In [73]:
import numpy
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
import json
import random
import string

Importing the dataset

In [74]:
with open('reviewSelected100.json') as f:
    data = json.loads("[" + 
        f.read().replace("}\n{", "},\n{") + 
    "]")

In [75]:
# Create a set of all businesses
business_set = set()
for review in data:
    business_set.add(review['business_id'])
print("Number of businesses: " + str(len(business_set)))

Number of businesses: 153


## Finding the Adjective Phrases of a Randomly Selected Business

Choose a random business

In [76]:
business_set = list(business_set)
selected_business_id = business_set[random.randrange(len(business_set))]

In [77]:
selected_business_id

'p6FPcgLymnpk_gAyQuW_Mw'

Extract out the adjective phrases from all the reviews belonging to this business

In [78]:
# Get all the reviews of the business
selected_business_reviews = []
for review in data:
    if review['business_id'] == selected_business_id:
        selected_business_reviews.append(review['text'])

Define the grammar rules for the structure of Adjective Phrases (ADJP), as well as other phrase types. 

In [79]:
# Final
grammar = ('''
    VP: { <TO> <VB> <PRP> <VB.?> | <TO> <VB> <VB.?>? <DT>? <JJ.?>? <NN.?>? }
    NC: { <IN>? ((<PRP.>|<DT>) <NN.?> <JJ.?>? <MD> <VB.?>) | <IN>? ((<DT>|<PRP.>) <JJ.?>? <NN.?> <VB.?> (<NN.?>|<VB.>)) | <IN>? <PRP> <VB.?> (<DT>|<PRP.?>) <NN.?> }
    PP: { (<IN> <DT>? <JJ.?>? <NN.?>) }
    ADJP: { (<IN> <DT>? <NN.?>) | (<PP> <IN> <NN.?>) | <RB.?>* <JJ.?> <PP> | <RB.?>* <JJ.?> <VP> | <RB.?>* <JJ.?> <NC> | (<RB.?>* <JJ.?> <,>? <CC>?)+ | (<JJ.?> <,>? <CC>?)+ | <JJ.?>* | <RB.?>+ <VB(G|N)> }
    ''')

# Preposition + noun | PP + noun | (adv) adj + PP | (adv) adj + VP | (adv) adj + NC | Adverb Adjective(s) | Adjective chain(s) | Adjective(s) | Some VBG/N

In [80]:
chunkParser = nltk.RegexpParser(grammar)

Parse each review, analysing its phrase structure and extracting out only the adjective phrases

In [81]:
adjective_phrases = []

for review in selected_business_reviews:
    # tokenize and perform pos tagging on each review
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(review))
    # parse the review
    tree = chunkParser.parse(tagged_tokens)
    # go through the parse tree and append adjective phrases (ADJP)
    for subtree in tree.subtrees():
        if subtree.label() == "ADJP":
            adjective_phrase = []
            for leaf in subtree.leaves():  # each leaf is a tuple (token, pos_tag)
                adjective_phrase.append(leaf[0]) # append the token
            adjective_phrase = " ".join(adjective_phrase) # merge the tokens of the phrase
            adjective_phrases.append(adjective_phrase)

In [82]:
adjective_phrases

['awesome and cheap',
 'literal',
 'Typical taco',
 'awesome , authentic , and',
 'go',
 'beat',
 'much more',
 'huge',
 'def larger',
 'Happy',
 'Excellent',
 'mexican',
 'really good',
 'hot',
 'u',
 'im',
 'best',
 'open late and',
 'desperate',
 'final',
 'just flinging',
 'tiring',
 'plastic',
 'no longer eating',
 'Best',
 'able to put',
 'really good',
 'fast',
 'i',
 'tacos',
 'perfect',
 'add',
 'single',
 'always left',
 'happy and',
 'top of the line',
 'single',
 'more',
 'hot and fresh',
 'wide',
 'best',
 'Best bodega',
 'great',
 'fresh',
 'great and',
 'not only insanely delicious , but',
 'most of the time',
 'much',
 'hefty',
 'full ,',
 'super',
 'great',
 'little',
 'native',
 'best mexican',
 'top',
 'Very tasty , very clean',
 'Generous',
 'low',
 'great',
 'clean and',
 'quick',
 'busy',
 'new regular',
 'best',
 'other',
 'huge',
 'fast and friendly',
 'huge',
 'good',
 'finally found',
 'late',
 'good',
 'fish &',
 'favorite',
 'really just going',
 'crazy , an

### Now, we need to determine which adjective phrases are indicative
We perform TD-IDF in order to determine which phrases are unique to the business
- Find the TF of each adjective phrase
- Create a biword/inverted index (TBC) to store the Document Frequency of each biword/term
- Compute the TF-IDF of each adjective phrase

First, preprocess the entire dataset before performing TF-IDF

In [46]:
# Lowercase all the text
def convert_to_lowercase(text):
    return text.lower()

In [47]:
# Remove numerical values
def remove_numbers(text):
    return ''.join([i for i in text if not i.isdigit()])

In [48]:
# Sentence tokenizer
def sentence_tokenize(text):
    return sent_tokenize(text)

In [49]:
# Remove punctuations
def remove_punctuation(sentences):
    result = []
    for sentence in sentences:
        result.append(sentence.translate(str.maketrans('', '', string.punctuation)))
    return result

In [50]:
# Tokenize the sentences -> words
def tokenize(sentences):
    result = []
    for sentence in sentences:
        result.append(word_tokenize(sentence))
    return result

In [51]:
# Remove stopwords
def remove_stopwords(sentences):
    result = []
    stopwords = nltk.corpus.stopwords.words('english')
    for sentence in sentences:
        filtered = []
        for word in sentence:
            if word not in stopwords:
                filtered.append(word)
        result.append(filtered)
    return result

In [35]:
# Stemming
def stemming(sentences): #text is an array of strings
    result = []
    ps = PorterStemmer()
    for sentence in sentences:
        result.append([ps.stem(word) for word in sentence])
    return result

In [59]:
# preprocessing
def preprocess(text):
    text = convert_to_lowercase(text)
    text = remove_numbers(text)
    text = sentence_tokenize(text)
    text = remove_punctuation(text)
    text = tokenize(text)
    text = remove_stopwords(text)
    text = stemming(text)
    return text
# Outputs a list of lists containing tokens

Create biword/uniword index: Dictionary with bigram/word as the key and the value being the set of businesses that contain that bigram/word

In [70]:
# Generate bigrams given an array of arrays of strings
def generate_bigrams(sentences):
    bigrams = []
    for sentence in sentences:
        for i in range(len(sentence) - 1):
            bigrams.append((sentence[i], sentence[i+1]))
    return bigrams

In [92]:
# Create the index
index = {}
for review in data:
    b_id = review['business_id']
    preprocessed_text = preprocess(review['text']) # Array of arrays of tokens
    bigrams = generate_bigrams(preprocessed_text)
    
    # index the unigrams
    for sentence in preprocessed_text:
        for word in sentence:
            try:
                index[word].add(b_id)
            except:
                index[word] = {b_id}
    # index the bigrams
    for bigram in bigrams:
        try:
            index[bigram].add(b_id)
        except:
            index[bigram] = {b_id}

In [104]:
# Since we don't need to list of business ids in the index, we convert it to the count of business ids
for key in index:
    index[key] = len(index[key])

#### Calculating TF-IDF

###### Term Frequency
Term frequency is calculated by number of occurrences of the term divided by the number of terms in the entire document (in this case, the business)

In [124]:
# Find word count
word_count = 0
for review in data:
    if review['business_id'] == selected_business_id:
        for sentence in preprocess(review['text']):
            word_count += len(sentence)

In [125]:
word_count

3747

In [133]:
# Find bigram count
bigram_count = 0
for review in data:
    if review['business_id'] == selected_business_id:
        sentences = preprocess(review['text'])
        bigrams = generate_bigrams(sentences)
        bigram_count += len(bigrams)

In [134]:
bigram_count

3173

In [135]:
# calculate the TF
def computeTF(wordDict, bagOfWords, hasAdjPhrases):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        if hasAdjPhrases == True:
            tfDict[word] = count / float(bagOfWordsCount)
        else:
            tfDict[word] = count / 1
    return tfDict

def find_ngram_count(ngram, business_id, data):
    if len(ngram) == 1:
        
    else:
        

def computeTF(ngram, total_count):
    return ngram / total_count

In [116]:
adjective_phrases

['awesome and cheap',
 'literal',
 'Typical taco',
 'awesome , authentic , and',
 'go',
 'beat',
 'much more',
 'huge',
 'def larger',
 'Happy',
 'Excellent',
 'mexican',
 'really good',
 'hot',
 'u',
 'im',
 'best',
 'open late and',
 'desperate',
 'final',
 'just flinging',
 'tiring',
 'plastic',
 'no longer eating',
 'Best',
 'able to put',
 'really good',
 'fast',
 'i',
 'tacos',
 'perfect',
 'add',
 'single',
 'always left',
 'happy and',
 'top of the line',
 'single',
 'more',
 'hot and fresh',
 'wide',
 'best',
 'Best bodega',
 'great',
 'fresh',
 'great and',
 'not only insanely delicious , but',
 'most of the time',
 'much',
 'hefty',
 'full ,',
 'super',
 'great',
 'little',
 'native',
 'best mexican',
 'top',
 'Very tasty , very clean',
 'Generous',
 'low',
 'great',
 'clean and',
 'quick',
 'busy',
 'new regular',
 'best',
 'other',
 'huge',
 'fast and friendly',
 'huge',
 'good',
 'finally found',
 'late',
 'good',
 'fish &',
 'favorite',
 'really just going',
 'crazy , an

In [108]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [109]:
corpus = [
'This is the first document. And what?',
'This document is the second document.',
'And this is the third one.',
'Is this the first document?']

In [110]:
vectorizer = TfidfVectorizer(ngram_range=(1,2))

In [111]:
X = vectorizer.fit_transform(corpus)
tokens = vectorizer.get_feature_names()

In [112]:
X.toarray()

array([[0.28339342, 0.        , 0.35944862, 0.2294314 , 0.35944862,
        0.        , 0.28339342, 0.28339342, 0.1875752 , 0.2294314 ,
        0.        , 0.        , 0.        , 0.        , 0.1875752 ,
        0.28339342, 0.        , 0.        , 0.        , 0.        ,
        0.1875752 , 0.        , 0.28339342, 0.        , 0.35944862],
       [0.        , 0.        , 0.        , 0.45551258, 0.        ,
        0.35682424, 0.        , 0.        , 0.18620569, 0.22775629,
        0.        , 0.        , 0.35682424, 0.35682424, 0.18620569,
        0.        , 0.35682424, 0.        , 0.        , 0.        ,
        0.18620569, 0.35682424, 0.        , 0.        , 0.        ],
       [0.28851197, 0.36594085, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.19096312, 0.2335753 ,
        0.        , 0.36594085, 0.        , 0.        , 0.19096312,
        0.        , 0.        , 0.36594085, 0.36594085, 0.36594085,
        0.19096312, 0.        , 0.28851197, 0.

In [113]:
df = pd.DataFrame(data=X.toarray(), index=['Doc1','Doc2','Doc3','Doc4'], columns=tokens)

In [114]:
df

Unnamed: 0,and,and this,and what,document,document and,document is,first,first document,is,is the,...,the first,the second,the third,third,third one,this,this document,this is,this the,what
Doc1,0.283393,0.0,0.359449,0.229431,0.359449,0.0,0.283393,0.283393,0.187575,0.229431,...,0.283393,0.0,0.0,0.0,0.0,0.187575,0.0,0.283393,0.0,0.359449
Doc2,0.0,0.0,0.0,0.455513,0.0,0.356824,0.0,0.0,0.186206,0.227756,...,0.0,0.356824,0.0,0.0,0.0,0.186206,0.356824,0.0,0.0,0.0
Doc3,0.288512,0.365941,0.0,0.0,0.0,0.0,0.0,0.0,0.190963,0.233575,...,0.0,0.0,0.365941,0.365941,0.365941,0.190963,0.0,0.288512,0.0,0.0
Doc4,0.0,0.0,0.0,0.28294,0.0,0.0,0.349487,0.349487,0.231322,0.0,...,0.349487,0.0,0.0,0.0,0.0,0.231322,0.0,0.0,0.443279,0.0


In [115]:
tokens

['and',
 'and this',
 'and what',
 'document',
 'document and',
 'document is',
 'first',
 'first document',
 'is',
 'is the',
 'is this',
 'one',
 'second',
 'second document',
 'the',
 'the first',
 'the second',
 'the third',
 'third',
 'third one',
 'this',
 'this document',
 'this is',
 'this the',
 'what']

In [35]:
X

<4x9 sparse matrix of type '<class 'numpy.float64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [36]:
vectorizer.idf_

array([1.91629073, 1.22314355, 1.51082562, 1.        , 1.91629073,
       1.91629073, 1.        , 1.91629073, 1.        ])

In [37]:
vectorizer.stop_words_

set()

In [40]:
X.todense()

matrix([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
         0.        , 0.38408524, 0.        , 0.38408524],
        [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
         0.53864762, 0.28108867, 0.        , 0.28108867],
        [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
         0.        , 0.26710379, 0.51184851, 0.26710379],
        [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
         0.        , 0.38408524, 0.        , 0.38408524]])

In [42]:
from sklearn.preprocessing import normalize

In [44]:
normalize(X).toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

In [45]:
vectorizer.idf_ * 2

array([3.83258146, 2.4462871 , 3.02165125, 2.        , 3.83258146,
       3.83258146, 2.        , 3.83258146, 2.        ])

In [49]:
normalize((vectorizer.idf_ * 2).reshape(1,-1))

array([[0.413592  , 0.26399042, 0.32608068, 0.21582946, 0.413592  ,
        0.413592  , 0.21582946, 0.413592  , 0.21582946]])