# import statements 

In [2]:
import spacy
# Presumably the language code of the spaCy model.
# NOTE(review): `lang` is not referenced in any visible cell; the model name below is hard-coded separately.
lang='en'
# Load the spaCy pipeline once; cleanup_text and processOneRestaurant call this shared `nlp` object.
nlp = spacy.load('en')

# restaurant review class

In [3]:
class RestaurantReviewFeatures(object):
    """Stores review data and rating data for one restaurant.

    Attributes:
        id (str): id of the restaurant, this will be used as Primary Key
        index (int): index the next stored review will receive; equals the
            number of reviews stored so far
        rating (dict): maps dataset name -> rating of this restaurant
        reviews (dict): maps review index -> feature dictionary of that review

    """
    def __init__(self, key):
        """Initialize an empty container for the restaurant identified by ``key``.
        """
        self.id = key
        self.index = 0
        self.rating = {}
        self.reviews = {}

    def __getitem__(self, idx):
        """Returns the features of a specific review.

        Args:
            idx (int): index of the review

        Returns:
            dict: The review dictionary at index=idx.

        Raises:
            KeyError: if no review is stored at ``idx``.
        """
        return self.reviews[idx]

    def __len__(self):
        """Returns total reviews for this restaurant.

        Returns:
            int: total reviews for this restaurant
        """
        return self.index

    # The method used to be spelled ``__len_`` (one trailing underscore), so
    # ``len(obj)`` never worked. Keep the old (name-mangled) spelling as an
    # alias so any code that reached it keeps working.
    __len_ = __len__

    def addReview(self, rating, unigrams=None, bigrams1=None, bigrams2=None, trigrams=None, dataset="", nouns=None, raw=None):
        """Adds a review.

        The review is stored only when at least one of ``unigrams``,
        ``bigrams1``, ``trigrams`` or ``raw`` is non-empty; otherwise the call
        is a no-op.

        Args:
            rating (float): rating of the review
            unigrams (list): list of unigrams in review
            bigrams1 (list): list of bigrams 'ADJ + next word'
                (stored under the key 'bigrams')
            bigrams2 (list): list of bigrams 'prev word + ADJ'
            trigrams (list): list of trigrams
            dataset (str): original dataset
            nouns (list): list of noun_chunks in review
            raw (list): list of tokens in review (raw features)
        """
        # Omitted features get a fresh list per call. With ``[]`` as the
        # default value, one shared list object would be stored in every review.
        unigrams = [] if unigrams is None else unigrams
        bigrams1 = [] if bigrams1 is None else bigrams1
        bigrams2 = [] if bigrams2 is None else bigrams2
        trigrams = [] if trigrams is None else trigrams
        nouns = [] if nouns is None else nouns
        raw = [] if raw is None else raw

        if len(unigrams) > 0 or len(bigrams1) > 0 or len(trigrams) > 0 or len(raw) > 0:
            self.reviews[self.index] = {
                'nouns': nouns,
                'raw': raw,
                'unigrams': unigrams,
                'bigrams': bigrams1,
                'bigrams2': bigrams2,
                'trigrams': trigrams,
                'rating': rating,
                'dataset': dataset,
            }
            self.index += 1

    def addRating(self, rating, dataset):
        """Adds rating for the restaurant.

        Ratings that are not strictly positive are ignored. A later rating for
        the same dataset overwrites the earlier one.

        Args:
            rating (float): rating value
            dataset (string): original dataset
        """
        if rating > 0:
            self.rating[dataset] = rating

    def getRating(self, dataset="avg"):
        """Returns rating for the restaurant.

        Args:
            dataset (string): original dataset name.
            possible values for dataset = ["avg","google","yelp"]

        Returns:
            float: rating value; for "avg" the mean of all stored ratings

        Raises:
            ZeroDivisionError: if ``dataset == "avg"`` and no rating is stored.
            KeyError: if no rating is stored for the requested dataset.
        """
        if dataset == "avg":
            return float(sum(self.rating.values()))/len(self.rating)
        else:
            return self.rating[dataset]

## review cleanup

In [4]:
def cleanup_text(doc):
    """Initial cleanup of the review

    Args:
        doc (str): review string

    Note:
        Removes all stopwords and punctuations from the string. Every
        remaining token is replaced by its lower-cased, stripped lemma.
        Tokens whose lemma is the '-PRON-' placeholder, or whose lemma is
        empty after stripping (e.g. whitespace-only tokens), are dropped so
        the result never contains doubled spaces.

    Returns:
        str: cleaned string, lemmas separated by single spaces
    """
    # Parser and NER are not needed for lemmas / stop-word / punctuation flags.
    parsed = nlp(doc, disable=['parser', 'ner'])
    tokens = []
    for tok in parsed:
        if tok.lemma_ == '-PRON-' or tok.is_stop or tok.is_punct:
            continue
        lemma = tok.lemma_.lower().strip()
        # A whitespace-only lemma strips to '' and would otherwise be joined
        # in as an empty "word".
        if lemma:
            tokens.append(lemma)
    return ' '.join(tokens)

## process one restaurant

In [11]:
def _adjective_ngrams(doc):
    """Collect adjective-centred n-grams (as lemma strings) from a parsed review.

    A token counts as an adjective when its fine-grained tag starts with 'J'.

    Returns:
        tuple: (unigrams, bigrams 'ADJ + next word', bigrams 'prev word + ADJ',
        trigrams 'prev word + ADJ + next word'), each a list of str.
    """
    lemmas = [token.lemma_ for token in doc]
    last = len(lemmas) - 1
    unigrams, bigrams_next, bigrams_prev, trigrams = [], [], [], []
    for i, token in enumerate(doc):
        if not token.tag_.startswith('J'):
            continue
        unigrams.append(lemmas[i])
        has_prev = i > 0
        has_next = i < last
        if has_next:
            bigrams_next.append(lemmas[i] + " " + lemmas[i + 1])
        if has_prev:
            bigrams_prev.append(lemmas[i - 1] + " " + lemmas[i])
        if has_prev and has_next:
            trigrams.append(lemmas[i - 1] + " " + lemmas[i] + " " + lemmas[i + 1])
    return unigrams, bigrams_next, bigrams_prev, trigrams


def processOneRestaurant(restaurant, log=True):
    """Process reviews of one restaurant

    Note:
        returns RestaurantReviewFeatures object containing all reviews and rating information

    Args:
        restaurant (dict): dictionary containing restaurant info, keyed by
            dataset name. The id is read from restaurant["google"]["place_id"]
            when a "google" entry exists, otherwise from restaurant["yelp"]["id"].
        log (bool): when True, print a message for every dataset that has no
            "reviews" entry.

    Returns:
        Object: RestaurantReviewFeatures class object

    Raises:
        KeyError: if the restaurant has neither a "google" nor a "yelp" entry,
            or a review has no "text" field.

    """
    if "google" in restaurant:
        restaurant_id = restaurant["google"]["place_id"]
    else:
        restaurant_id = restaurant["yelp"]["id"]

    r = RestaurantReviewFeatures(restaurant_id)
    for dataset, info in restaurant.items():
        if "rating" in info:
            # The rating is filed under the dataset name with "ui" removed;
            # reviews below keep the unmodified dataset name.
            r.addRating(info["rating"], dataset.replace("ui", ""))

        if "reviews" not in info:
            if log:
                print("{} : NO reviews found for {} dataset".format(restaurant_id, dataset))
            continue

        for review in info["reviews"]:
            review_text = cleanup_text(review["text"].strip())
            doc = nlp(review_text, disable=['ner'])
            unigrams, bigrams_next, bigrams_prev, trigrams = _adjective_ngrams(doc)

            # str.split() with no argument returns [] for an empty string;
            # split(' ') returned [''], which made every empty review look
            # like it had one raw feature.
            raw_feature_list = review_text.split()

            # "rating" takes precedence over "stars" when both are present.
            review_stars = None
            if "stars" in review:
                review_stars = review["stars"]
            if "rating" in review:
                review_stars = review["rating"]

            if review_stars is not None:
                # doc.noun_chunks is a one-shot generator; store a real list so
                # the chunks can be read more than once.
                r.addReview(review_stars, unigrams=unigrams, bigrams1=bigrams_next,
                            bigrams2=bigrams_prev, trigrams=trigrams, raw=raw_feature_list,
                            nouns=list(doc.noun_chunks), dataset=dataset)
    return r

# Sample code to process every restaurant in the combined dataset (one JSON record per line)

In [16]:
import json
import time

# NOTE(review): machine-specific absolute paths; point these at your own copy
# of the data before running.
INPUT_PATH = "C:\\Users\\utkar\\Downloads\\large_data.json"
NOUNS_PATH = "C:\\Users\\utkar\\Downloads\\nouns.txt"

t1 = time.time()

# review_dict is intentionally left empty: processed restaurants are not kept
# (presumably to limit memory use - TODO confirm). Only the noun-chunk texts
# and a running count are retained.
review_dict = {}
noun_chunks = set()
restaurant_count = 0
with open(INPUT_PATH, "r", encoding="utf-8") as f:
    ## use parallel processing (Pool.map()) here to expedite processing
    for line in f:
        # Skip blank lines instead of letting json.loads fail on them.
        if not line.strip():
            continue

        restaurant = json.loads(line)
        r = processOneRestaurant(restaurant)
        restaurant_count += 1
        for review in r.reviews.values():
            for n in review["nouns"]:
                noun_chunks.add(n.text)

print("Number of restaurants: {}".format(restaurant_count))

t2 = time.time()
print("{}s to process {} restaurants".format(t2-t1, restaurant_count))

# Write one noun chunk per line, sorted so the file is reproducible.
with open(NOUNS_PATH, "w", encoding="utf-8") as out:
    out.writelines(chunk + "\n" for chunk in sorted(noun_chunks))

ChIJhc9r0waWwoAR70Xf1HZKbo4 : NO reviews found for yelp dataset
ChIJUVPwAv-9woAR_dnF22r-D00 : NO reviews found for yelp dataset
ChIJQapGk_y9woARKIZHrNZEiwE : NO reviews found for yelp dataset
ChIJF4wAsh-WwoAR1Kniw_Fp5tw : NO reviews found for yelp dataset
ChIJzc2KDv29woARBsKstcFPywQ : NO reviews found for yelp dataset
ChIJv52y0QaWwoARf-i27AO-mwk : NO reviews found for yelp dataset
ChIJa8kfNxiWwoARD2sK_jeDmXU : NO reviews found for yelp dataset
ChIJGc8NNvS9woAR9CH2BEPBtRQ : NO reviews found for yelp dataset
ChIJz7DIi_C9woARgtCR9Dh1sRU : NO reviews found for yelp dataset
ChIJ3XGSYe6_woARNaMGJY-URaQ : NO reviews found for yelp dataset
ChIJa53kr8K_woARFIQv7Qx7hvQ : NO reviews found for yelp dataset
ChIJx2CNWOW_woARkaMnjvUpI5g : NO reviews found for yelp dataset


MemoryError: 

In [None]:
# Show the restaurant ids held in review_dict.
# NOTE(review): the processing cell above never inserts into review_dict, so this prints an empty view.
print(review_dict.keys())

In [None]:
# Inspect the per-dataset rating dict of one restaurant, looked up by its id.
# NOTE(review): raises KeyError unless review_dict has been populated with this id.
review_dict['ChIJQapGk_y9woARKIZHrNZEiwE'].rating

In [None]:
# Empty placeholder list; presumably meant to hold ambience-related terms - TODO confirm (never filled in the visible cells).
ambience = []

In [None]:
# Empty placeholder list; presumably meant to hold service-related terms - TODO confirm (never filled in the visible cells).
service = []

In [None]:
# Empty placeholder list; presumably meant to hold food-related terms - TODO confirm (never filled in the visible cells).
food = []