### Yelp API

In [13]:
#Business Search        URL -- 'https://api.yelp.com/v3/businesses/search'
#Business Match         URL -- 'https://api.yelp.com/v3/businesses/matches'
#Phone Search           URL -- 'https://api.yelp.com/v3/businesses/search/phone'
#Business Details       URL -- 'https://api.yelp.com/v3/businesses/{id}'
#Business Reviews       URL -- 'https://api.yelp.com/v3/businesses/{id}/reviews'
# article = 'https://open.lib.umn.edu/exploringbusiness/chapter/5-3-what-industries-are-small-businesses-in/#:~:text=About%2020%20percent%20of%20small,of%20the%20overall%20U.S.%20economy.'

### Initialization

In [27]:
# import modules
import requests
import json
from YelpAPI import get_my_key

# Endpoint of the business search and the key used to authenticate against it
ENDPOINT = 'https://api.yelp.com/v3/businesses/search'
API_KEY = get_my_key()

# The key travels in the Authorization header of every request
HEADERS = {'Authorization': 'bearer %s' % API_KEY}

# Query parameters sent along with the search request
PARAMETERS = {
    'limit': 50,
    'offset': 50,
    'location': 'San Diego',
}

In [28]:
# Make a request to Yelp API
response = requests.get(url = ENDPOINT, params= PARAMETERS, headers= HEADERS)
# Fail with a clear HTTP error here rather than with a KeyError on
# 'businesses' further down when the API answers with an error payload.
response.raise_for_status()

# Convert the JSON response body to a dictionary
business_data = response.json()

# Load the ratings gathered so far (business id -> rating). The context
# manager closes the handle, which a bare open() would leave dangling.
with open('yelpdata.json') as g:
    yelpdata = json.load(g)

# Add (or overwrite) the rating of every business returned on this page
for biz in business_data['businesses']:
    bizid = biz['id']
    rating = biz['rating']
    yelpdata[bizid] = rating

# Write the merged ratings back to disk
with open('yelpdata.json', 'w') as f:
    json.dump(yelpdata, f, indent=2)

### Get more data !!!

In [None]:
API_KEY = get_my_key()
ENDPOINT = 'https://api.yelp.com/v3/businesses/search'
HEADERS = {'Authorization': 'bearer %s' % API_KEY}

# Load the ratings gathered so far once, up front. Only this cell changes the
# file while it runs, so re-reading it for every page (and leaking a file
# handle each time) is unnecessary.
with open('yelpdata.json') as g:
    yelpdata = json.load(g)

for i in range(100):
    PARAMETERS = {'limit': 50,
                  'offset': 50 * i,
                  'location': 'San Diego'}
    response = requests.get(url = ENDPOINT, params= PARAMETERS, headers= HEADERS)
    if not response.ok:
        # NOTE(review): Yelp documents an upper bound on limit + offset for
        # this endpoint, so late pages are expected to be refused - confirm
        # the exact cap. Stop cleanly instead of crashing with a KeyError.
        print('Stopping at offset %d: HTTP %d' % (PARAMETERS['offset'], response.status_code))
        break
    business_data = response.json()
    businesses = business_data.get('businesses', [])
    if not businesses:
        # An empty page means there is nothing further to collect
        break
    for biz in businesses:
        bizid = biz['id']
        rating = biz['rating']
        yelpdata[bizid] = rating
    # Checkpoint after every page so an interruption keeps the progress made
    with open('yelpdata.json', 'w') as f:
        json.dump(yelpdata, f, indent=2)

### Get Reviews !!!

In [51]:
# Fetch the reviews of one business and turn them into training data
business_id = '9M_FW_-Ipx93I36w-_ykBg'
API_KEY = get_my_key()
HEADERS = {'Authorization': 'bearer %s' % API_KEY}
ENDPOINT = 'https://api.yelp.com/v3/businesses/{}/reviews'.format(business_id)

response = requests.get(url = ENDPOINT, headers= HEADERS)
business_data = response.json()

# reviews[k] is the text of the k-th review, attitude[k] its label:
# +1 when the star rating is above 4, -1 otherwise
reviews = []
attitude = []
for review in business_data['reviews']:
    reviews.append(review['text'])
    attitude.append(1 if review['rating'] > 4 else -1)

# Persist texts and labels as two parallel JSON lists
with open('review_text.json', 'w') as f:
    json.dump(reviews, f, indent=2)
with open('sentiment.json', 'w') as f:
    json.dump(attitude, f, indent=2)

### Scale Up. Massive Reviews

In [60]:
API_KEY = get_my_key()
# The header does not depend on the business, so build it once
HEADERS = {'Authorization': 'bearer %s' % API_KEY}

# Business ids collected by the search cells (keys of yelpdata.json)
with open('yelpdata.json') as g:
    yelpdata = json.load(g)
id_list = list(yelpdata.keys())

# Load the existing review texts and labels once. Only this cell modifies the
# two files while it runs, so reloading them for every business (and leaking
# two file handles per iteration) is unnecessary.
with open('sentiment.json') as g:
    attitude = json.load(g)
with open('review_text.json') as g:
    reviews = json.load(g)

for business_id in id_list:
    ENDPOINT = 'https://api.yelp.com/v3/businesses/{}/reviews'.format(business_id)
    response = requests.get(url = ENDPOINT, headers= HEADERS)
    # Fail with a clear HTTP error (e.g. when rate limited) rather than with
    # a KeyError on 'reviews'; everything saved so far stays on disk.
    response.raise_for_status()
    business_data = response.json()
    for review in business_data['reviews']:
        reviews.append(review['text'])
        rating = review['rating']
        # Label: +1 when the star rating is above 4, -1 otherwise
        if rating > 4:
            temp = 1
        else:
            temp = -1
        attitude.append(temp)
    # Checkpoint after every business so an interruption keeps the progress
    with open('review_text.json', 'w') as f:
        json.dump(reviews, f, indent=2)
    with open('sentiment.json', 'w') as f:
        json.dump(attitude, f, indent=2)

### Data has been gathered. New users: please start running from here

### This section is for functions that we will use

In [7]:
from string import punctuation, digits
import numpy as np
import random
import requests
import json
from YelpAPI import get_my_key

def get_order(n_samples, t):
    """
    Produces a reproducible shuffled visiting order over the sample indices.

    Args:
        n_samples - Number of samples; the order covers 0 .. n_samples - 1.
        t - Seed passed to random.seed, so the same t always yields the same
            permutation.

    Returns: A list holding every index in range(n_samples) exactly once, in
    shuffled order.

    NOTE: Reseeds the module-level random generator as a side effect.
    """
    random.seed(t)
    order = [idx for idx in range(n_samples)]
    random.shuffle(order)
    return order


def perceptron_single_step_update(
        feature_vector,
        label,
        current_theta,
        current_theta_0):
    """
    Performs a single perceptron step for one data point and returns the
    resulting classification parameters.

    The parameters are changed only when the point is misclassified or lies
    on the decision boundary, i.e. when
    label * (theta . x + theta_0) is at most 1e-7 (a small tolerance that
    treats values numerically indistinguishable from zero as mistakes).

    Args:
        feature_vector - A numpy array describing a single data point.
        label - The correct classification of the feature vector.
        current_theta - The current theta being used by the perceptron
            algorithm before this update. It is never modified in place.
        current_theta_0 - The current theta_0 being used by the perceptron
            algorithm before this update.

    Returns: A tuple where the first element is a numpy array with the value of
    theta after the current update has completed and the second element is a
    real valued number with the value of theta_0 after the current update has
    completed. When no update is needed the inputs are returned unchanged.
    """
    margin = label * (np.dot(current_theta, feature_vector) + current_theta_0)
    if margin <= 1e-7:
        # Build new values instead of using +=: an in-place add would silently
        # rewrite the caller's array and fails outright when theta has an
        # integer dtype and the features are floats.
        current_theta = current_theta + label * feature_vector
        current_theta_0 = current_theta_0 + label
    return (current_theta, current_theta_0)

def perceptron(feature_matrix, labels, T):
    """
    Trains a linear classifier with the perceptron algorithm, making exactly
    T full passes over the data (there is no early stopping).

    Pass number k (counting from 1) visits the rows in the order given by
    get_order(feature_matrix.shape[0], k), and every visited row is handed to
    perceptron_single_step_update.

    Args:
        feature_matrix - A numpy matrix describing the given data. Each row
            represents a single data point.
        labels - A sequence where the kth element is the correct
            classification of the kth row of the feature matrix.
        T - An integer indicating how many passes over the feature matrix
            should be made.

    Returns: A tuple (theta, theta_0): theta is a numpy array with the linear
    classification parameter and theta_0 is a real number with the offset
    parameter, both as they stand after T passes. Both start at zero.
    """
    n_rows, n_cols = feature_matrix.shape
    weights = np.zeros(n_cols)
    offset = 0.0
    for epoch in range(1, T + 1):
        # The pass number doubles as the shuffle seed
        for row in get_order(n_rows, epoch):
            weights, offset = perceptron_single_step_update(
                feature_matrix[row], labels[row], weights, offset)
    return (weights, offset)

def classify(feature_matrix, theta, theta_0):
    """
    A classification function that uses theta and theta_0 to classify a set of
    data points.

    Args:
        feature_matrix - A numpy matrix describing the given data. Each row
            represents a single data point.
        theta - A numpy array describing the linear classifier.
        theta_0 - A real valued number representing the offset parameter.

    Returns: A numpy float array of 1s and -1s where the kth element of the
    array is the predicted classification of the kth row of the feature matrix
    using the given theta and theta_0. A prediction is positive only when its
    score is STRICTLY GREATER THAN zero; a score of exactly zero is
    classified as -1.
    """
    # Score every row at once instead of looping over the rows in Python.
    # np.asarray keeps the result one-dimensional for np.matrix inputs too.
    scores = np.dot(np.asarray(feature_matrix), theta) + theta_0
    return np.where(scores > 0, 1.0, -1.0)

def bag_of_words(texts):
    """
    Builds the unigram vocabulary of a collection of reviews.

    Args:
        texts - A list of review strings.

    Returns: A dictionary mapping every distinct word produced by
    extract_words over the input to a unique index. Indices are handed out
    in order of first appearance, starting at 0.
    """
    vocabulary = {}
    for review in texts:
        for token in extract_words(review):
            # Known words keep their index; a new word gets the next free one
            vocabulary.setdefault(token, len(vocabulary))
    return vocabulary

def extract_bow_feature_vectors(reviews, dictionary):
    """
    Turns reviews into their bag-of-words count representation.

    Args:
        reviews - A list of review strings.
        dictionary - The word -> column index mapping as produced by
            bag_of_words.

    Returns: A numpy array of shape (n, m), where n is the number of reviews
    and m the number of entries in the dictionary. Entry (i, j) counts how
    often the word with index j occurs in review i; words missing from the
    dictionary are ignored.
    """
    counts = np.zeros([len(reviews), len(dictionary)])
    for row, review in enumerate(reviews):
        known_tokens = (token for token in extract_words(review)
                        if token in dictionary)
        for token in known_tokens:
            counts[row, dictionary[token]] += 1
    return counts

def most_explanatory_word(theta, wordlist):
    """
    Ranks the words by the weight of their bag-of-words feature.

    Args:
        theta - The feature weights; theta[k] belongs to wordlist[k].
        wordlist - The words, in feature order.

    Returns: A list of all words sorted from the largest weight to the
    smallest. Words with equal weight appear in reverse alphabetical order.
    """
    ranked = sorted(zip(theta, wordlist))
    ranked.reverse()
    return [word for _, word in ranked]

### Please download the NLTK natural language processing library for this section

In [3]:
import nltk
# Fetch the NLTK resources this section asks for: 'punkt' and 'stopwords'
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords

def extract_words(text):
    '''
    Helper function for bag_of_words()

    Splits a text string into cleaned-up words: the text is tokenized with
    word_tokenize, each token is lower-cased and stripped of punctuation
    characters, and only purely alphabetic tokens that are not English stop
    words are kept.

    Inputs a text string
    Returns the list of remaining words, in their original order
    '''
    tokens = word_tokenize(text)
    strip_punctuation = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))

    cleaned = []
    for token in tokens:
        # Lower-case first, then delete every punctuation character
        candidate = token.lower().translate(strip_punctuation)
        # Drops empty strings, numbers and mixed tokens as well as stop words
        if candidate.isalpha() and candidate not in stop_words:
            cleaned.append(candidate)
    return cleaned


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/qinjianxyz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/qinjianxyz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Time to input our data into the model

In [None]:
# Load the gathered training data. The context managers close both files,
# which the bare open() calls never did.
with open('sentiment.json') as g:
    train_labels = json.load(g)
with open('review_text.json') as g:
    train_texts = json.load(g)

# Build the vocabulary and the bag-of-words count matrix of the reviews
dictionary = bag_of_words(train_texts)
train_features = extract_bow_feature_vectors(train_texts, dictionary)

# Train the perceptron with 1000 passes over the data
theta, theta_0 = perceptron(train_features, train_labels, 1000)

# Order the words by feature index so that wordlist[k] lines up with theta[k]
wordlist = [word for (idx, word) in sorted(zip(dictionary.values(), dictionary.keys()))]
sorted_word_features = most_explanatory_word(theta, wordlist)
print("Most Explanatory Word Features")
print(sorted_word_features[:100])

**(Feature += 1) Most explanatory words:** ['twice', 'vegan', 'yet', 'thank', 'packed', 'omg', 'limited', 'la', 'keep', 'year', 'worth', 'weird', 'want', 'usually', 'tofu', 'thought', 'street', 'stephen', 'spotty', 'sandwiches', 'quickly', 'pacific', 'owner', 'ocean', 'ob', 'miss', 'longer', 'line', 'instead', 'ingredients', 'handling', 'excited', 'end', 'earlier', 'dumplings', 'curry', 'cakes', 'cafe', 'works', 'win', 'warm', 'walked', 'togo', 'still', 'reviewed', 'restaurants', 'reservation', 'quality', 'prices', 'precautions', 'polite', 'plate', 'pineapple', 'phenomenal', 'perfect', 'outstanding', 'opened', 'number', 'notice', 'nom', 'months', 'monday', 'mocha', 'left', 'kids', 'incredible', 'impeccably', 'hillcrest', 'grubhub', 'garlic', 'finally', 'favorite', 'exceptional', 'every', 'ever', 'enjoyable', 'enjoy', 'enchilada', 'dish', 'customers', 'current', 'creamy', 'corner', 'closer', 'chose', 'business', 'bomb', 'boats', 'bite', 'best', 'arrived', 'app', 'added', 'write', 'wore', 'wonderful', 'whenever', 'went', 'way', 'watch'] <br />
**(Feature = 1) Most explanatory words:** ['yet', 'la', 'vegan', 'usually', 'omg', 'cafe', 'worth', 'twice', 'tofu', 'reviewed', 'real', 'dumplings', 'year', 'write', 'wore', 'want', 'walked', 'tell', 'surf', 'street', 'stephen', 'return', 'restaurants', 'reservation', 'pieces', 'owner', 'outstanding', 'opening', 'opened', 'obsession', 'number', 'monday', 'miss', 'longer', 'line', 'limited', 'left', 'kids', 'incredible', 'immediately', 'hillcrest', 'excited', 'earlier', 'corn', 'completely', 'ca', 'business', 'bite', 'bartender', 'added', 'wonderful', 'went', 'welcoming', 'weird', 'watch', 'valentine', 'upstairs', 'totally', 'tonya', 'thank', 'stunning', 'steak', 'stand', 'spotty', 'sports', 'size', 'sister', 'sandwiches', 'rider', 'reserve', 'quickly', 'quarantine', 'quality', 'prices', 'prefer', 'precautions', 'post', 'polite', 'plate', 'pineapple', 'pie', 'perfect', 'pasta', 'partner', 'packed', 'opportunity', 'ocean', 'ob', 'notice', 'noon', 'nom', 'morning', 'menu', 'lovely', 'lounges', 'lots', 'later', 'knowledgeable', 'knew', 'keeping'