### Yelp API

In [None]:
#Business Search        URL -- 'https://api.yelp.com/v3/businesses/search'
#Business Match         URL -- 'https://api.yelp.com/v3/businesses/matches'
#Phone Search           URL -- 'https://api.yelp.com/v3/businesses/search/phone'
#Business Details       URL -- 'https://api.yelp.com/v3/businesses/{id}'
#Business Reviews       URL -- 'https://api.yelp.com/v3/businesses/{id}/reviews'
# article = 'https://open.lib.umn.edu/exploringbusiness/chapter/5-3-what-industries-are-small-businesses-in/#:~:text=About%2020%20percent%20of%20small,of%20the%20overall%20U.S.%20economy.'

### Initialization

In [None]:
# import modules
import requests
import json
from YelpAPI import get_my_key

In [None]:
# Credentials, target endpoint and auth header for the Business Search call
API_KEY = get_my_key()
ENDPOINT = 'https://api.yelp.com/v3/businesses/search'
HEADERS = {'Authorization': 'bearer %s' % API_KEY}
# Query: first page (up to 50 results) of "construction" businesses in California
PARAMETERS = {
    'limit': 50,
    'offset': 0,
    'term': 'construction',
    'location': 'California',
}
# Send the request and show the decoded JSON payload
response = requests.get(ENDPOINT, params=PARAMETERS, headers=HEADERS)
business_data = response.json()
print(business_data)

In [27]:
# Load the id -> rating mapping gathered so far; the context manager closes
# the file as soon as it has been parsed (the handle used to be left open).
with open('construction.json') as g:
    yelpdata = json.load(g)
# Decode the search response and record every returned business's rating,
# keyed by its Yelp business id (existing ids are overwritten).
business_data = response.json()
for biz in business_data['businesses']:
    yelpdata[biz['id']] = biz['rating']
# Persist the merged mapping back to the same file
with open('construction.json', 'w') as f:
    json.dump(yelpdata, f, indent=2)

### Get more data !!!

In [28]:
API_KEY = get_my_key()
ENDPOINT = 'https://api.yelp.com/v3/businesses/search'
HEADERS = {'Authorization': 'bearer %s' % API_KEY}

# Load the existing id -> rating mapping ONCE, from the same file we write to.
# Previously every iteration reloaded 'restaurant.json', so each write threw
# away the pages fetched earlier and mixed restaurant ids into the
# construction data set.
with open('construction.json') as g:
    yelpdata = json.load(g)

for i in range(20):
    # Page through the results 50 at a time (offsets 0, 50, ..., 950)
    PARAMETERS = {'limit': 50,
              'offset': 50 * i,
              'term': 'construction',
              'location': 'California'}
    response = requests.get(url = ENDPOINT, params= PARAMETERS, headers= HEADERS)
    business_data = response.json()
    if 'error' in business_data:
        # An error payload has no 'businesses' key; report it and stop paging
        print(business_data['error'])
        break
    for biz in business_data['businesses']:
        bizid = biz['id']
        rating = biz['rating']
        yelpdata[bizid] = rating
    # Write after every page so progress survives a failure part-way through
    with open('construction.json', 'w') as f:
        json.dump(yelpdata, f, indent=2)

### Get Reviews !!!

In [29]:
# Fetch the reviews of one business and seed the review / sentiment files
API_KEY = get_my_key()
business_id = '9M_FW_-Ipx93I36w-_ykBg'
ENDPOINT = 'https://api.yelp.com/v3/businesses/{}/reviews'.format(business_id)
HEADERS = {'Authorization': 'bearer %s' % API_KEY}
response = requests.get(ENDPOINT, headers=HEADERS)
business_data = response.json()
attitude = []
reviews = []
for review in business_data['reviews']:
    reviews.append(review['text'])
    rating = review['rating']
    # Label is +1 only when the rating is above 4, otherwise -1
    temp = 1 if rating > 4 else -1
    attitude.append(temp)
# Overwrite both output files with the freshly collected lists
for path, payload in (('const_review.json', reviews),
                      ('const_sentiment.json', attitude)):
    with open(path, 'w') as f:
        json.dump(payload, f, indent=2)

### Scale Up. Massive Reviews

In [32]:
API_KEY = get_my_key()
# The auth header is identical for every request, so build it once
HEADERS = {'Authorization': 'bearer %s' % API_KEY}
# Business ids to query come from the keys of the saved id -> rating mapping
with open('construction.json') as g:
    yelpdata = json.load(g)
id_list = list(yelpdata.keys())
# Load the reviews / labels gathered so far a single time and extend them in
# memory. The files used to be reopened (and never closed) on every
# iteration, leaking three file handles per business.
with open('const_sentiment.json') as g:
    attitude = json.load(g)
with open('const_review.json') as g:
    reviews = json.load(g)
for element in id_list:
    business_id = element
    ENDPOINT = 'https://api.yelp.com/v3/businesses/{}/reviews'.format(business_id)
    response = requests.get(url = ENDPOINT, headers= HEADERS)
    business_data = response.json()
    if 'error' in business_data:
        # Skip businesses whose reviews could not be retrieved
        continue
    for review in business_data['reviews']:
        reviews.append(review['text'])
        rating = review['rating']
        # Label is +1 only when the rating is above 4, otherwise -1
        if rating > 4:
            temp = 1
        else:
            temp = -1
        attitude.append(temp)
    # Save after every business so progress survives an interruption
    with open('const_review.json', 'w') as f:
        json.dump(reviews, f, indent=2)
    with open('const_sentiment.json', 'w') as f:
        json.dump(attitude, f, indent=2)

### Data has been gathered. New users: please run from here.
### This section defines the helper functions that we will use initially.
### We will eventually switch to the scikit-learn library.

In [1]:
from string import punctuation, digits
import numpy as np
import random
import requests
import json
from YelpAPI import get_my_key

def get_order(n_samples, t):
    """
    Return a reproducible shuffled ordering of the indices 0..n_samples-1.

    The global `random` generator is re-seeded with `t` before shuffling, so
    the same (n_samples, t) pair always produces the same ordering.
    """
    random.seed(t)
    order = [*range(n_samples)]
    random.shuffle(order)
    return order


def perceptron_single_step_update(
        feature_vector,
        label,
        current_theta,
        current_theta_0):
    """
    Perform one step of the perceptron algorithm on a single data point.

    The parameters are updated only when the point is misclassified or lies
    on the decision boundary. The caller's `current_theta` array is never
    modified in place; an updated copy is returned instead.

    Args:
        feature_vector - A numpy array describing a single data point.
        label - The correct classification of the feature vector.
        current_theta - The current theta being used by the perceptron
            algorithm before this update.
        current_theta_0 - The current theta_0 being used by the perceptron
            algorithm before this update.

    Returns: A tuple where the first element is a numpy array with the value of
    theta after the current update has completed and the second element is a
    real valued number with the value of theta_0 after the current update has
    completed.
    """
    agreement = label * (np.dot(current_theta, feature_vector) + current_theta_0)
    # The small positive tolerance treats values that are zero up to
    # floating-point round-off as "on the boundary", which triggers an update.
    if agreement <= 1e-7:
        # Build new values rather than using `+=`, which would overwrite the
        # array owned by the caller and fail for an integer-typed theta.
        return (current_theta + label * feature_vector, current_theta_0 + label)
    return (current_theta, current_theta_0)

def perceptron(feature_matrix, labels, T):
    """
    Train a linear classifier with the perceptron algorithm.

    Makes exactly T passes over the data set (no early stopping). On pass
    number t (counting from 1) the points are visited in the order given by
    get_order(n_samples, t), and every visit goes through
    perceptron_single_step_update.

    Args:
        feature_matrix -  A numpy matrix describing the given data. Each row
            represents a single data point.
        labels - A sequence where the kth element is the correct
            classification of the kth row of the feature matrix.
        T - An integer indicating how many times the perceptron algorithm
            should iterate through the feature matrix.

    Returns: A tuple (theta, theta_0): theta is a numpy array holding the
    linear classification parameter and theta_0 is the offset parameter,
    both as they stand after T passes through the feature matrix.
    """
    n_points, n_dims = feature_matrix.shape
    # Both parameters start from zero
    theta, theta_0 = np.zeros(n_dims), 0.0
    for epoch in range(1, T + 1):
        for idx in get_order(n_points, epoch):
            theta, theta_0 = perceptron_single_step_update(
                feature_matrix[idx], labels[idx], theta, theta_0)
    return (theta, theta_0)

def classify(feature_matrix, theta, theta_0):
    """
    Label every row of a feature matrix with a linear classifier.

    Args:
        feature_matrix - A numpy matrix describing the given data. Each row
            represents a single data point.
        theta - A numpy array describing the linear classifier.
        theta_0 - A real valued number representing the offset parameter.

    Returns: A numpy array of 1s and -1s where the kth element is the
    predicted classification of the kth row of the feature matrix. A row is
    labelled positive only when theta . row + theta_0 is strictly GREATER
    THAN zero; a value of exactly zero is labelled -1.
    """
    # Start with every point labelled negative, then flip the positive ones
    predictions = np.full(feature_matrix.shape[0], -1.0)
    for row_idx, point in enumerate(feature_matrix):
        if np.dot(point, theta) + theta_0 > 0:
            predictions[row_idx] = 1
    return predictions

def bag_of_words(texts):
    """
    Build the unigram vocabulary of a list of review strings.

    Each distinct word produced by extract_words is mapped to a unique
    integer index, assigned in order of first appearance (0, 1, 2, ...).

    Returns the word -> index dictionary.
    """
    vocabulary = {}
    for review in texts:
        for token in extract_words(review):
            # setdefault leaves already-known words untouched; a new word
            # receives the next free index, i.e. the current vocabulary size
            vocabulary.setdefault(token, len(vocabulary))
    return vocabulary

def extract_bow_feature_vectors(reviews, dictionary):
    """
    Turn a list of review strings into a bag-of-words count matrix.

    `dictionary` is the word -> column index mapping as given by
    bag_of_words. The returned matrix has shape (n, m), where n is the number
    of reviews and m the number of entries in the dictionary; entry (i, j)
    counts how often the word with index j occurs in review i. Words that are
    not in the dictionary are ignored.
    """
    counts = np.zeros([len(reviews), len(dictionary)])
    for row, review in enumerate(reviews):
        for token in extract_words(review):
            if token not in dictionary:
                continue
            counts[row, dictionary[token]] += 1
    return counts

def most_explanatory_word(theta, wordlist):
    """Returns all words ordered from largest to smallest bag-of-words weight.

    theta[k] is taken as the weight of wordlist[k]; pairs with equal weight
    are ordered by word, descending.
    """
    ranked = sorted(zip(theta, wordlist))
    ranked.reverse()
    return [word for _, word in ranked]

### Please download the NLTK natural language processing library for this section

In [2]:
import nltk
#nltk.download('punkt')
# nltk.download('stopwords')
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords

def extract_words(text):
    '''
    Helper function for bag_of_words()
    Inputs a text string
    Returns the list of lower-cased, punctuation-free, purely alphabetic
    tokens of the text that are not English stop words
    '''
    tokens = word_tokenize(text)
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)
    english_stops = set(stopwords.words('english'))
    # lower-case first, then strip punctuation from each token
    cleaned = [tok.lower().translate(strip_punct) for tok in tokens]
    # keep alphabetic tokens only, then drop the stop words
    return [tok for tok in cleaned
            if tok.isalpha() and tok not in english_stops]


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/qinjianxyz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/qinjianxyz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Time to input our data into the model

In [37]:
# Load the saved labels and review texts; the context managers close each
# file once it has been parsed (the handles used to be left open).
with open('yelp_sentiment.json') as g:
    train_labels = json.load(g)
with open('yelp_review.json') as g:
    train_texts = json.load(g)
# Build the vocabulary and the bag-of-words count matrix, then train
dictionary = bag_of_words(train_texts)
train_features = extract_bow_feature_vectors(train_texts, dictionary)
theta, theta_0 = perceptron(train_features, train_labels, 1000)
# Order the words by their dictionary index so wordlist[k] lines up with theta[k]
wordlist = sorted(dictionary, key=dictionary.get)
sorted_word_features = most_explanatory_word(theta, wordlist)
print("Most Explanatory Word Features")
print(sorted_word_features[:30])

Most Explanatory Word Features
['civita', 'rachel', 'islands', 'opportune', 'feech', 'affairs', 'smothered', 'frostie', 'strictly', 'signage', 'etiquette', 'arts', 'square', 'logistics', 'indianpakistani', 'everywhere', 'trigger', 'prompt', 'post', 'maskless', 'fixings', 'batch', 'aound', 'wishing', 'term', 'suddenly', 'snow', 'overwhelming', 'listed', 'juicy']


**Most explanatory words:** ['civita', 'rachel', 'islands', 'opportune', 'feech', 'affairs', 'smothered', 'frostie', 'strictly', 'signage', 'etiquette', 'arts', 'square', 'logistics', 'indianpakistani', 'everywhere', 'trigger', 'prompt', 'post', 'maskless', 'fixings', 'batch', 'aound', 'wishing', 'term', 'suddenly', 'snow', 'overwhelming', 'listed', 'juicy']

## Restaurant Business Analysis

In [38]:
# Load the saved restaurant labels and review texts; the context managers
# close each file once it has been parsed (the handles used to be left open).
with open('rest_sentiment.json') as g:
    train_labels = json.load(g)
with open('rest_review.json') as g:
    train_texts = json.load(g)
# Build the vocabulary and the bag-of-words count matrix, then train
dictionary = bag_of_words(train_texts)
train_features = extract_bow_feature_vectors(train_texts, dictionary)
theta, theta_0 = perceptron(train_features, train_labels, 1000)
# Order the words by their dictionary index so wordlist[k] lines up with theta[k]
wordlist = sorted(dictionary, key=dictionary.get)
sorted_word_features = most_explanatory_word(theta, wordlist)
print("Most Explanatory Word Features")
print(sorted_word_features[:30])

Most Explanatory Word Features
['gem', 'wonderful', 'update', 'love', 'worked', 'tough', 'tacos', 'phase', 'party', 'nicely', 'next', 'mammoth', 'indoor', 'best', 'absolutely', 'visiting', 'usually', 'urban', 'tuna', 'site', 'short', 'servings', 'right', 'recommendation', 'recently', 'reasonable', 'quarantine', 'pokiland', 'patio', 'pandemic']


**Most explanatory words:** ['gem', 'wonderful', 'update', 'love', 'worked', 'tough', 'tacos', 'phase', 'party', 'nicely', 'next', 'mammoth', 'indoor', 'best', 'absolutely', 'visiting', 'usually', 'urban', 'tuna', 'site', 'short', 'servings', 'right', 'recommendation', 'recently', 'reasonable', 'quarantine', 'pokiland', 'patio', 'pandemic']

## Construction Business Analysis

In [39]:
# Load the saved construction labels and review texts; the context managers
# close each file once it has been parsed (the handles used to be left open).
with open('const_sentiment.json') as g:
    train_labels = json.load(g)
with open('const_review.json') as g:
    train_texts = json.load(g)
# Build the vocabulary and the bag-of-words count matrix, then train
dictionary = bag_of_words(train_texts)
train_features = extract_bow_feature_vectors(train_texts, dictionary)
theta, theta_0 = perceptron(train_features, train_labels, 1000)
# Order the words by their dictionary index so wordlist[k] lines up with theta[k]
wordlist = sorted(dictionary, key=dictionary.get)
sorted_word_features = most_explanatory_word(theta, wordlist)
print("Most Explanatory Word Features")
print(sorted_word_features[:30])

Most Explanatory Word Features
['wonderful', 'update', 'seating', 'quarantine', 'party', 'indoor', 'gem', 'dash', 'worked', 'wanted', 'visiting', 'usually', 'tough', 'support', 'short', 'phase', 'park', 'pandemic', 'ordering', 'love', 'even', 'enjoyed', 'easily', 'decided', 'clovis', 'awesome', 'absolutely', 'view', 'updated', 'tacos']


**Most explanatory words:** ['wonderful', 'update', 'seating', 'quarantine', 'party', 'indoor', 'gem', 'dash', 'worked', 'wanted', 'visiting', 'usually', 'tough', 'support', 'short', 'phase', 'park', 'pandemic', 'ordering', 'love', 'even', 'enjoyed', 'easily', 'decided', 'clovis', 'awesome', 'absolutely', 'view', 'updated', 'tacos']

# Sentiment Prediction with SciKit Learn Library

### import modules

In [3]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [None]:
pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git --upgrade --force-reinstall

In [4]:
# Read the review texts and their sentiment labels; the context managers
# close each file once it has been parsed (the handles used to be left open).
with open('yelp_review.json') as g:
    text = json.load(g)
with open('yelp_sentiment.json') as g:
    sentiment = json.load(g)
# One row per review: the raw text and its label
d = {'Reviews': text, 'Sentiment': sentiment}
df = pd.DataFrame(d)
df.head()

Unnamed: 0,Reviews,Sentiment
0,Balboa Park is definitely one place to visit i...,1
1,Even though it was gloomy and breezy out! It d...,1
2,World class gem of a park. I have been here b...,1
3,Drove by and saw the crazy long line at Phil's...,1
4,Awesome service we order our food and when It ...,1


In [5]:
import nltk
#nltk.download('punkt')
#nltk.download('stopwords')
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords

def get_clean(text):
    '''
    Normalises a review before it is vectorised
    Inputs a text string
    Returns one string holding the lower-cased, punctuation-free, purely
    alphabetic, non-stop-word tokens of the text, each followed by a single
    space (so a non-empty result ends with a trailing space)
    '''
    tokens = word_tokenize(text)
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)
    english_stops = set(stopwords.words('english'))
    # lower-case first, then strip punctuation from each token
    cleaned = [tok.lower().translate(strip_punct) for tok in tokens]
    # keep alphabetic tokens only, then drop the stop words
    kept = [tok for tok in cleaned
            if tok.isalpha() and tok not in english_stops]
    return "".join(tok + " " for tok in kept)

### data preprocessing

In [6]:
# Replace every raw review with its cleaned form, then preview the result
df['Reviews'] = df['Reviews'].apply(get_clean)
df.head()

Unnamed: 0,Reviews,Sentiment
0,balboa park definitely one place visit ever sa...,1
1,even though gloomy breezy nt stop us visiting ...,1
2,world class gem park time walked around park s...,1
3,drove saw crazy long line phil great sign deci...,1
4,awesome service order food came thought order ...,1


In [7]:
# TF-IDF vectorizer with max_features=3000
tfidf = TfidfVectorizer(max_features=3000)
X = df['Reviews']
y = df['Sentiment']

# Fit the vectorizer on the cleaned reviews and replace X with its output.
# NOTE(review): the vectorizer is fitted on every row here, before the
# train/test split performed in the next cell, so the held-out reviews take
# part in fitting it. Consider fitting on the training portion only and
# calling tfidf.transform on the test portion.
X = tfidf.fit_transform(X)

In [8]:
# Hold out 20% of the rows for evaluation; random_state is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 13)

In [9]:
# Train a linear SVM with default settings on the training split
clf = LinearSVC()
clf.fit(X_train, y_train)

LinearSVC()

In [10]:
# Predict the held-out split and print the classification report
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.61      0.48      0.54       608
           1       0.82      0.89      0.85      1619

    accuracy                           0.77      2227
   macro avg       0.72      0.68      0.69      2227
weighted avg       0.76      0.77      0.77      2227



In [11]:
# Try the trained classifier on a hand-written sentence
# (swap in one of the commented alternatives below to test other inputs)
x = 'The restaurant has a very nice environment'
#x = 'The waiter here is rude and slow'
#x = 'I like the vegan options a lot'
#x = 'The store requires masks and I hate that'
# Apply the same cleaning that was applied to the training reviews
x = get_clean(x)
# Reuse the already-fitted vectorizer; transform expects a list of documents
vec = tfidf.transform([x])
clf.predict(vec)

array([1])

# Location visualization with folium

In [None]:
pip install folium

In [27]:
import folium
import requests
import json
from YelpAPI import get_my_key


# Base map that the business markers are added to
m = folium.Map(location=[32.7157, -117.1611], zoom_start=12, tiles="Stamen Terrain")

API_KEY = get_my_key()
# The auth header is identical for every request, so build it once
HEADERS = {'Authorization': 'bearer %s' % API_KEY}
# Business ids come from the keys of the saved mapping; the context manager
# closes the file once it has been parsed (the handle used to be left open).
with open('yelp.json') as g:
    yelpdata = json.load(g)
id_list = list(yelpdata.keys())
for element in id_list:
    business_id = element
    ENDPOINT = 'https://api.yelp.com/v3/businesses/{}'.format(business_id)
    response = requests.get(url = ENDPOINT, headers= HEADERS)
    biz = response.json()
    if 'error' in biz:
        error = biz['error']
        if isinstance(error, dict) and error.get('code') == 'ACCESS_LIMIT_REACHED':
            # Once the access limit is hit every further request fails the
            # same way, so stop instead of silently skipping the remaining ids.
            print('Yelp access limit reached; stopping at business', business_id)
            break
        # Any other error: skip just this business
        continue
    # Skip businesses that cannot be placed or labelled on the map
    if biz['coordinates']['latitude'] is None:
        continue
    if biz['coordinates']['longitude'] is None:
        continue
    if biz['name'] is None:
        continue
    name = biz['name']
    la = biz['coordinates']['latitude']
    lon = biz['coordinates']['longitude']
    rating = biz['rating']
    # Marker colour by rating band, from dark green (highest) to red (lowest)
    if rating > 4.7:
        col = 'darkgreen'
    elif rating > 4.2:
        col = 'green'
    elif rating > 3.7:
        col = 'orange'
    elif rating > 3.2:
        col = 'lightred'
    else:
        col = 'red'
    folium.Marker(
        location=[la, lon],
        popup=name,
        icon=folium.Icon(color=col),
    ).add_to(m)

# Write the map, with whatever markers were added, to an HTML file
m.save("update.html")

In [28]:
# Show the last API response decoded by the map-building loop
print(biz)

{'error': {'code': 'ACCESS_LIMIT_REACHED', 'description': "You've reached the access limit for this client. See instructions for requesting a higher access limit at https://www.yelp.com/developers/documentation/v3/rate_limiting"}}


## restaurant data

In [None]:
import folium
import requests
import json
from YelpAPI import get_my_key


# Base map that the restaurant markers are added to
m = folium.Map(location=[32.7157, -117.1611], zoom_start=12, tiles="Stamen Terrain")

API_KEY = get_my_key()
# The auth header is identical for every request, so build it once
HEADERS = {'Authorization': 'bearer %s' % API_KEY}
# Business ids come from the keys of the saved mapping; the context manager
# closes the file once it has been parsed (the handle used to be left open).
with open('restaurant.json') as g:
    yelpdata = json.load(g)
id_list = list(yelpdata.keys())
for element in id_list:
    business_id = element
    ENDPOINT = 'https://api.yelp.com/v3/businesses/{}'.format(business_id)
    response = requests.get(url = ENDPOINT, headers= HEADERS)
    biz = response.json()
    if 'error' in biz:
        error = biz['error']
        if isinstance(error, dict) and error.get('code') == 'ACCESS_LIMIT_REACHED':
            # Once the access limit is hit every further request fails the
            # same way, so stop instead of silently skipping the remaining ids.
            print('Yelp access limit reached; stopping at business', business_id)
            break
        # Any other error: skip just this business
        continue
    # Skip businesses that cannot be placed or labelled on the map
    if biz['coordinates']['latitude'] is None:
        continue
    if biz['coordinates']['longitude'] is None:
        continue
    if biz['name'] is None:
        continue
    name = biz['name']
    la = biz['coordinates']['latitude']
    lon = biz['coordinates']['longitude']
    rating = biz['rating']
    # Marker colour by rating band, from dark green (highest) to red (lowest)
    if rating > 4.7:
        col = 'darkgreen'
    elif rating > 4.2:
        col = 'green'
    elif rating > 3.7:
        col = 'orange'
    elif rating > 3.2:
        col = 'lightred'
    else:
        col = 'red'
    folium.Marker(
        location=[la, lon],
        popup=name,
        icon=folium.Icon(color=col),
    ).add_to(m)

# Write the map, with whatever markers were added, to an HTML file
m.save("restaurant.html")