## Import Saved Data

In [1]:
import pandas as pd
import numpy as np
import ast
import re
from sklearn.cluster import KMeans
from nltk import word_tokenize
import requests
from bs4 import BeautifulSoup as bs

# Search parameters saved by an earlier step; only the first row is used
inputs = pd.read_csv('firm_data/inputs.csv', usecols=[1,2,3])
name, search_terms, location = (
    inputs.loc[0, column]
    for column in ('company_name', 'search_terms', 'location')
)

# Search results stored under the company's name
results_path = f'firm_data/{name}_search_results.csv'
raw_data = pd.read_csv(results_path, usecols=[2, 3, 5, 6, 7, 8, 9, 12, 16])

# Work on a copy, keep rows whose is_closed flag is False, then drop the flag
data = raw_data.copy()
still_open = data['is_closed'].eq(False)
df = data.loc[still_open].drop(columns='is_closed').reset_index(drop=True)
df.head()

Unnamed: 0,alias,name,url,review_count,categories,rating,price,distance
0,tacuba-astoria,Tacuba,https://www.yelp.com/biz/tacuba-astoria?adjust...,645,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",3.5,$$,1229.351705
1,mi-espiguita-taqueria-astoria,Mi Espiguita Taqueria,https://www.yelp.com/biz/mi-espiguita-taqueria...,113,"[{'alias': 'mexican', 'title': 'Mexican'}]",4.5,$,714.30108
2,el-mero-mero-long-island-city,El Mero Mero,https://www.yelp.com/biz/el-mero-mero-long-isl...,81,"[{'alias': 'mexican', 'title': 'Mexican'}]",4.5,$$,1457.114073
3,hoja-santa-astoria,Hoja Santa,https://www.yelp.com/biz/hoja-santa-astoria?ad...,33,"[{'alias': 'newmexican', 'title': 'New Mexican...",4.5,,751.531469
4,the-calaveras-new-york,The Calaveras,https://www.yelp.com/biz/the-calaveras-new-yor...,103,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",4.5,$$,4963.939653


## Clean Data
- Tokenize category titles (plus the price level) into a `tags` column
- Convert price to numerical
- Replace N/A values

In [2]:
# Scratch check: str.join concatenates the list items with a single space between them
' '.join(['string1', 'string2'])

'string1 string2'

In [3]:
# Parse each stringified category list once; every entry becomes a list of
# {'alias': ..., 'title': ...} dicts
parsed_categories = df['categories'].map(ast.literal_eval)

# Flat list of every category title, one entry per occurrence (repeats are kept)
tags = [tag['title'] for cat in parsed_categories for tag in cat]


# Price to numerical & fill null values with the rounded mean.
# The result is assigned back to the column instead of calling
# `df.price.<method>(..., inplace=True)`: an in-place call on a column selected
# from a frame warns in pandas 2.x and, under Copy-on-Write, leaves `df` unchanged.
# Values that are not price symbols (NaN, or numbers from a previous run of this
# cell) pass through untouched, so the cell can be re-run safely.
price_levels = {'$': 1, '$$': 2, '$$$': 3, '$$$$': 4}
price = pd.to_numeric(df['price'].map(lambda p: price_levels.get(p, p)))
df['price'] = price.fillna(np.round(price.mean(), 0))


def _tag_string(categories, price_level):
    """Return the lower-cased category titles followed by the integer price level."""
    words = ''.join(f"{tag['title']} " for tag in categories)
    # Remove punctuation
    words = re.sub(r'[^\w\s]', ' ', words)
    return words.lower() + ' ' + str(int(price_level))


# Create tags column: one token list per business
df['tags'] = pd.Series(
    [word_tokenize(_tag_string(cat, level))
     for cat, level in zip(parsed_categories, df['price'])],
    index=df.index,
    dtype=object,
)
df.head()

Unnamed: 0,alias,name,url,review_count,categories,rating,price,distance,tags
0,tacuba-astoria,Tacuba,https://www.yelp.com/biz/tacuba-astoria?adjust...,645,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",3.5,2.0,1229.351705,"[mexican, tapas, bars, latin, american, 2]"
1,mi-espiguita-taqueria-astoria,Mi Espiguita Taqueria,https://www.yelp.com/biz/mi-espiguita-taqueria...,113,"[{'alias': 'mexican', 'title': 'Mexican'}]",4.5,1.0,714.30108,"[mexican, 1]"
2,el-mero-mero-long-island-city,El Mero Mero,https://www.yelp.com/biz/el-mero-mero-long-isl...,81,"[{'alias': 'mexican', 'title': 'Mexican'}]",4.5,2.0,1457.114073,"[mexican, 2]"
3,hoja-santa-astoria,Hoja Santa,https://www.yelp.com/biz/hoja-santa-astoria?ad...,33,"[{'alias': 'newmexican', 'title': 'New Mexican...",4.5,2.0,751.531469,"[new, mexican, cuisine, mexican, 2]"
4,the-calaveras-new-york,The Calaveras,https://www.yelp.com/biz/the-calaveras-new-yor...,103,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",4.5,2.0,4963.939653,"[bars, mexican, tapas, small, plates, 2]"


## Word Vectorization with GloVe

In [4]:
# Distinct lower-cased tokens across every row's tag list
total_vocabulary = {
    token.lower()
    for token_list in df['tags']
    for token in token_list
}
print(f'There are {len(total_vocabulary)} unique tokens in the dataset.')

There are 39 unique tokens in the dataset.


In [5]:
from nltk import word_tokenize
import requests
from bs4 import BeautifulSoup as bs

def get_definition(word, timeout=10):
    """Scrape the first Merriam-Webster definition of ``word`` and tokenize it.

    Parameters
    ----------
    word : str
        Term to look up; it is URL-quoted before being placed in the request path.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10). Without a
        timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    list of str
        Tokens of the definition text, with punctuation replaced by spaces.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page does not contain the expected definition markup.
    """
    from urllib.parse import quote

    # Get webpage data for word definition
    url = f'https://www.merriam-webster.com/dictionary/{quote(word)}'
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    # Parse html: the text sits right after the <strong> marker inside the
    # first child of the first <span class="dt">
    soup = bs(r.content, 'lxml')
    definition = soup.find("span", {"class": "dt"})
    tag = definition.find() if definition is not None else None
    marker = tag.find('strong') if tag is not None else None
    sibling = marker.next_sibling if marker is not None else None
    if sibling is None:
        raise ValueError(f'No definition found for {word!r} at {url}')

    # The sibling is normally a text node; fall back to the element's text
    # when the definition is wrapped in another tag
    text = sibling if isinstance(sibling, str) else sibling.get_text()

    # Clean text & return tokenized definition
    clean_def = re.sub(r'[^\w\s]', ' ', text.strip())
    return word_tokenize(clean_def)

In [6]:
def get_vectors(vocabulary, path='glove.6B.50d.txt'):
    """Load the embedding vectors of the requested words from a GloVe text file.

    Parameters
    ----------
    vocabulary : iterable of str
        Words to look up. It is converted to a set once, so lists are as cheap
        to search as sets.
    path : str, optional
        File to read: one entry per line, the word followed by its
        whitespace-separated vector components. Defaults to 'glove.6B.50d.txt'.

    Returns
    -------
    dict
        Maps each requested word found in the file to a float32 numpy array.
        Words that do not occur in the file are simply absent.
    """
    wanted = set(vocabulary)
    found = {}
    if not wanted:
        # Nothing to look up: skip reading the file altogether
        return found

    with open(path, 'rb') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line (e.g. trailing newline at end of file)
                continue
            word = parts[0].decode('utf-8')
            if word in wanted:
                found[word] = np.array(parts[1:], dtype=np.float32)

    return found

# Vectors for every vocabulary token present in the GloVe file
glove = get_vectors(total_vocabulary)

# Tokens without a GloVe vector are described by their tokenized definition
no_vectors = {}
missing_words = [token for token in total_vocabulary if token not in glove]
for word in missing_words:
    no_vectors[word] = get_definition(word)

In [7]:
class W2vVectorizer(object):
    """Embed token lists as the mean of their word vectors.

    Parameters
    ----------
    w2v : dict
        Maps each word to its vector. The dictionary is stored by reference,
        so words added to it later are visible to ``transform``.

    Attributes
    ----------
    dimensions : int
        Length of the vectors in ``w2v`` when the object was created
        (0 for an empty dictionary).
    """

    def __init__(self, w2v):
        # Takes in a dictionary of words and vectors as input
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            # Read the width from the dictionary that was passed in, not from
            # a notebook-level variable that may hold different keys
            self.dimensions = len(next(iter(w2v.values())))

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present for estimator-style chaining."""
        return self

    def transform(self, X):
        """Return one mean vector per token list in ``X``.

        ``X`` must be an iterable of token lists. Passing a single token list
        makes every token be iterated character by character. Tokens missing
        from ``w2v`` are skipped; a list with no known token maps to a zero
        vector of length ``dimensions``.
        """
        # Get mean value of vectors from a list of words
        return np.array([
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                    or [np.zeros(self.dimensions)], axis=0)
            for words in X])

In [8]:
# Add unique words from tokenized definitions to GloVe dictionary.
# Every missing word is collected first so the GloVe file is read once in total
# rather than once per undefined term.
words_to_add = {
    word
    for definition in no_vectors.values()
    for word in definition
    if word not in glove
}

# Skip the file scan entirely when there is nothing new to look up
new_vects = get_vectors(words_to_add) if words_to_add else {}
glove.update(new_vects)

In [9]:
vectors = W2vVectorizer(glove)

# transform() takes a collection of token lists, so each definition is wrapped
# in a list and the single resulting row is used. Passing the bare token list
# would iterate over the characters of every token instead of the tokens.
# All vectors are computed before glove is updated, so the result does not
# depend on the order in which the terms are visited.
definition_vectors = {
    key: vectors.transform([tokens])[0]
    for key, tokens in no_vectors.items()
}
glove.update(definition_vectors)

print(sorted(glove))

['1', '2', '3', 'a', 'american', 'bar', 'bars', 'beer', 'breakfast', 'brunch', 'chicken', 'cocktail', 'coffee', 'cuisine', 'event', 'gastropubs', 'grocery', 'high', 'irish', 'italian', 'latin', 'lounges', 'meals', 'mex', 'mexican', 'new', 'of', 'offers', 'or', 'peruvian', 'plates', 'pub', 'pubs', 'quality', 'seafood', 'small', 'southern', 'spaces', 'spirits', 'sports', 'tacos', 'tapas', 'tavern', 'tea', 'tex', 'that', 'traditional', 'venues', 'wine', 'wings']


## K-Means Clustering

In [10]:
from sklearn.metrics import silhouette_score
    
def get_competitors(df, name, kmax=10, random_state=None):
    """Recursively cluster businesses by tag vectors and return the target's peers.

    The tag lists are embedded with ``W2vVectorizer(glove)``, the number of
    clusters is chosen by silhouette score, and the cluster containing ``name``
    is kept. While that cluster has 10 or more rows it is clustered again.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'name' column and a 'tags' column of token lists. A 'group'
        column holding the cluster labels is written onto this frame.
    name : str
        Value of the 'name' column identifying the company of interest.
    kmax : int, optional
        Largest number of clusters to try (default 10). It is capped so that
        it never exceeds what the data can support.
    random_state : int, numpy.random.RandomState or None, optional
        Seed or generator for KMeans. One generator is shared by all fits and
        recursive calls, so a seed makes the run repeatable while retries
        still start from different points. None keeps the run unseeded.

    Returns
    -------
    pandas.DataFrame
        Rows in the same cluster as ``name``, re-indexed from 0, including the
        'group' column.

    Raises
    ------
    ValueError
        If ``name`` does not occur in the 'name' column.
    """
    if not (df['name'] == name).any():
        raise ValueError(f'{name!r} not found in the "name" column')

    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    num_attempts = 0

    # Convert tags to vectors
    vectors = W2vVectorizer(glove)
    X = vectors.transform(df['tags'])

    # silhouette_score accepts 2 <= n_labels <= n_samples - 1, and identical
    # points cannot be separated into different clusters
    n_distinct = np.unique(X, axis=0).shape[0]
    k_upper = min(kmax, X.shape[0] - 1, n_distinct)
    if k_upper < 2:
        # Nothing to split: every row belongs to one group
        df['group'] = 0
        return df.reset_index(drop=True)

    # Find optimal number of clusters (a single cluster has no dissimilarity,
    # so the search starts at 2)
    sil = []
    for k in range(2, k_upper + 1):
        labels = KMeans(n_clusters=k, random_state=rng).fit(X).labels_
        sil.append(silhouette_score(X, labels, metric='euclidean'))

    n_clusters = sil.index(max(sil)) + 2

    # Divide into the chosen number of groups using k-means clustering
    model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300,
                   n_init=100, random_state=rng)
    model.fit(X)

    # Create group label column
    df['group'] = list(model.labels_)

    # Keep only records in the same group as the company of interest
    group = df.loc[df['name'] == name, 'group'].values[0]
    new_df = df[df['group'] == group].reset_index(drop=True)

    # Keep splitting dataframe until there are fewer than 10 records
    n = new_df.shape[0]
    while n >= 10:
        refined = get_competitors(new_df, name, kmax=kmax, random_state=rng)
        if refined.shape[0] >= n:
            # The group cannot be reduced any further (e.g. identical tag
            # vectors); stop instead of recursing forever
            break
        new_df = refined
        n = new_df.shape[0]

        # If final group too small, try again with new random starting point
        # 10 attempts maximum
        if n <= 5 and num_attempts < 10:
            new_df = df.copy()
            n = new_df.shape[0]
            num_attempts += 1

    return new_df

In [11]:
# Cluster the search results around the company of interest
competitors_df = get_competitors(df, name)
# Save the peer group; to_csv writes the index as the first, unnamed column by default
competitors_df.to_csv('firm_data/competitors.csv')
competitors_df

Unnamed: 0,alias,name,url,review_count,categories,rating,price,distance,tags,group
0,the-calaveras-new-york,The Calaveras,https://www.yelp.com/biz/the-calaveras-new-yor...,103,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",4.5,2.0,4963.939653,"[bars, mexican, tapas, small, plates, 2]",4
1,las-catrinas-mexican-bar-and-eatery-astoria,Las Catrinas Mexican Bar & Eatery,https://www.yelp.com/biz/las-catrinas-mexican-...,301,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.0,2.0,642.525771,"[mexican, cocktail, bars, 2]",4
2,chanos-cantina-astoria,Chano's Cantina,https://www.yelp.com/biz/chanos-cantina-astori...,165,"[{'alias': 'cocktailbars', 'title': 'Cocktail ...",4.0,2.0,1290.427488,"[cocktail, bars, new, mexican, cuisine, 2]",4
3,maizal-restaurant-and-tequila-bar-astoria-2,Maizal Restaurant & Tequila Bar,https://www.yelp.com/biz/maizal-restaurant-and...,295,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.0,2.0,900.451091,"[mexican, cocktail, bars, 2]",4
4,juquila-kitchen-and-bar-long-island-city,Juquila Kitchen and Bar,https://www.yelp.com/biz/juquila-kitchen-and-b...,98,"[{'alias': 'newmexican', 'title': 'New Mexican...",4.0,2.0,1829.249006,"[new, mexican, cuisine, tacos, cocktail, bars, 2]",4
