## Import Saved Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# Saved search parameters: the first row describes the firm of interest
inputs = pd.read_csv('firm_data/inputs.csv', usecols=[1,2,3])
name, search_terms, location = (
    inputs[column][0]
    for column in ('company_name', 'search_terms', 'location')
)

# Search results saved for that firm; only the columns used below are read
raw_data = pd.read_csv(
    f'firm_data/{name}_search_results.csv',
    usecols=[3, 5, 6, 7, 8, 9, 12, 16],
)

# Work on a copy so raw_data stays untouched, and keep only the businesses
# that are not flagged as closed
data = raw_data.copy()
df = (
    data[data['is_closed'] == False]
    .drop(columns='is_closed')
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,name,url,review_count,categories,rating,price,distance
0,El Mero Mero,https://www.yelp.com/biz/el-mero-mero-long-isl...,81,"[{'alias': 'mexican', 'title': 'Mexican'}]",4.5,$$,1457.114073
1,Las Catrinas Mexican Bar & Eatery,https://www.yelp.com/biz/las-catrinas-mexican-...,301,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.0,$$,642.525771
2,Chela & Garnacha,https://www.yelp.com/biz/chela-and-garnacha-as...,364,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.5,$$,1318.326547
3,Chano's Cantina,https://www.yelp.com/biz/chanos-cantina-astori...,164,"[{'alias': 'cocktailbars', 'title': 'Cocktail ...",4.0,$$,1290.427488
4,La Sala,https://www.yelp.com/biz/la-sala-astoria?adjus...,117,"[{'alias': 'cocktailbars', 'title': 'Cocktail ...",4.5,$$,678.809168


## Clean Data
- Collect the category titles and build a text `tags` column from them
- Convert price to numerical
- Replace missing price values with the mean price

In [2]:
# Parse the stringified category lists once; both steps below reuse the result
parsed_categories = df['categories'].apply(ast.literal_eval)

# List of every category title across all businesses (duplicates are kept)
tags = [tag['title'] for cat in parsed_categories for tag in cat]


# Price to numerical & fill null values with mean.
# The column is assigned back instead of calling replace/fillna with
# inplace=True on `df.price`: that chained in-place form warns in recent
# pandas and no longer updates `df` once Copy-on-Write is enabled.
# Values that are not a '$' string (NaN, or numbers left by an earlier run of
# this cell) pass through unchanged, so the cell can safely be re-run.
price_levels = {'$': 1, '$$': 2, '$$$': 3, '$$$$': 4}
price_numeric = pd.to_numeric(
    df['price'].map(lambda p: price_levels.get(p, p))
).astype(float)
df['price'] = price_numeric.fillna(price_numeric.mean())


# Create tags column: every category title followed by a space, then the
# price rounded to one decimal (e.g. "Mexican Cocktail Bars 2.0")
title_text = parsed_categories.apply(
    lambda cat: ''.join(f"{tag['title']} " for tag in cat)
)
df['tags'] = title_text + df['price'].round(1).astype(str)


df.head()

Unnamed: 0,name,url,review_count,categories,rating,price,distance,tags
0,El Mero Mero,https://www.yelp.com/biz/el-mero-mero-long-isl...,81,"[{'alias': 'mexican', 'title': 'Mexican'}]",4.5,2.0,1457.114073,Mexican 2.0
1,Las Catrinas Mexican Bar & Eatery,https://www.yelp.com/biz/las-catrinas-mexican-...,301,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.0,2.0,642.525771,Mexican Cocktail Bars 2.0
2,Chela & Garnacha,https://www.yelp.com/biz/chela-and-garnacha-as...,364,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.5,2.0,1318.326547,Mexican Wine Bars Breakfast & Brunch 2.0
3,Chano's Cantina,https://www.yelp.com/biz/chanos-cantina-astori...,164,"[{'alias': 'cocktailbars', 'title': 'Cocktail ...",4.0,2.0,1290.427488,Cocktail Bars New Mexican Cuisine 2.0
4,La Sala,https://www.yelp.com/biz/la-sala-astoria?adjus...,117,"[{'alias': 'cocktailbars', 'title': 'Cocktail ...",4.5,2.0,678.809168,"Cocktail Bars Lounges Beer, Wine & Spirits 2.0"


## K-Means Clustering

In [3]:
def get_competitors(df, name, random_state=None, max_retries=20):
    """Narrow ``df`` down to the businesses that cluster with ``name``.

    The ``tags`` text of every row is TF-IDF vectorised and split into two
    k-means clusters; only the cluster containing ``name`` is kept. That
    cluster is split again, recursively, until it holds 10 or fewer rows.
    When a deeper split leaves fewer than 5 rows, the current level is
    clustered again from scratch (k-means starts from random centres, so a
    new fit can produce a different split).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``name`` and a ``tags`` column. A ``group`` column holding
        the cluster labels is written to it in place.
    name : str
        Value of the ``name`` column identifying the company of interest.
    random_state : int, numpy.random.RandomState or None, default None
        Seed for k-means. ``None`` leaves the fits unseeded. An integer is
        converted to a single ``RandomState`` shared by all fits, so a run is
        reproducible while retries still start from different centres.
    max_retries : int, default 20
        Total number of "group too small, cluster again" attempts allowed
        across the whole recursion. Once used up, the small group is
        returned instead of retrying forever.

    Returns
    -------
    pandas.DataFrame
        Rows in the same cluster as ``name`` (index reset, ``group`` column
        included). The result has fewer than 5 rows when the very first
        split is already that small or the retries run out, and more than
        10 rows when the remaining rows cannot be separated any further.

    Raises
    ------
    IndexError
        If no row of ``df`` has ``name`` in its ``name`` column.
    """
    if isinstance(random_state, (int, np.integer)):
        # One shared generator: re-using the bare integer for every fit would
        # make each retry reproduce exactly the same clusters.
        random_state = np.random.RandomState(random_state)

    # Mutable counter shared by every recursion level
    budget = {'retries_left': max_retries}
    return _split_to_company_cluster(df, name, random_state, budget)


def _split_to_company_cluster(df, name, rng, budget):
    """Run one two-way split of ``df`` and recurse into the cluster of ``name``."""
    is_target = df['name'] == name
    if not is_target.any():
        # Fail before the expensive fit, with a message naming the problem
        raise IndexError(f"company {name!r} not found in the 'name' column")

    if df.shape[0] < 2:
        # Two clusters need at least two rows; nothing to split
        df['group'] = 0
        return df.reset_index(drop=True)

    # Convert tags to vectors
    # NOTE(review): TfidfVectorizer's default token pattern only keeps tokens
    # of two or more alphanumeric characters, so a price suffix such as "2.0"
    # is split at the dot and dropped -- price probably has no influence on
    # the clusters. Confirm whether that is intended.
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(df.tags)

    # Divide into two groups using k-means clustering
    model = KMeans(n_clusters=2, init='k-means++', max_iter=300, n_init=100,
                   random_state=rng)
    model.fit(X)

    # Create group label column
    df['group'] = list(model.labels_)

    # Keep only the records in the same group as the company of interest
    group = df.loc[is_target, 'group'].values[0]
    new_df = df[df['group'] == group].reset_index(drop=True)

    n = new_df.shape[0]
    if n == df.shape[0]:
        # Every row landed in one cluster (e.g. identical tags): splitting
        # again cannot make progress, so stop instead of recursing forever.
        return new_df

    # Keep splitting dataframe until there are 10 or fewer records
    while n > 10:
        narrowed = _split_to_company_cluster(new_df, name, rng, budget)
        if narrowed.shape[0] == n:
            # The remaining rows cannot be separated any further
            new_df = narrowed
            break
        new_df = narrowed
        n = new_df.shape[0]

        # If final group too small, try again with new random starting point
        # (bounded by the shared retry budget)
        if n < 5 and budget['retries_left'] > 0:
            budget['retries_left'] -= 1
            new_df = df.copy()
            n = new_df.shape[0]

    return new_df

In [4]:
# Narrow the open businesses down to the group that clusters with the firm
# of interest, save that group to disk, and display it
competitors_df = get_competitors(df=df, name=name)
competitors_df.to_csv(path_or_buf='firm_data/competitors.csv')
competitors_df

Unnamed: 0,name,url,review_count,categories,rating,price,distance,tags,group
0,Chano's Cantina,https://www.yelp.com/biz/chanos-cantina-astori...,164,"[{'alias': 'cocktailbars', 'title': 'Cocktail ...",4.0,2.0,1290.427488,Cocktail Bars New Mexican Cuisine 2.0,0
1,Fresco's Cantina,https://www.yelp.com/biz/frescos-cantina-long-...,209,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.5,2.0,927.218713,Mexican Latin American New Mexican Cuisine 2.0,0
2,Lunera Modern Mexican,https://www.yelp.com/biz/lunera-modern-mexican...,112,"[{'alias': 'latin', 'title': 'Latin American'}...",4.0,2.0,986.258835,Latin American New Mexican Cuisine 2.0,0
3,Trestle,https://www.yelp.com/biz/trestle-astoria?adjus...,289,"[{'alias': 'cocktailbars', 'title': 'Cocktail ...",4.0,2.0,700.500655,Cocktail Bars American (New) 2.0,0
4,Juquila Kitchen and Bar,https://www.yelp.com/biz/juquila-kitchen-and-b...,98,"[{'alias': 'newmexican', 'title': 'New Mexican...",4.0,2.0,1829.249006,New Mexican Cuisine Tacos Cocktail Bars 2.0,0
5,The Astorian,https://www.yelp.com/biz/the-astorian-astoria?...,165,"[{'alias': 'gastropubs', 'title': 'Gastropubs'...",4.5,2.0,91.375358,Gastropubs Cocktail Bars American (New) 2.0,0
6,La Pulperia UES,https://www.yelp.com/biz/la-pulperia-ues-new-y...,723,"[{'alias': 'latin', 'title': 'Latin American'}...",4.0,2.0,2758.509541,Latin American Seafood Cocktail Bars 2.0,0
7,Cantina Taqueria & Tequila Bar,https://www.yelp.com/biz/cantina-taqueria-and-...,492,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.0,2.0,4627.750977,Mexican Latin American Cocktail Bars 2.0,0
