## Import Saved Data

In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer


# Saved search parameters; only the first row of the inputs file is used.
inputs = pd.read_csv('firm_data/inputs.csv', usecols=[1, 2, 3])
name = inputs.loc[0, 'company_name']
search_terms = inputs.loc[0, 'search_terms']
location = inputs.loc[0, 'location']

# Search results saved for that company (a subset of columns, chosen by position).
raw_data = pd.read_csv(
    f'firm_data/{name}_search_results.csv',
    usecols=[2, 3, 5, 6, 7, 8, 9, 12, 16],
)

# Work on a copy: keep rows whose is_closed flag is False, then drop the flag
# column and renumber the rows from 0.
data = raw_data.copy()
df = (
    data.loc[data['is_closed'].eq(False)]
    .drop(columns='is_closed')
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,alias,name,url,review_count,categories,rating,price,distance
0,tacuba-astoria,Tacuba,https://www.yelp.com/biz/tacuba-astoria?adjust...,645,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",3.5,$$,1229.351705
1,mi-espiguita-taqueria-astoria,Mi Espiguita Taqueria,https://www.yelp.com/biz/mi-espiguita-taqueria...,113,"[{'alias': 'mexican', 'title': 'Mexican'}]",4.5,$,714.30108
2,el-mero-mero-long-island-city,El Mero Mero,https://www.yelp.com/biz/el-mero-mero-long-isl...,81,"[{'alias': 'mexican', 'title': 'Mexican'}]",4.5,$$,1457.114073
3,the-calaveras-new-york,The Calaveras,https://www.yelp.com/biz/the-calaveras-new-yor...,103,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",4.5,$$,4963.939653
4,athens-grill-and-sports-bar-astoria,Athens Grill & Sports Bar,https://www.yelp.com/biz/athens-grill-and-spor...,191,"[{'alias': 'mexican', 'title': 'Mexican'}]",4.0,$,42.777637


## Clean Data
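- Collect the category titles of every business
- Convert price (`$`–`$$$$`) to a number and fill missing prices with the mean
- Build a text `tags` column from each business's category titles and price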

In [2]:
# Parse each row's stringified category list once; both steps below reuse it.
parsed_categories = df['categories'].map(ast.literal_eval)

# Every category title across all rows, in row order. Titles repeat when
# several businesses share a category (use sorted(set(tags)) for distinct ones).
tags = [
    category['title']
    for categories in parsed_categories
    for category in categories
]


# Price to numerical & fill null values with mean.
# The result is assigned back to the column rather than calling
# replace/fillna(inplace=True) on `df.price`: an in-place call on a column
# pulled out of the frame is chained assignment, which recent pandas warns
# about and which leaves `df` untouched under copy-on-write.
# Values that are not one of the four symbols (NaN, or numbers left by an
# earlier run of this cell) pass through unchanged, so re-running is safe.
price_levels = {'$': 1, '$$': 2, '$$$': 3, '$$$$': 4}
price_numeric = pd.to_numeric(df['price'].map(lambda p: price_levels.get(p, p)))
df['price'] = price_numeric.fillna(price_numeric.mean())


# Create tags column: the row's category titles followed by its price rounded
# to one decimal, separated by single spaces (e.g. "Mexican Bars 2.0").
df['tags'] = [
    ' '.join([category['title'] for category in categories]
             + [str(np.round(price, 1))])
    for categories, price in zip(parsed_categories, df['price'])
]


df.head()

Unnamed: 0,alias,name,url,review_count,categories,rating,price,distance,tags
0,tacuba-astoria,Tacuba,https://www.yelp.com/biz/tacuba-astoria?adjust...,645,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",3.5,2.0,1229.351705,Mexican Tapas Bars Latin American 2.0
1,mi-espiguita-taqueria-astoria,Mi Espiguita Taqueria,https://www.yelp.com/biz/mi-espiguita-taqueria...,113,"[{'alias': 'mexican', 'title': 'Mexican'}]",4.5,1.0,714.30108,Mexican 1.0
2,el-mero-mero-long-island-city,El Mero Mero,https://www.yelp.com/biz/el-mero-mero-long-isl...,81,"[{'alias': 'mexican', 'title': 'Mexican'}]",4.5,2.0,1457.114073,Mexican 2.0
3,the-calaveras-new-york,The Calaveras,https://www.yelp.com/biz/the-calaveras-new-yor...,103,"[{'alias': 'bars', 'title': 'Bars'}, {'alias':...",4.5,2.0,4963.939653,Bars Mexican Tapas/Small Plates 2.0
4,athens-grill-and-sports-bar-astoria,Athens Grill & Sports Bar,https://www.yelp.com/biz/athens-grill-and-spor...,191,"[{'alias': 'mexican', 'title': 'Mexican'}]",4.0,1.0,42.777637,Mexican 1.0


## K-Means Clustering

In [3]:
def get_competitors(df, name, random_state=None, max_retries=25,
                    min_size=6, max_size=10):
    """Narrow ``df`` down to the businesses clustered with ``name``.

    The ``tags`` text of every row is TF-IDF vectorised and split into two
    k-means clusters, and only the cluster containing ``name`` is kept.  That
    cluster is split again, recursively, until it holds at most ``max_size``
    rows.  When a chain of splits ends with fewer than ``min_size`` rows (or
    with rows that k-means cannot separate), the enclosing frame is
    re-clustered from a new random start, up to ``max_retries`` times overall.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``name`` column and a text ``tags`` column.  It is not
        modified; the ``group`` column is added to copies only.
    name : str
        Value of the ``name`` column identifying the company of interest.
        If several rows match, the first one decides the cluster.
    random_state : int, numpy.random.RandomState or None, default None
        Seed (or generator) shared by every k-means fit.  An int makes the
        whole run reproducible while retries still see new initialisations,
        because all fits draw from the same generator.
    max_retries : int, default 25
        Total number of re-clustering attempts allowed across all levels.
    min_size, max_size : int, default 6 and 10
        Desired size range of the returned group.

    Returns
    -------
    pandas.DataFrame
        The rows grouped with ``name``, index reset, with a ``group`` column
        holding the label from the last split.  The row count can fall below
        ``min_size`` when the very first split already yields at most
        ``max_size`` rows or when the retries run out, and can exceed
        ``max_size`` when the remaining rows cannot be separated.

    Raises
    ------
    ValueError
        If ``name`` does not occur in ``df['name']``.
    """
    if not (df['name'] == name).any():
        raise ValueError(f"{name!r} does not appear in the 'name' column")

    # A single generator is threaded through every fit so that a fixed seed
    # gives a reproducible run without making each retry repeat the last one.
    if not isinstance(random_state, np.random.RandomState):
        random_state = np.random.RandomState(random_state)

    retries_left = max_retries

    def narrow(frame):
        nonlocal retries_left

        # Two clusters need at least two rows.
        if frame.shape[0] < 2:
            return frame.assign(group=0).reset_index(drop=True)

        # Convert tags to vectors
        vectors = TfidfVectorizer(stop_words='english').fit_transform(frame['tags'])

        # Divide into two groups using k-means clustering
        model = KMeans(n_clusters=2, init='k-means++', max_iter=300, n_init=100,
                       random_state=random_state)
        model.fit(vectors)

        # Label a copy so the caller's frame is left untouched
        labelled = frame.assign(group=list(model.labels_))

        # Only one label came back: the rows cannot be separated, so hand the
        # whole frame back instead of recursing on it forever.
        if labelled['group'].nunique() < 2:
            return labelled.reset_index(drop=True)

        # Keep the records in the same group as the company of interest
        own_group = labelled.loc[labelled['name'] == name, 'group'].iloc[0]
        kept = labelled[labelled['group'] == own_group].reset_index(drop=True)

        # Keep splitting until there are max_size or fewer records
        while kept.shape[0] > max_size:
            candidate = narrow(kept)
            if min_size <= candidate.shape[0] <= max_size:
                return candidate

            # Final group too small (or could not shrink): try again from this
            # level's full frame with a new random start, while budget remains.
            if retries_left <= 0:
                return candidate
            retries_left -= 1
            kept = labelled

        return kept

    return narrow(df)

In [4]:
# Cluster around the company of interest, write the result to disk, and display it.
competitors_df = get_competitors(df=df, name=name)
competitors_df.to_csv('firm_data/competitors.csv')
competitors_df

Unnamed: 0,alias,name,url,review_count,categories,rating,price,distance,tags,group
0,las-catrinas-mexican-bar-and-eatery-astoria,Las Catrinas Mexican Bar & Eatery,https://www.yelp.com/biz/las-catrinas-mexican-...,301,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.0,2.0,642.525771,Mexican Cocktail Bars 2.0,1
1,chanos-cantina-astoria,Chano's Cantina,https://www.yelp.com/biz/chanos-cantina-astori...,165,"[{'alias': 'cocktailbars', 'title': 'Cocktail ...",4.0,2.0,1290.427488,Cocktail Bars New Mexican Cuisine 2.0,1
2,maizal-restaurant-and-tequila-bar-astoria-2,Maizal Restaurant & Tequila Bar,https://www.yelp.com/biz/maizal-restaurant-and...,295,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.0,2.0,900.451091,Mexican Cocktail Bars 2.0,1
3,corazon-de-mexico-restaurant-and-bar-long-isla...,Corazon De Mexico Restaurant & Bar,https://www.yelp.com/biz/corazon-de-mexico-res...,54,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.0,2.0,3761.460081,Mexican Cocktail Bars Venues & Event Spaces 2.0,1
4,la-chonas-astoria,La Chona's,https://www.yelp.com/biz/la-chonas-astoria?adj...,15,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",3.5,1.822222,874.273703,Mexican Bars 1.8,1
5,two-lizards-mexican-bar-and-grill-new-york,Two Lizards Mexican Bar & Grill,https://www.yelp.com/biz/two-lizards-mexican-b...,462,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",3.5,2.0,2815.273152,Mexican Bars 2.0,1
