## Import Saved Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
# import pandas_profiling as profile

inputs = pd.read_csv('firm_data/inputs.csv', usecols=[1,2,3])

raw_data = pd.read_csv(f'firm_data/{inputs["company_name"][0]}_search_results.csv', 
                       usecols=[3, 5, 6, 7, 8, 9, 12, 16])

data = raw_data.copy()
df = data[data.is_closed==False].drop('is_closed', axis=1)
df.head()

Unnamed: 0,name,url,review_count,categories,rating,price,distance
0,Las Catrinas Mexican Bar & Eatery,https://www.yelp.com/biz/las-catrinas-mexican-...,301,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.0,$$,642.525771
1,De Mole Astoria,https://www.yelp.com/biz/de-mole-astoria-astor...,362,"[{'alias': 'mexican', 'title': 'Mexican'}]",4.0,$$,918.092772
2,Hoja Santa,https://www.yelp.com/biz/hoja-santa-astoria?ad...,33,"[{'alias': 'newmexican', 'title': 'New Mexican...",4.5,,751.531469
3,Chela & Garnacha,https://www.yelp.com/biz/chela-and-garnacha-as...,364,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.5,$$,1318.326547
4,Fresco's Cantina,https://www.yelp.com/biz/frescos-cantina-long-...,209,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.5,$$,927.218713


## Clean Data
- Create dummies for categories
- Convert price to numerical
- Replace N/A values

In [2]:
# Get list of all unique category tags
tags=[]
for cat in df['categories']:
    cat = ast.literal_eval(cat)
    for tag in cat:
        tags.append(tag['title'])

        
# Price to numerical & fill null values with mean
df.price.replace(to_replace = ['$', '$$', '$$$', '$$$$'], 
                 value = [1, 2, 3, 4], 
                 inplace=True)

df.price.fillna(df.price.mean(), inplace=True)

        
# Create tags column
df['tags'] = ''
for ix in df.index:
    cat = ast.literal_eval(df['categories'][ix])
    words = ''
    for tag in cat:
        words += f"{tag['title']} "
    df.loc[ix, 'tags'] = words + str(np.round(df['price'][ix], 1))


df.head()
# profile.ProfileReport(df)

Unnamed: 0,name,url,review_count,categories,rating,price,distance,tags
0,Las Catrinas Mexican Bar & Eatery,https://www.yelp.com/biz/las-catrinas-mexican-...,301,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.0,2.0,642.525771,Mexican Cocktail Bars 2.0
1,De Mole Astoria,https://www.yelp.com/biz/de-mole-astoria-astor...,362,"[{'alias': 'mexican', 'title': 'Mexican'}]",4.0,2.0,918.092772,Mexican 2.0
2,Hoja Santa,https://www.yelp.com/biz/hoja-santa-astoria?ad...,33,"[{'alias': 'newmexican', 'title': 'New Mexican...",4.5,1.744186,751.531469,New Mexican Cuisine Mexican 1.7
3,Chela & Garnacha,https://www.yelp.com/biz/chela-and-garnacha-as...,364,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.5,2.0,1318.326547,Mexican Wine Bars Breakfast & Brunch 2.0
4,Fresco's Cantina,https://www.yelp.com/biz/frescos-cantina-long-...,209,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.5,2.0,927.218713,Mexican Latin American New Mexican Cuisine 2.0


## K-Mean Clustering

In [3]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df.tags)

In [4]:
true_k = 5
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)
df['group'] = list(model.labels_)
df.head()

Unnamed: 0,name,url,review_count,categories,rating,price,distance,tags,group
0,Las Catrinas Mexican Bar & Eatery,https://www.yelp.com/biz/las-catrinas-mexican-...,301,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.0,2.0,642.525771,Mexican Cocktail Bars 2.0,3
1,De Mole Astoria,https://www.yelp.com/biz/de-mole-astoria-astor...,362,"[{'alias': 'mexican', 'title': 'Mexican'}]",4.0,2.0,918.092772,Mexican 2.0,1
2,Hoja Santa,https://www.yelp.com/biz/hoja-santa-astoria?ad...,33,"[{'alias': 'newmexican', 'title': 'New Mexican...",4.5,1.744186,751.531469,New Mexican Cuisine Mexican 1.7,4
3,Chela & Garnacha,https://www.yelp.com/biz/chela-and-garnacha-as...,364,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.5,2.0,1318.326547,Mexican Wine Bars Breakfast & Brunch 2.0,2
4,Fresco's Cantina,https://www.yelp.com/biz/frescos-cantina-long-...,209,"[{'alias': 'mexican', 'title': 'Mexican'}, {'a...",4.5,2.0,927.218713,Mexican Latin American New Mexican Cuisine 2.0,4


In [5]:
df['group'].value_counts()

1    21
0    13
3     8
4     5
2     3
Name: group, dtype: int64

In [6]:
for i in range(5):
    print(f'Group {i}:')
    for name in df['name'][df.group==i].sort_values():
        print(f' {name}')
    print('\n')

Group 0:
 Acento Latin Kitchen and Bar
 Casa Rubio Cafe
 East Coast Street Tacos
 Emilia's Pizza & Authentic Mexican Food
 La Cabana
 La Chona's
 Las Margaritas Astoria Restaurant
 Los Amigos Mexican Restaurant
 Mezquite Restaurant
 Mojave
 Tacuba
 Toloache 82nd
 Twisted Mexican Grill


Group 1:
 Athens Grill & Sports Bar
 Casa Enrique
 Cemitas El Tigre
 Chihuahua
 De Mole
 De Mole Astoria
 El Buen Gusto Restaurant
 El Mariachi Restaurant
 El Mero Mero
 El Rey Restaurant
 Hidalgo Mexican Food
 Los Portales
 Manjares Mexico
 Mi Espiguita Taqueria
 Noche De Margaritas
 Orale Mex Food
 Oxomoco
 St James Deli
 Tacos Mexico
 Taqueria Coatzingo
 Taqueria Santa Fe


Group 2:
 Chela & Garnacha
 Mexicue
 Swick 2


Group 3:
 Calexico - Upper East Side
 Conmigo
 Corazon De Mexico Restaurant & Bar
 Homemade Taqueria
 Juquila Kitchen and Bar
 Las Catrinas Mexican Bar & Eatery
 Maizal Restaurant & Tequila Bar
 Orale Tacos


Group 4:
 Chano's Cantina
 Fresco's Cantina
 Hoja Santa
 Lunera Modern Mexic

In [7]:
model.predict(vectorizer.transform([inputs["search_terms"][0]]))

array([0], dtype=int32)