In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
import re

In [2]:
# Prep data for regression: keep the rows flagged as Mexican, take their star
# rating as the target, and remove the columns that are not used as features.
business = pd.read_csv('data/az_business.csv')
is_mexican = business['mexican'] == True
mexican = business.loc[is_mexican]
labels = mexican['stars']

# Columns removed before modelling (all dropped in a single pass).
non_feature_cols = [
    'stars', 'star_category', 'food', 'mexican', 'business_id',
    'city', 'categories',
]
data = mexican.drop(non_feature_cols, axis=1)

# Drop time since not good format.
# NOTE: despite its name, `drop_cols` is a keep-mask -- True for every column
# whose name does NOT start with 'hour'.
drop_cols = [not str(col).startswith('hour') for col in data.columns]
data = data.loc[:, drop_cols]

In [3]:
# Columns kept for the regression: the name, a few numeric fields, and the
# 'attributes.*' columns that are expanded into indicator columns below.
cols = ['name', 'attributes.Price Range','review_count',
        'latitude', 'longitude','attributes.Accepts Credit Cards',
        'attributes.Take-out', 'attributes.Delivery',
        'attributes.Wheelchair Accessible', 'attributes.Good For.lunch',
       'attributes.Good For.dinner', 'attributes.Good For.breakfast',
       'attributes.Ambience.intimate', 'attributes.Takes Reservations']
# .copy() so the column assignments below write to an independent frame
# instead of a slice of the previous one (avoids SettingWithCopyWarning).
data = data[cols].copy()

# Make One-Hot Vectors
# Each 'attributes.X' column is replaced by 0/1 indicator columns:
#   * 'Price Range' -> 'Price 1' .. 'Price 4'
#   * anything else -> 'not X' and 'X'
# Indicators are built with explicit comparisons rather than by indexing into
# pd.get_dummies output, so a category that never occurs in the data yields an
# all-zero column instead of a KeyError. Rows with a missing value get 0 in
# every indicator for that attribute.
for att in cols:
    # Literal prefix test; the former regex 'att*' matched any name
    # beginning with "at".
    if not att.startswith('attributes.'):
        continue
    title = att.partition('.')[-1]
    if title == 'Price Range':
        for level in (1, 2, 3, 4):
            data['Price %d' % level] = data[att].eq(level).astype(int)
    else:
        data['not ' + title] = data[att].eq(False).astype(int)
        data[title] = data[att].eq(True).astype(int)
    data = data.drop(att, axis=1)

In [4]:
# Preview the first five rows of the encoded feature frame.
data.head(n=5)

Unnamed: 0,name,review_count,latitude,longitude,Price 1,Price 2,Price 3,Price 4,not Accepts Credit Cards,Accepts Credit Cards,...,not Good For.lunch,Good For.lunch,not Good For.dinner,Good For.dinner,not Good For.breakfast,Good For.breakfast,not Ambience.intimate,Ambience.intimate,not Takes Reservations,Takes Reservations
19,Canyon Cafe,280,33.452154,-112.06862,0,1,0,0,0,1,...,1,0,0,1,1,0,1,0,0,1
29,Taco Bell,12,33.46544,-112.06909,1,0,0,0,0,1,...,0,1,1,0,1,0,1,0,1,0
41,Comedor Guadalajara,325,33.429424,-112.073929,0,1,0,0,0,1,...,0,1,1,0,1,0,1,0,1,0
44,Chico's Tacos,98,33.465712,-112.065729,1,0,0,0,0,1,...,0,1,1,0,1,0,1,0,1,0
51,Matador Restaurant,57,33.449309,-112.07194,0,1,0,0,0,1,...,0,1,0,1,1,0,1,0,0,1


In [5]:
# Kmeans on lat/long
# Group the restaurants into 15 geographic clusters from their coordinates.
# random_state is fixed so the cluster ids (and the one-hot location columns
# derived from them) are the same on every Restart & Run All; n_init is set
# explicitly to the long-standing default of 10 restarts so results do not
# shift with the scikit-learn version.
kmean = KMeans(n_clusters=15, n_init=10, random_state=0)
kmean.fit(data[['latitude', 'longitude']])
centers = kmean.cluster_centers_

In [6]:
# one hot location
# Append one 0/1 indicator column per k-means cluster, named 1..n_clusters.
col_names = data.columns.tolist()
n_clusters = kmean.n_clusters

# get_dummies labels its columns with the 0-based cluster ids; reindex
# guarantees a column for every cluster even if one ended up empty.
location_dummies = (
    pd.get_dummies(kmean.labels_)
    .reindex(columns=range(n_clusters), fill_value=0)
    .astype(int)
)
location_dummies.columns = list(range(1, n_clusters + 1))

# Concatenate positionally: the feature frame still carries the row labels of
# the original business table, so reset them to 0..n-1 to line up with the
# dummies. Unlike stacking raw arrays, this keeps each column's dtype.
data = pd.concat([data.reset_index(drop=True), location_dummies], axis=1)

In [9]:
# Write features and targets to CSV without the index column.
data.to_csv('data/data.csv', index=False)
# header=True is spelled out because older pandas releases wrote a Series
# without a header row by default; a default pd.read_csv() would then consume
# the first label as the column name and return one row too few.
labels.to_csv('data/labels.csv', index=False, header=True)

In [16]:
# Sanity check: shape of the labels file as read back from disk.
# NOTE(review): read_csv treats the first line as a header row -- confirm that
# labels.csv was written with one, otherwise the first label is lost here.
pd.read_csv('data/labels.csv').shape

(1429, 1)