## Exploratory analysis of Attributes & Categories
In this document we look at the various attributes and categories of the buisnesses in the yelp dataset.

In [2]:
import json
import itertools as it
import pandas as pd

In [3]:
businesses = sc.textFile('../data/raw/yelp_academic_dataset_business.json') \
    .map(lambda row: json.loads(row)) \
    .map(lambda row: (row['business_id'], row)) \
    .cache()

In [4]:
businesses.count()

188593

In [5]:
categories = businesses.filter(lambda row: row[1]['categories'] is not None) \
    .flatMap(lambda row: zip((x.strip() for x in row[1]['categories'].split(',')), it.repeat(row[0]))) \
    .cache()

In [6]:
category_counts = pd.Series(categories.countByKey())

In [7]:
category_counts.sort_values(ascending=False).head(n=20)

Restaurants                  57173
Shopping                     30231
Food                         27118
Beauty & Spas                18967
Home Services                18634
Health & Medical             16157
Local Services               12906
Automotive                   12656
Nightlife                    12438
Bars                         10853
Event Planning & Services     9774
Active Life                   9119
Fashion                       7406
Coffee & Tea                  6936
Sandwiches                    6912
Hair Salons                   6825
Fast Food                     6812
American (Traditional)        6659
Pizza                         6603
Home & Garden                 6020
dtype: int64

In [8]:
category_cooccurence = businesses.filter(lambda row: row[1]['categories'] is not None) \
    .map(lambda row: [x.strip() for x in row[1]['categories'].split(',')]) \
    .filter(lambda row: 'Restaurants' in row) \
    .flatMap(lambda row: [(x, 1) for i in range(2, 5) for x in it.combinations(row, i)]) \
    .countByKey()

pd.Series(category_cooccurence).sort_values(ascending=False).head(50)

Restaurants             Food                      5860
Food                    Restaurants               5807
Nightlife               Restaurants               4014
Restaurants             Nightlife                 3934
                        Bars                      3922
Nightlife               Bars                      3885
Bars                    Nightlife                 3761
                        Restaurants               3724
Fast Food               Restaurants               3479
Restaurants             Sandwiches                3477
Sandwiches              Restaurants               3435
Restaurants             American (Traditional)    3392
                        Fast Food                 3333
                        Pizza                     3311
Pizza                   Restaurants               3292
American (Traditional)  Restaurants               3267
Restaurants             Burgers                   2655
                        Breakfast & Brunch        2602
Burgers   

In [9]:
non_restaurants = pd.Series(businesses.filter(lambda row: row[1]['categories'] is not None) \
    .map(lambda row: [x.strip() for x in row[1]['categories'].split(',')]) \
    .filter(lambda row: 'Restaurants' not in row) \
    .flatMap(lambda row: zip(row, it.repeat(1)))
    .countByKey())

non_restaurants.sort_values(ascending=False).head(n=50)

Shopping                     29786
Beauty & Spas                18897
Home Services                18584
Health & Medical             16109
Food                         15451
Local Services               12838
Automotive                   12586
Active Life                   8885
Event Planning & Services     7527
Fashion                       7360
Hair Salons                   6801
Home & Garden                 5954
Auto Repair                   5868
Professional Services         5589
Hotels & Travel               5500
Doctors                       5444
Real Estate                   5278
Arts & Entertainment          4978
Nail Salons                   4828
Fitness & Instruction         4538
Nightlife                     4490
Pets                          3982
Coffee & Tea                  3828
Hair Removal                  3759
Dentists                      3434
Skin Care                     3226
Bars                          3207
Education                     2986
Financial Services  

In [10]:
non_restaurants_and_food_and_bars = pd.Series(businesses.filter(lambda row: row[1]['categories'] is not None) \
    .map(lambda row: [x.strip() for x in row[1]['categories'].split(',')]) \
    .filter(lambda row: 'Restaurants' not in row and 'Food' not in row and 'Bars' not in row) \
    .flatMap(lambda row: zip(row, it.repeat(1)))
    .countByKey())

non_restaurants_and_food_and_bars.sort_values(ascending=False).head(n=50)

Shopping                     27496
Home Services                18524
Beauty & Spas                18488
Health & Medical             15880
Local Services               12714
Automotive                   11776
Active Life                   8721
Fashion                       7011
Event Planning & Services     6845
Hair Salons                   6795
Auto Repair                   5861
Home & Garden                 5819
Professional Services         5537
Doctors                       5428
Hotels & Travel               5333
Real Estate                   5268
Nail Salons                   4826
Fitness & Instruction         4490
Arts & Entertainment          4266
Pets                          3949
Hair Removal                  3754
Dentists                      3434
Skin Care                     3211
Financial Services            2921
Education                     2911
Contractors                   2839
Women's Clothing              2790
Pet Services                  2740
General Dentistry   

In [11]:
selected_businesses_count = categories.filter(lambda row: row[0] in {'Restaurants', 'Food', 'Bar'}) \
    .map(lambda row: row[1]) \
    .distinct().count()

In [12]:
selected_businesses_count

72624

In [15]:
attributes = businesses.filter(lambda row: row[1]['attributes'] is not None) \
    .flatMap(lambda row: row[1]['attributes'].keys()) \
    .distinct().collect()

In [16]:
attributes

['RestaurantsGoodForGroups',
 'RestaurantsReservations',
 'RestaurantsTakeOut',
 'Caters',
 'BYOBCorkage',
 'BusinessAcceptsBitcoin',
 'Open24Hours',
 'DietaryRestrictions',
 'OutdoorSeating',
 'RestaurantsDelivery',
 'DriveThru',
 'Ambience',
 'BYOB',
 'Music',
 'Smoking',
 'RestaurantsCounterService',
 'BusinessAcceptsCreditCards',
 'GoodForKids',
 'HasTV',
 'DogsAllowed',
 'RestaurantsTableService',
 'AcceptsInsurance',
 'BusinessParking',
 'RestaurantsAttire',
 'RestaurantsPriceRange2',
 'GoodForMeal',
 'WheelchairAccessible',
 'BestNights',
 'CoatCheck',
 'Corkage',
 'GoodForDancing',
 'HappyHour',
 'HairSpecializesIn',
 'AgesAllowed',
 'BikeParking',
 'NoiseLevel',
 'Alcohol',
 'WiFi',
 'ByAppointmentOnly']