# Attribute Co-occurrence Matrix

In [16]:
import ast
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Function to be added into helper.py:

In [26]:
def _parse_attribute_value(raw):
    '''
    raw: a single value taken from an "attributes" dictionary

    Returns the Python object written in raw (for example 'True' -> True, "u'free'" -> 'free').
    Values that are not strings, and strings that are not valid Python literals,
    are returned unchanged instead of raising.
    '''
    if not isinstance(raw, str):
        return raw
    try:
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        return raw


def unroll_attributes(data, dict_attrs=('GoodForMeal', 'BusinessParking', 'Ambience')):
    '''
    data: expected to have a column "attributes" that is a dictionary (Yelp business dataset expected)
    dict_attrs: names of the attributes whose parsed value is itself a dictionary;
                each one is unrolled into columns named "<attribute>_<key>".
                Names that never occur in data are skipped.

    Returns a new dataframe (data itself is not modified) without the rows whose
    "attributes" value is missing, where the attributes column is unrolled:
    one column per attribute name, plus one column per key of every dictionary attribute.
    '''
    df = data.dropna(subset=['attributes']).copy()
    full_attr = list(df['attributes'])

    # get all attribute names, in first-seen order so the column order is reproducible
    attr_names = list(dict.fromkeys(k for d in full_attr for k in d))

    # create new column for each attribute (unique_attr could be 'Alcohol');
    # rows that do not carry the attribute get None
    for unique_attr in attr_names:
        df[unique_attr] = [_parse_attribute_value(d.get(unique_attr)) for d in full_attr]

    # unroll dictionary attributes into columns
    for dict_attr in dict_attrs:
        if dict_attr not in df.columns:
            continue
        nested_values = list(df[dict_attr])
        # Use the keys of every row that holds a dictionary. Reading them from
        # df[dict_attr][0] is a label lookup: it fails when the row labelled 0
        # was dropped, and it misses keys that only appear in other rows.
        new_col_names = list(dict.fromkeys(
            k for value in nested_values if isinstance(value, dict) for k in value
        ))
        for unique_attr in new_col_names:
            df[dict_attr + '_' + unique_attr] = [
                value.get(unique_attr) if isinstance(value, dict) else None
                for value in nested_values
            ]

    return df

## Use `pd.crosstab`

Example below:

In [2]:
# Toy frame for the crosstab demo: 'a' holds ten distinct values,
# 'b' repeats a handful of values.
df = pd.DataFrame(
    {
        'a': list(range(1, 11)),
        'b': [11, 22, 11, 22, 33, 11, 22, 44, 11, 22],
    }
)
df

Unnamed: 0,a,b
0,1,11
1,2,22
2,3,11
3,4,22
4,5,33
5,6,11
6,7,22
7,8,44
8,9,11
9,10,22


In [3]:
# Rows are the values of 'a', columns are the values of 'b';
# each cell counts how often the pair occurs together.
pd.crosstab(index=df['a'], columns=df['b'])

b,11,22,33,44
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,0,0,0
2,0,1,0,0
3,1,0,0,0
4,0,1,0,0
5,0,0,1,0
6,1,0,0,0
7,0,1,0,0
8,0,0,0,1
9,1,0,0,0
10,0,1,0,0


## Read in data

In [19]:
# SPECIFY PATH (depends on where you saved the file!)
# Set the YELP_BUSINESS_JSON environment variable to point at your copy; when it is
# unset, the file is looked up under ~/Downloads/yelp_dataset/ of the current user.
path = os.environ.get(
    'YELP_BUSINESS_JSON',
    os.path.join(os.path.expanduser('~'), 'Downloads', 'yelp_dataset',
                 'yelp_academic_dataset_business.json'),
)

In [20]:
# Accumulates the reduced chunks; they are concatenated once after the file is read.
b_pandas = []

# Column types requested from read_json.
# The builtin `str` replaces `np.str`, an alias that was removed in NumPy 1.24.
# NOTE(review): `dict` and `np.ndarray` are not dtypes pandas can cast a column to;
# presumably read_json leaves those two columns as parsed. Confirm before relying on it.
r_dtypes = {"business_id": str,
            "name": str,
            "address": str,
            "city": str,
            "state": str,
            "postal_code": str,  # was "postal code", which matches no column
            "latitude": np.float16,
            "longitude": np.float16,
            "stars": np.float16,
            "review_count": np.int32,
            "is_open": np.int32,
            "attributes": dict,
            "categories": np.ndarray
           }

# The file holds one JSON record per line, so it is read 1000 rows at a time.
# The location and address columns are dropped from each chunk before it is kept.
with open(path, "r") as f:
    reader = pd.read_json(f, orient="records", lines=True,
                          dtype=r_dtypes, chunksize=1000)

    for chunk in reader:
        reduced_chunk = chunk.drop(columns=['latitude', 'longitude', 'address', 'city', 'state', 'postal_code'])
        b_pandas.append(reduced_chunk)

data = pd.concat(b_pandas, ignore_index=True)
data.shape

(160585, 8)

Keep only the restaurants, and drop the rows that have no attributes:

In [6]:
# A business counts as a restaurant when its category string mentions 'Restaurants';
# a missing category string counts as "not a restaurant" (na=False).
# Only the id, review count and attributes are kept, and rows without attributes are dropped.
restaurants = (
    data.loc[
        data.categories.str.contains('Restaurants', na=False),
        ['business_id', 'review_count', 'attributes'],
    ]
    .dropna(subset=['attributes'])
)
restaurants.head()

Unnamed: 0,business_id,review_count,attributes
0,6iYb2HFDywm3zjuRg0shjw,86,"{'RestaurantsTableService': 'True', 'WiFi': 'u..."
1,tCbdrRPZA0oiIYSmHG3J0w,126,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt..."
5,D4JtQNTI4X3KcbzacDJsMw,169,"{'GoodForKids': 'True', 'Alcohol': 'u'none'', ..."
12,HPA_qyMEddpAEtFof02ixg,39,"{'RestaurantsGoodForGroups': 'True', 'HasTV': ..."
13,ufCxltuh56FF4-ZFZ6cVhg,135,"{'BusinessParking': '{'garage': False, 'street..."


## Create a column for each attribute

First get the attribute names.

In [7]:
# Every key that appears in at least one restaurant's attribute dictionary.
attr_names = {key for attrs in restaurants.attributes for key in attrs.keys()}

Then create the new columns, one for each of the 39 large attribute categories.

We can use `ast.literal_eval(x)` to change strings into their corresponding data types.

In [8]:
# One new column per attribute name (for example 'Alcohol').
# A restaurant that does not list the attribute gets None; every other value is the
# attribute's string turned into the Python object it spells out.
num_rows = restaurants.shape[0]
full_attr = restaurants.attributes
for unique_attr in attr_names:
    raw_values = [attr_dict.get(unique_attr) for attr_dict in full_attr]
    restaurants[unique_attr] = [
        None if raw is None else ast.literal_eval(raw)
        for raw in raw_values
    ]

In [9]:
# Preview the frame now that every attribute name has its own column.
restaurants.head()

Unnamed: 0,business_id,review_count,attributes,RestaurantsCounterService,OutdoorSeating,GoodForDancing,DietaryRestrictions,Music,NoiseLevel,DogsAllowed,...,Corkage,BusinessParking,ByAppointmentOnly,CoatCheck,WiFi,AgesAllowed,RestaurantsReservations,BestNights,GoodForKids,RestaurantsPriceRange2
0,6iYb2HFDywm3zjuRg0shjw,86,"{'RestaurantsTableService': 'True', 'WiFi': 'u...",,True,,,,average,False,...,,"{'garage': False, 'street': True, 'validated':...",,,free,,False,,,2.0
1,tCbdrRPZA0oiIYSmHG3J0w,126,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...",,False,,,,average,,...,,"{'garage': True, 'street': False, 'validated':...",False,,free,,False,,True,2.0
5,D4JtQNTI4X3KcbzacDJsMw,169,"{'GoodForKids': 'True', 'Alcohol': 'u'none'', ...",,False,,,,average,False,...,,"{'garage': False, 'street': True, 'validated':...",,,no,,True,,True,2.0
12,HPA_qyMEddpAEtFof02ixg,39,"{'RestaurantsGoodForGroups': 'True', 'HasTV': ...",,True,,,,average,True,...,,"{'garage': False, 'street': False, 'validated'...",,,free,,False,,True,2.0
13,ufCxltuh56FF4-ZFZ6cVhg,135,"{'BusinessParking': '{'garage': False, 'street...",,,,,,quiet,False,...,,"{'garage': False, 'street': False, 'validated'...",,,,,,,,1.0


## Unroll dictionary attributes into columns

Unroll `GoodForMeal`, `BusinessParking`, and `Ambience` into their own columns titled `attr_subAttr`. 

For example, `BusinessParking_garage` would be a new column name. 

In [10]:
for dict_attr in ['GoodForMeal', 'BusinessParking', 'Ambience']:
    nested_values = list(restaurants[dict_attr])
    # Take the sub-attribute names from every row that holds a dictionary, in
    # first-seen order. restaurants[dict_attr][0] is a label lookup: it only works
    # while a row labelled 0 exists and holds a dictionary, and it misses keys that
    # appear only in other rows.
    new_col_names = list(dict.fromkeys(
        key for value in nested_values if isinstance(value, dict) for key in value
    ))
    for unique_attr in new_col_names:
        # Rows without a dictionary for this attribute get None.
        restaurants[dict_attr + '_' + unique_attr] = [
            value.get(unique_attr) if isinstance(value, dict) else None
            for value in nested_values
        ]

## Finally, we get our co-occurrence matrix for any two attributes! Try replacing `attr_1` and `attr_2`.

In [28]:
# attr_1 supplies the rows of the table, attr_2 the columns.
attr_1, attr_2 = 'WiFi', 'HasTV'
pd.crosstab(index=restaurants[attr_1], columns=restaurants[attr_2])

HasTV,0.0,1.0
WiFi,Unnamed: 1_level_1,Unnamed: 2_level_1
free,5532,14188
no,5962,10138
paid,57,190


In [12]:
# pick from the list of columns below
# (the slice skips the first three columns: business_id, review_count, attributes)
print(restaurants.columns[3:])

Index(['RestaurantsCounterService', 'OutdoorSeating', 'GoodForDancing',
       'DietaryRestrictions', 'Music', 'NoiseLevel', 'DogsAllowed', 'Caters',
       'WheelchairAccessible', 'HappyHour', 'RestaurantsGoodForGroups',
       'Alcohol', 'BYOB', 'HairSpecializesIn', 'Open24Hours',
       'RestaurantsDelivery', 'AcceptsInsurance', 'BikeParking',
       'BusinessAcceptsCreditCards', 'DriveThru', 'Smoking',
       'RestaurantsTableService', 'RestaurantsTakeOut',
       'BusinessAcceptsBitcoin', 'Ambience', 'HasTV', 'RestaurantsAttire',
       'GoodForMeal', 'BYOBCorkage', 'Corkage', 'BusinessParking',
       'ByAppointmentOnly', 'CoatCheck', 'WiFi', 'AgesAllowed',
       'RestaurantsReservations', 'BestNights', 'GoodForKids',
       'RestaurantsPriceRange2', 'GoodForMeal_dessert',
       'GoodForMeal_latenight', 'GoodForMeal_lunch', 'GoodForMeal_dinner',
       'GoodForMeal_brunch', 'GoodForMeal_breakfast', 'BusinessParking_garage',
       'BusinessParking_street', 'BusinessParking_vali

## Important: we need to export this work into a single function in helper.py

Alternatively, we can specify `r_dtypes` in the "reading from json" step so that the columns come back with the correct values. A plain `read_json` call that does not specify the data types turns the boolean values into ones and zeros.


Save to .json:

In [13]:
# Write one JSON record per line; the relative file name resolves against the
# current working directory.
restaurants.to_json('restaurant_attr_unrolled.json', orient='records', lines=True)

Load from .json:

In [14]:
# Rebinds `restaurants` to the frame parsed back from disk. No data types are passed
# here, so the column types are whatever read_json picks.
restaurants = pd.read_json('restaurant_attr_unrolled.json', orient="records", lines=True)

## Notice that the Ambience columns were changed to 1s and 0s instead of True and False

In [15]:
# Preview the reloaded frame to compare its values with the ones shown before saving.
restaurants.head()

Unnamed: 0,business_id,review_count,attributes,RestaurantsCounterService,OutdoorSeating,GoodForDancing,DietaryRestrictions,Music,NoiseLevel,DogsAllowed,...,BusinessParking_valet,Ambience_touristy,Ambience_hipster,Ambience_romantic,Ambience_divey,Ambience_intimate,Ambience_trendy,Ambience_upscale,Ambience_classy,Ambience_casual
0,6iYb2HFDywm3zjuRg0shjw,86,"{'RestaurantsTableService': 'True', 'WiFi': 'u...",,1.0,,,,average,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,tCbdrRPZA0oiIYSmHG3J0w,126,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...",,0.0,,,,average,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,D4JtQNTI4X3KcbzacDJsMw,169,"{'GoodForKids': 'True', 'Alcohol': 'u'none'', ...",,0.0,,,,average,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,HPA_qyMEddpAEtFof02ixg,39,"{'RestaurantsGoodForGroups': 'True', 'HasTV': ...",,1.0,,,,average,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,ufCxltuh56FF4-ZFZ6cVhg,135,"{'BusinessParking': '{'garage': False, 'street...",,,,,,quiet,0.0,...,0.0,,,,,,,,,
