In [2]:
import pyarrow as pa
import json
import numpy as np

In [3]:
with open("../../yelp-data/response.json") as file:
    file_str = file.read()

In [4]:
type(file_str)

str

In [5]:
res_json = json.loads(file_str)

In [6]:
type(res_json)

dict

In [7]:
res_json.keys()

dict_keys(['businesses', 'total', 'region'])

In [8]:
res_json['region']

{'center': {'longitude': -73.99429321289062, 'latitude': 40.70544486444615}}

In [9]:
business_list = res_json["businesses"]

In [10]:
len(business_list)

20

In [11]:
business_list[0].keys()

dict_keys(['id', 'alias', 'name', 'image_url', 'is_closed', 'url', 'review_count', 'categories', 'rating', 'coordinates', 'transactions', 'price', 'location', 'phone', 'display_phone', 'distance'])

In [12]:
business_list[0]

{'id': 'xAvwjM0F5LN9g3yFsk9e0w',
 'alias': 'starbucks-brooklyn-78',
 'name': 'Starbucks',
 'image_url': 'https://s3-media2.fl.yelpcdn.com/bphoto/UnOuT7MMfPf5FEXEmzrL4w/o.jpg',
 'is_closed': True,
 'url': 'https://www.yelp.com/biz/starbucks-brooklyn-78?adjust_creative=2FlPnf5brM1FB2UTY6vGnQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=2FlPnf5brM1FB2UTY6vGnQ',
 'review_count': 10,
 'categories': [{'alias': 'coffee', 'title': 'Coffee & Tea'}],
 'rating': 3.5,
 'coordinates': {'latitude': 40.702818, 'longitude': -73.993666},
 'transactions': ['delivery'],
 'price': '$$',
 'location': {'address1': '11 Old Fulton St',
  'address2': '',
  'address3': None,
  'city': 'Brooklyn',
  'zip_code': '11201',
  'country': 'US',
  'state': 'NY',
  'display_address': ['11 Old Fulton St', 'Brooklyn, NY 11201']},
 'phone': '+19176857717',
 'display_phone': '(917) 685-7717',
 'distance': 296.84014710793184}

# Build a struct array

## schemas

In [13]:
business_type = pa.struct(
    [
        ('id', pa.utf8()),
        ('alias', pa.utf8()), 
        ('name', pa.utf8()), 
        ('image_url', pa.utf8()) ,
        ('is_closed',  pa.bool_()),
        ('url' , pa.utf8()),
        ('review_count', pa.int32()) ,
        ('categories', pa.list_(pa.struct(
            [
                ("alias",pa.utf8()),
                ("title",pa.utf8())
            ]
        ))),
        ('rating', pa.int8()), # losing precision
        ('coordinates', pa.struct(
            [
                ('latitude', pa.float32()),
                ('longitude', pa.float32())
            ]
        )),
        ('transactions',  pa.list_(pa.utf8())),
        ('price', pa.utf8()) ,
        ('location', pa.struct(
            [
                ("address1", pa.utf8()),
                ("address2", pa.utf8()),
                ("address3", pa.utf8()),
                ("city", pa.utf8()),
                ("zip_code", pa.utf8()),
                ("country", pa.utf8()),
                ("state", pa.utf8()),
                ("display_address", pa.list_(pa.utf8()))
            ]
        )),
        ('phone', pa.utf8())  ,
        ('display_phone', pa.utf8()) ,
        ('distance', pa.float32())
    ]
)

simple_business_type = pa.struct(
    [
        ('id', pa.utf8()),
        ('alias', pa.utf8()), 
        ('is_closed',  pa.bool_()),
        ('url' , pa.utf8()),
        ('review_count', pa.int32()) ,
        ('phone', pa.utf8())  ,
    ]
)

## Array instantiation

In [14]:
businesses = pa.array(
    business_list[:2],
    type=business_type
)

simple_businesses = pa.array(
    business_list[:3],
    type=simple_business_type
)

In [15]:
print(businesses.type)

struct<id: string, alias: string, name: string, image_url: string, is_closed: bool, url: string, review_count: int32, categories: list<item: struct<alias: string, title: string>>, rating: int8, coordinates: struct<latitude: float, longitude: float>, transactions: list<item: string>, price: string, location: struct<address1: string, address2: string, address3: string, city: string, zip_code: string, country: string, state: string, display_address: list<item: string>>, phone: string, display_phone: string, distance: float>


In [16]:
print(businesses)

-- is_valid: all not null
-- child 0 type: string
  [
    "xAvwjM0F5LN9g3yFsk9e0w",
    "qcnoyytlFIuqlcjDXkXJiw"
  ]
-- child 1 type: string
  [
    "starbucks-brooklyn-78",
    "starbucks-brooklyn-21"
  ]
-- child 2 type: string
  [
    "Starbucks",
    "Starbucks"
  ]
-- child 3 type: string
  [
    "https://s3-media2.fl.yelpcdn.com/bphoto/UnOuT7MMfPf5FEXEmzrL4w/o.jpg",
    "https://s3-media3.fl.yelpcdn.com/bphoto/lPZbt9TGzPnBhf-IA_oZQQ/o.jpg"
  ]
-- child 4 type: bool
  [
    true,
    false
  ]
-- child 5 type: string
  [
    "https://www.yelp.com/biz/starbucks-brooklyn-78?adjust_creative=2FlPnf5brM1FB2UTY6vGnQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=2FlPnf5brM1FB2UTY6vGnQ",
    "https://www.yelp.com/biz/starbucks-brooklyn-21?adjust_creative=2FlPnf5brM1FB2UTY6vGnQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=2FlPnf5brM1FB2UTY6vGnQ"
  ]
-- child 6 type: int32
  [
    10,
    67
  ]
-- child 7 type: list<item: struct<alias: strin

In [17]:
simple_businesses.type

StructType(struct<id: string, alias: string, is_closed: bool, url: string, review_count: int32, phone: string>)

In [18]:
simple_businesses

<pyarrow.lib.StructArray object at 0x7fe0907ff3a0>
-- is_valid: all not null
-- child 0 type: string
  [
    "xAvwjM0F5LN9g3yFsk9e0w",
    "qcnoyytlFIuqlcjDXkXJiw",
    "60agfQbky4cX8BEApyltIA"
  ]
-- child 1 type: string
  [
    "starbucks-brooklyn-78",
    "starbucks-brooklyn-21",
    "starbucks-new-york-524"
  ]
-- child 2 type: bool
  [
    true,
    false,
    false
  ]
-- child 3 type: string
  [
    "https://www.yelp.com/biz/starbucks-brooklyn-78?adjust_creative=2FlPnf5brM1FB2UTY6vGnQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=2FlPnf5brM1FB2UTY6vGnQ",
    "https://www.yelp.com/biz/starbucks-brooklyn-21?adjust_creative=2FlPnf5brM1FB2UTY6vGnQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=2FlPnf5brM1FB2UTY6vGnQ",
    "https://www.yelp.com/biz/starbucks-new-york-524?adjust_creative=2FlPnf5brM1FB2UTY6vGnQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=2FlPnf5brM1FB2UTY6vGnQ"
  ]
-- child 4 type: int32
  [
    1

# Record Batches from Arrays

In [19]:
rb_sb = pa.RecordBatch.from_arrays(
    simple_businesses.flatten(),
    [
        'id',
        'alias',
        'is_closed',
        'url' ,
        'review_count',
        'phone',
    ]
)

In [20]:
rb_sb

pyarrow.RecordBatch
id: string
alias: string
is_closed: bool
url: string
review_count: int32
phone: string

In [21]:
rb_sb.num_rows, rb_sb.num_columns

(3, 6)

## Transforms

In [22]:
rb_sb_slice = rb_sb.slice(1,2)

In [23]:
rb_sb_slice.num_rows, rb_sb.column(2)[0], rb_sb_slice.column(2)[0]

(2, <pyarrow.BooleanScalar: True>, <pyarrow.BooleanScalar: False>)

In [24]:
print(rb_sb[1:2])

pyarrow.RecordBatch
id: string
alias: string
is_closed: bool
url: string
review_count: int32
phone: string


In [25]:
print(rb_sb.to_pydict())

{'id': ['xAvwjM0F5LN9g3yFsk9e0w', 'qcnoyytlFIuqlcjDXkXJiw', '60agfQbky4cX8BEApyltIA'], 'alias': ['starbucks-brooklyn-78', 'starbucks-brooklyn-21', 'starbucks-new-york-524'], 'is_closed': [True, False, False], 'url': ['https://www.yelp.com/biz/starbucks-brooklyn-78?adjust_creative=2FlPnf5brM1FB2UTY6vGnQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=2FlPnf5brM1FB2UTY6vGnQ', 'https://www.yelp.com/biz/starbucks-brooklyn-21?adjust_creative=2FlPnf5brM1FB2UTY6vGnQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=2FlPnf5brM1FB2UTY6vGnQ', 'https://www.yelp.com/biz/starbucks-new-york-524?adjust_creative=2FlPnf5brM1FB2UTY6vGnQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=2FlPnf5brM1FB2UTY6vGnQ'], 'review_count': [10, 67, 33], 'phone': ['+19176857717', '+17188550856', '+16465742921']}


# Ex

In [26]:
ex_business_type = pa.struct(
    [
        ('id', pa.utf8()),
        ('alias', pa.utf8()),
        ('transactions',  pa.list_(pa.utf8())),
    ]
)

In [27]:
array_ex = pa.array(
    business_list[:5],
    type=ex_business_type
)

In [28]:
rb_ex = pa.RecordBatch.from_arrays(
array_ex.flatten(),
[
    "id",
    "alias",
    "transactions"
]

) 

In [29]:
rb_ex.column(2)

<pyarrow.lib.ListArray object at 0x7fe0907ffd00>
[
  [
    "delivery"
  ],
  [
    "delivery"
  ],
  [
    "delivery"
  ],
  [],
  [
    "delivery"
  ]
]

In [30]:
# with array rather than RB
array_ex.to_pylist()

[{'id': 'xAvwjM0F5LN9g3yFsk9e0w',
  'alias': 'starbucks-brooklyn-78',
  'transactions': ['delivery']},
 {'id': 'qcnoyytlFIuqlcjDXkXJiw',
  'alias': 'starbucks-brooklyn-21',
  'transactions': ['delivery']},
 {'id': '60agfQbky4cX8BEApyltIA',
  'alias': 'starbucks-new-york-524',
  'transactions': ['delivery']},
 {'id': 'Y_kNzTJJUx9aFj1t_DVWkA',
  'alias': 'starbucks-surrey-6',
  'transactions': []},
 {'id': 'VGMdcAn34GeasB4arA8XUA',
  'alias': 'starbucks-new-york-217',
  'transactions': ['delivery']}]