In [1]:
import os

import pandas as pd
from aframe import AFrame
from aframe.connector import AsterixConnector
from aframe.connector import CypherConnector
from aframe.connector import MongoConnector

## PolyFrame + AsterixDB

In [2]:
# Bind an AFrame to the `businesses` dataset of the `Yelp` dataverse, going
# through an AsterixConnector pointed at 'localhost:19002'.
asterix_connector = AsterixConnector('localhost:19002')
af = AFrame(dataverse='Yelp', dataset='businesses', connector=asterix_connector)

In [3]:
# Preview the frame; the recorded output shows five records with all business attributes.
af.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,--1UhMGODdWsrMastO9DZw,The Spicy Amigos,821 4 Avenue SW,Calgary,AB,T2P 0K5,51.049673,-114.079977,4.0,24,1,"{'BusinessParking': '{'garage': False, 'street...","Restaurants, Mexican","{'Monday': '11:0-20:0', 'Tuesday': '11:0-20:0'..."
1,--6MefnULPED_I942VcFNA,John's Chinese BBQ Restaurant,"328 Highway 7 E, Chalmers Gate 11, Unit 10",Richmond Hill,ON,L4B 3P7,43.841694,-79.399755,3.0,44,1,"{'BusinessParking': '{'garage': False, 'street...","Chinese, Restaurants","{'Monday': '11:0-22:30', 'Tuesday': '11:0-22:3..."
2,--7zmmkVg-IMGaXbuVd0SQ,Primal Brewery,16432 Old Statesville Rd,Huntersville,NC,28078,35.437106,-80.843688,4.0,58,1,"{'BikeParking': 'True', 'RestaurantsPriceRange...","Breweries, Food","{'Monday': '16:0-22:0', 'Tuesday': '16:0-22:0'..."
3,--8LPVSo5i0Oo61X01sV9A,Valley Bone and Joint Specialists,"3941 E Baseline Rd, Ste 102",Gilbert,AZ,85234,33.378589,-111.748145,3.5,4,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","Orthopedists, Doctors, Sports Medicine, Weight...","{'Monday': '8:30-16:30', 'Tuesday': '8:30-16:3..."
4,--9QQLMTbFzLJ_oT-ON3Xw,Great Clips,"1835 E Guadalupe Rd, Ste 106",Tempe,AZ,85283,33.362813,-111.908995,3.5,12,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","Beauty & Spas, Hair Salons",


## Underlying query

In [4]:
# With query=True, head() gives back the generated query text (a SELECT ... LIMIT 5
# statement in the recorded output) instead of rows, so the translation can be inspected.
af.head(query=True)

'SELECT VALUE t FROM Yelp.businesses t LIMIT 5'

## Projection

In [5]:
# Project the frame down to five attributes and preview the result.
projected_columns = ['categories', 'state', 'stars', 'review_count', 'city']
projections = af[projected_columns]
projections.head()

Unnamed: 0,categories,state,stars,review_count,city
0,"Restaurants, Mexican",AB,4.0,24,Calgary
1,"Chinese, Restaurants",ON,3.0,44,Richmond Hill
2,"Breweries, Food",NC,4.0,58,Huntersville
3,"Orthopedists, Doctors, Sports Medicine, Weight...",AZ,3.5,4,Gilbert
4,"Beauty & Spas, Hair Salons",AZ,3.5,12,Tempe


In [6]:
# `.query` exposes the query built so far; in the recorded output the projection
# wraps the base SELECT as a subquery.
projections.query

'SELECT categories, state, stars, review_count, city FROM (SELECT VALUE t FROM Yelp.businesses t) t'

## Selection

In [7]:
# Build each predicate on its own, then combine them with `&` to filter the projection.
is_mexican_restaurant = projections['categories'] == 'Mexican, Restaurants'
rated_above_three = projections['stars'] > 3.0
selections = projections[is_mexican_restaurant & rated_above_three]

In [8]:
# Preview the rows that satisfy both predicates.
selections.head()

Unnamed: 0,categories,state,stars,review_count,city
0,"Mexican, Restaurants",SC,4.0,16,Fort Mill
1,"Mexican, Restaurants",AZ,4.0,29,Gilbert
2,"Mexican, Restaurants",NV,4.5,9,Las Vegas
3,"Mexican, Restaurants",QC,4.0,6,Montréal
4,"Mexican, Restaurants",NC,4.0,101,Charlotte


In [9]:
# In the recorded output the filter appears as a WHERE clause applied to the
# projection subquery.
selections.query

'SELECT VALUE t FROM (SELECT categories, state, stars, review_count, city FROM (SELECT VALUE t FROM Yelp.businesses t) t) t WHERE categories = "Mexican, Restaurants" AND stars > 3.0'

In [10]:
# Sum review_count per state over the filtered rows.
selections_by_state = selections.groupby('state')
groups = selections_by_state.agg({'review_count': 'sum'})
# Up to 15 rows are requested; the recorded output contains only 10 states.
groups.head(15)

Unnamed: 0,state,sum_review_count
0,AB,119
1,AZ,20841
2,IL,735
3,NC,3266
4,NV,8298
5,OH,1907
6,ON,2194
7,PA,985
8,QC,372
9,SC,228


# Retarget PolyFrame onto MongoDB and Neo4j

### Neo4j connector

In [11]:
# Neo4j connection settings are read from the environment so that real
# credentials never have to be committed with the notebook. Each fallback is the
# value this demo previously hard-coded, so an unconfigured local setup behaves
# exactly as before. Set NEO4J_URI / NEO4J_USERNAME / NEO4J_PASSWORD to override.
neo4j_connector = CypherConnector(
    uri=os.environ.get('NEO4J_URI', 'http://localhost:7474'),
    username=os.environ.get('NEO4J_USERNAME', 'neo4j'),
    password=os.environ.get('NEO4J_PASSWORD', 'password'),
)

### MongoDB connector

In [12]:
# Connector for the MongoDB backend, given a mongodb:// URI for localhost:27017.
mongo_connector = MongoConnector('mongodb://localhost:27017')

## Dataframe construction

In [13]:
# AFrame over the `business` dataset of `Yelp`, backed by the Neo4j connector.
# The dataset name shows up as the node label in the generated MATCH clause.
neo4j_df = AFrame(
    connector=neo4j_connector,
    dataverse='Yelp',
    dataset='business',
)

In [14]:
# AFrame over the `business` dataset of `Yelp`, backed by the MongoDB connector.
mongo_df = AFrame(
    connector=mongo_connector,
    dataverse='Yelp',
    dataset='business',
)

In [15]:
# Label the output, then preview the first two records served by Neo4j.
print('Neo4j Dataframe:\n')
neo4j_df.head(2)

Neo4j Dataframe:



Unnamed: 0,address,city,is_open,latitude,name,review_count,state,categories,stars,postal_code,business_id,longitude
0,5931 Providence Rd,Charlotte,0,35.131469,The Reserve at Providence by Pegasus Residential,6,NC,"Apartments, Real Estate, Home Services",1.0,28226,0SbElPykzndWQufOPSPDsg,-80.780094
1,"1835 E Guadalupe Rd, Ste 106",Tempe,1,33.362813,Great Clips,12,AZ,"Beauty & Spas, Hair Salons",3.5,85283,--9QQLMTbFzLJ_oT-ON3Xw,-111.908995


In [16]:
# Label the output, then preview the first two records served by MongoDB.
print('MongoDB Dataframe:\n')
mongo_df.head(2)

MongoDB Dataframe:



Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories
0,--9QQLMTbFzLJ_oT-ON3Xw,Great Clips,"1835 E Guadalupe Rd, Ste 106",Tempe,AZ,85283,33.362813,-111.908995,3.5,12,1,"Beauty & Spas, Hair Salons"
1,--ttCFj_csKJhxnaMRNuiw,Impressions Dental,"5970 S Cooper Rd, Ste 1",Chandler,AZ,85249,33.219474,-111.807451,2.5,39,1,"General Dentistry, Health & Medical, Dentists,..."


In [17]:
# Show the base query each backend starts from, labelled by backend.
print(f'Neo4j\n{neo4j_df.query}\n')
print(f'MongoDB\n{mongo_df.query}')

Neo4j
MATCH(t: business)

MongoDB
{ "$match": {} }


### Projection

In [18]:
# Narrow the Neo4j frame to five attributes and preview two rows.
# NOTE(review): this rebinds `neo4j_df`, so the unprojected frame is no longer
# reachable under that name afterwards.
neo4j_columns = ['categories', 'state', 'stars', 'review_count', 'city']
neo4j_df = neo4j_df[neo4j_columns]
neo4j_df.head(2)

Unnamed: 0,city,review_count,stars,state,categories
0,Charlotte,6,1.0,NC,"Apartments, Real Estate, Home Services"
1,Tempe,12,3.5,AZ,"Beauty & Spas, Hair Salons"


In [19]:
# Narrow the MongoDB frame to five attributes and preview two rows.
# NOTE(review): this rebinds `mongo_df`, so the unprojected frame is no longer
# reachable under that name afterwards.
mongo_columns = ['categories', 'state', 'stars', 'review_count', 'city']
mongo_df = mongo_df[mongo_columns]
mongo_df.head(2)

Unnamed: 0,city,state,stars,review_count,categories
0,Tempe,AZ,3.5,12,"Beauty & Spas, Hair Salons"
1,Chandler,AZ,2.5,39,"General Dentistry, Health & Medical, Dentists,..."


In [20]:
# Print the Neo4j query, two blank lines, then the MongoDB pipeline.
print(neo4j_df.query)
print()
print()
print(mongo_df.query)

MATCH(t: business)
WITH t{`categories`:t.categories, `state`:t.state, `stars`:t.stars, `review_count`:t.review_count, `city`:t.city}


{ "$match": {} },
{ "$project": { "categories": 1, "state": 1, "stars": 1, "review_count": 1, "city": 1 } }


### Selection

In [21]:
# Keep Neo4j rows whose categories equal 'Mexican, Restaurants' and whose stars
# exceed 3.0, then preview two of them.
# NOTE(review): this rebinds `neo4j_df` to the filtered frame.
neo4j_is_mexican = neo4j_df['categories'] == 'Mexican, Restaurants'
neo4j_above_three = neo4j_df['stars'] > 3.0
neo4j_df = neo4j_df[neo4j_is_mexican & neo4j_above_three]
neo4j_df.head(2)

Unnamed: 0,city,review_count,stars,state,categories
0,Phoenix,44,3.5,AZ,"Mexican, Restaurants"
1,Sun City,56,4.0,AZ,"Mexican, Restaurants"


In [22]:
# Keep MongoDB rows whose categories equal 'Mexican, Restaurants' and whose stars
# exceed 3.0, then preview two of them.
# NOTE(review): this rebinds `mongo_df` to the filtered frame.
mongo_is_mexican = mongo_df['categories'] == 'Mexican, Restaurants'
mongo_above_three = mongo_df['stars'] > 3.0
mongo_df = mongo_df[mongo_is_mexican & mongo_above_three]
mongo_df.head(2)

Unnamed: 0,city,state,stars,review_count,categories
0,Phoenix,AZ,3.5,44,"Mexican, Restaurants"
1,Sun City,AZ,4.0,56,"Mexican, Restaurants"


In [23]:
# Print the Neo4j query, two blank lines, then the MongoDB pipeline; both now
# include the filter stage.
print(neo4j_df.query)
print()
print()
print(mongo_df.query)

MATCH(t: business)
WITH t{`categories`:t.categories, `state`:t.state, `stars`:t.stars, `review_count`:t.review_count, `city`:t.city}
WITH t WHERE t.categories = "Mexican, Restaurants" AND t.stars > 3.0


{ "$match": {} },
{ "$project": { "categories": 1, "state": 1, "stars": 1, "review_count": 1, "city": 1 } },
{ "$match": { "$expr": { "$and": [ { "$eq": ["$categories", "Mexican, Restaurants"] }, { "$gt": ["$stars", 3.0] } ] } } }


### Grouping

In [24]:
# Sum review_count per state on the Neo4j frame; the result column is named
# sum_review_count in the recorded output.
neo_by_state = neo4j_df.groupby('state')
neo_grps = neo_by_state.agg({'review_count': 'sum'})
neo_grps.head()

Unnamed: 0,sum_review_count,state
0,20841,AZ
1,8298,NV
2,2194,ON
3,1907,OH
4,735,IL


In [25]:
# Sum review_count per state on the MongoDB frame; the result column is named
# sum_review_count in the recorded output.
mongo_by_state = mongo_df.groupby('state')
mongo_grps = mongo_by_state.agg({'review_count': 'sum'})
mongo_grps.head()

Unnamed: 0,sum_review_count,state
0,1907,OH
1,1130,WI
2,372,QC
3,8298,NV
4,3266,NC


### Sorting

In [26]:
# Order the Neo4j groups from the largest summed review count to the smallest.
neo_sorted = neo_grps.sort_values(
    ascending=False,
    by='sum_review_count',
)
neo_sorted.head()

Unnamed: 0,sum_review_count,state
0,20841,AZ
1,8298,NV
2,3266,NC
3,2194,ON
4,1907,OH


In [27]:
# Order the MongoDB groups from the largest summed review count to the smallest.
mongo_sorted = mongo_grps.sort_values(
    ascending=False,
    by='sum_review_count',
)
mongo_sorted.head()

Unnamed: 0,sum_review_count,state
0,20841,AZ
1,8298,NV
2,3266,NC
3,2194,ON
4,1907,OH


In [28]:
# head(query=True) yields the full generated query, including the final LIMIT
# stage; print one per backend under its label.
print(f'Neo4j\n{neo_sorted.head(query=True)}\n')
print(f'MongoDB\n{mongo_sorted.head(query=True)}')

Neo4j
MATCH(t: business)
WITH t{`categories`:t.categories, `state`:t.state, `stars`:t.stars, `review_count`:t.review_count, `city`:t.city}
WITH t WHERE t.categories = "Mexican, Restaurants" AND t.stars > 3.0
WITH {`state`: t.state, `sum_review_count`: sum(t.review_count)} AS t
WITH t ORDER BY t.sum_review_count DESC
RETURN t
LIMIT 5

MongoDB
{ "$match": {} },
{ "$project": { "categories": 1, "state": 1, "stars": 1, "review_count": 1, "city": 1 } },
{ "$match": { "$expr": { "$and": [ { "$eq": ["$categories", "Mexican, Restaurants"] }, { "$gt": ["$stars", 3.0] } ] } } },
{ "$group": { "_id": { "state": "$state" }, "sum_review_count": { "$sum": "$review_count" } } },
{ "$addFields": { "state": "$_id.state" } },
{ "$sort": { "sum_review_count": -1 } },
{ "$project": { "_id": 0 } },
{ "$limit" : 5 }


## Composite Operations

In [29]:
# Rebuild both frames from the full `business` dataset; earlier cells rebound
# these names to projected and filtered frames.
neo4j_df = AFrame(
    connector=neo4j_connector,
    dataverse='Yelp',
    dataset='business',
)
mongo_df = AFrame(
    connector=mongo_connector,
    dataverse='Yelp',
    dataset='business',
)

In [30]:
# Summary statistics on Neo4j for four numeric attributes; the recorded output
# has avg, std, min, max and count rows.
neo4j_numeric_columns = ['review_count', 'stars', 'latitude', 'longitude']
neo4j_df[neo4j_numeric_columns].describe()

Unnamed: 0,review_count,stars,latitude,longitude
avg,33.538962,3.585627,38.541803,-97.594785
std,110.134939,1.018456,4.941951,16.697681
min,3.0,1.0,33.204642,-115.493471
max,8348.0,5.0,51.299943,-72.911982
count,192609.0,192609.0,192609.0,192609.0


In [31]:
# Summary statistics on MongoDB for the same four numeric attributes.
# NOTE(review): the recorded std row differs slightly from the Neo4j one (e.g.
# 110.135224 vs 110.134939 for review_count). The generated Cypher uses stDevP
# (population); presumably the MongoDB pipeline uses a sample standard
# deviation - confirm.
mongo_numeric_columns = ['review_count', 'stars', 'latitude', 'longitude']
mongo_df[mongo_numeric_columns].describe()

Unnamed: 0,review_count,stars,latitude,longitude
avg,33.538962,3.585627,38.541803,-97.594785
std,110.135224,1.018458,4.941964,16.697725
min,3.0,1.0,33.204642,-115.493471
max,8348.0,5.0,51.299943,-72.911982
count,192609.0,192609.0,192609.0,192609.0


In [32]:
# describe(query=True) returns the generated aggregation query instead of running
# it; print the Neo4j version, two blank lines, then the MongoDB version.
summary_columns = ['review_count', 'stars', 'latitude', 'longitude']
print(neo4j_df[summary_columns].describe(query=True))
print()
print()
print(mongo_df[summary_columns].describe(query=True))

MATCH(t: business)
WITH t{`review_count`:t.review_count, `stars`:t.stars, `latitude`:t.latitude, `longitude`:t.longitude}
WITH {`avg_review_count`: avg(t.review_count), `std_review_count`: stDevP(t.review_count), `min_review_count`: min(t.review_count), `max_review_count`: max(t.review_count), `count_review_count`: count(t.review_count), `avg_stars`: avg(t.stars), `std_stars`: stDevP(t.stars), `min_stars`: min(t.stars), `max_stars`: max(t.stars), `count_stars`: count(t.stars), `avg_latitude`: avg(t.latitude), `std_latitude`: stDevP(t.latitude), `min_latitude`: min(t.latitude), `max_latitude`: max(t.latitude), `count_latitude`: count(t.latitude), `avg_longitude`: avg(t.longitude), `std_longitude`: stDevP(t.longitude), `min_longitude`: min(t.longitude), `max_longitude`: max(t.longitude), `count_longitude`: count(t.longitude)} AS t
RETURN t


{ "$match": {} },
{ "$project": { "review_count": 1, "stars": 1, "latitude": 1, "longitude": 1 } },
{ "$group": { "_id": {}, "avg_review_count"