In [1]:
import aframe as af
import pandas as pd
import findspark
# Runs before the pyspark imports below.
# NOTE(review): presumably locates the local Spark install and makes pyspark importable - confirm.
findspark.init()
import pyspark
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark import SparkContext
# NOTE(review): wildcard import - pulls every public name of pyspark.sql.functions
# into the notebook namespace; an explicit alias (e.g. `import pyspark.sql.functions as F`)
# would make the origin of each name visible.
from pyspark.sql.functions import *

In [40]:
# AFrame handles for the three TinySocial datasets, built in the order
# messages -> users -> users2.
message_af, user_af, user2_af = (
    af.AFrame(dataverse='TinySocial', dataset=dataset_name)
    for dataset_name in ('GleambookMessages', 'GleambookUsers', 'GleambookUsers2')
)

# Local pandas copies of the messages and users datasets, used by the
# pandas and PySpark cells further down.
message_df, user_df = message_af.toPandas(), user_af.toPandas()

In [44]:
# Handle for the GleambookUsers3 dataset; preview its first two records.
user3_af = af.AFrame(dataverse='TinySocial', dataset='GleambookUsers3')
user3_af.head(2)

Unnamed: 0,bar,foo,id
0,,3.0,1
1,7.0,,2


In [3]:
# Reuse the active SparkContext when this cell is re-run: constructing a second
# SparkContext in the same kernel fails, whereas getOrCreate returns the one
# that is already running (or starts a new one named "af_app").
sc = SparkContext.getOrCreate(pyspark.SparkConf().setAppName("af_app"))
# SQLContext is the entry point used below to turn pandas frames into Spark DataFrames.
sqlContext = SQLContext(sc)

In [4]:
# Mirror the pandas frames as Spark DataFrames, passing the pandas column
# names as the schema.
message_spark = sqlContext.createDataFrame(
    message_df,
    schema=list(message_df.columns),
)
# Missing values in the user frame are replaced with the literal string 'None'
# before conversion, so they show up in Spark as the text "None".
# NOTE(review): presumably done so Spark can infer a type for sparse columns - confirm.
user_spark = sqlContext.createDataFrame(
    user_df.fillna('None'),
    schema=list(user_df.columns),
)

## Pandas DataFrame JOIN

In [5]:
# Join the messages frame with itself; overlapping columns coming from the
# right-hand side get the '_r' suffix.
message_df.join(other=message_df, rsuffix='_r').head(n=2)

Unnamed: 0,authorId,inResponseTo,message,messageId,senderLocation,authorId_r,inResponseTo_r,message_r,messageId_r,senderLocation_r
0,3,2,love product-b its shortcut-menu is awesome:),1,"[47.16, 77.75]",3,2,love product-b its shortcut-menu is awesome:),1,"[47.16, 77.75]"
1,1,4,dislike x-phone its touch-screen is horrible,2,"[41.66, 80.87]",1,4,dislike x-phone its touch-screen is horrible,2,"[41.66, 80.87]"


## Pandas DataFrame Merge

In [6]:
# Inner merge: users matched to the messages they authored (id == authorId).
pd.merge(
    user_df,
    message_df,
    left_on='id',
    right_on='authorId',
    how='inner',
).head(n=2)

Unnamed: 0,alias,employment,friendIds,gender,id,name,nickname,userSince,authorId,inResponseTo,message,messageId,senderLocation
0,Margarita,"[{'organizationName': 'Codetechno', 'startDate...","[2, 3, 6, 10]",F,1,MargaritaStoddard,Mags,2012-08-20T10:10:00.000Z,1,4,dislike x-phone its touch-screen is horrible,2,"[41.66, 80.87]"
1,Margarita,"[{'organizationName': 'Codetechno', 'startDate...","[2, 3, 6, 10]",F,1,MargaritaStoddard,Mags,2012-08-20T10:10:00.000Z,1,2,can't stand acast the network is horrible:(,4,"[37.73, 97.04]"


In [28]:
# Self-merge of messages on authorId; clashing columns are disambiguated
# with the '_l' / '_r' suffixes.
pd.merge(
    message_df,
    message_df,
    left_on='authorId',
    right_on='authorId',
    how='inner',
    suffixes=('_l', '_r'),
).head(n=2)

Unnamed: 0,authorId,inResponseTo_l,message_l,messageId_l,senderLocation_l,inResponseTo_r,message_r,messageId_r,senderLocation_r
0,3,2,love product-b its shortcut-menu is awesome:),1,"[47.16, 77.75]",2,love product-b its shortcut-menu is awesome:),1,"[47.16, 77.75]"
1,3,2,love product-b its shortcut-menu is awesome:),1,"[47.16, 77.75]",12,love ccast its wireless is good,9,"[34.45, 96.48]"


## AFrame JOIN

In [8]:
# AFrame inner join of users with messages on users.id == messages.authorId.
joined_af = user_af.join(message_af, how='inner', left_on='id', right_on='authorId')

In [9]:
# Show the query string that AFrame built for the join.
joined_af.query

'SELECT VALUE object_merge(l,r) FROM TinySocial.GleambookUsers l JOIN TinySocial.GleambookMessages r on l.id=r.authorId;'

In [10]:
# Self-join of the messages dataset on authorId.
# Note: this rebinds `joined_af`, replacing the users/messages join result.
joined_af = message_af.join(message_af, how='inner', left_on='authorId', right_on='authorId')

In [11]:
# Show the query string that AFrame built for the self-join.
joined_af.query

'SELECT l,r from TinySocial.GleambookMessages l JOIN TinySocial.GleambookMessages r on l.authorId=r.authorId;'

In [12]:
# Preview the first two joined records.
joined_af.head(2)

Unnamed: 0,l,r
0,"{'messageId': 1, 'authorId': 3, 'inResponseTo'...","{'messageId': 1, 'authorId': 3, 'inResponseTo'..."
1,"{'messageId': 1, 'authorId': 3, 'inResponseTo'...","{'messageId': 9, 'authorId': 3, 'inResponseTo'..."


## Pandas vs. AFrame vs. PySpark

In [13]:
# The same inner join (users.id == messages.authorId) expressed in each of
# the three libraries, so the results can be compared side by side.
joined_pandas = user_df.merge(
    message_df,
    how='inner',
    left_on='id',
    right_on='authorId',
)
joined_af = user_af.join(
    message_af,
    how='inner',
    left_on='id',
    right_on='authorId',
)
joined_spark = user_spark.join(
    message_spark,
    on=user_spark['id'] == message_spark['authorId'],
    how='inner',
)

### Pandas

In [14]:
# First two rows of the pandas join result.
joined_pandas.head(n=2)

Unnamed: 0,alias,employment,friendIds,gender,id,name,nickname,userSince,authorId,inResponseTo,message,messageId,senderLocation
0,Margarita,"[{'organizationName': 'Codetechno', 'startDate...","[2, 3, 6, 10]",F,1,MargaritaStoddard,Mags,2012-08-20T10:10:00.000Z,1,4,dislike x-phone its touch-screen is horrible,2,"[41.66, 80.87]"
1,Margarita,"[{'organizationName': 'Codetechno', 'startDate...","[2, 3, 6, 10]",F,1,MargaritaStoddard,Mags,2012-08-20T10:10:00.000Z,1,2,can't stand acast the network is horrible:(,4,"[37.73, 97.04]"


### AFrame

In [15]:
# First two records of the AFrame join result.
joined_af.head(2)

Unnamed: 0,alias,authorId,employment,friendIds,gender,id,inResponseTo,message,messageId,name,nickname,senderLocation,userSince
0,Margarita,1,"[{'organizationName': 'Codetechno', 'startDate...","[2, 3, 6, 10]",F,1,4,dislike x-phone its touch-screen is horrible,2,MargaritaStoddard,Mags,"[41.66, 80.87]",2012-08-20T10:10:00.000Z
1,Margarita,1,"[{'organizationName': 'Codetechno', 'startDate...","[2, 3, 6, 10]",F,1,2,can't stand acast the network is horrible:(,4,MargaritaStoddard,Mags,"[37.73, 97.04]",2012-08-20T10:10:00.000Z


### PySpark

In [16]:
# Print the first two rows of the Spark join result.
joined_spark.show(n=2)

+-------+--------------------+---------+------+---+--------------+--------+--------------------+--------+------------+--------------------+---------+--------------+
|  alias|          employment|friendIds|gender| id|          name|nickname|           userSince|authorId|inResponseTo|             message|messageId|senderLocation|
+-------+--------------------+---------+------+---+--------------+--------+--------------------+--------+------------+--------------------+---------+--------------+
|Suzanna|[[organizationNam...|      [6]|  None|  7|SuzannaTillson|    None|2012-08-07T10:10:...|       7|          11| like x-phone the...|       15|[44.47, 67.11]|
| Willis|[[organizationNam...|[1, 3, 7]|  None|  6|   WillisWynne|    None|2005-01-17T10:10:...|       6|           2| love product-b t...|        5| [34.7, 90.76]|
+-------+--------------------+---------+------+---+--------------+--------+--------------------+--------+------------+--------------------+---------+--------------+
only showing top 2 rows

In [33]:
# Self-join of messages on authorId.
# Each side gets its own alias so the join condition names one column from the
# left input and one from the right; the previous condition
# `message_spark.authorId == message_spark.authorId` compared a column with itself.
# `col` comes from the `pyspark.sql.functions` wildcard import in the first cell.
joined_spark = message_spark.alias('l').join(
    message_spark.alias('r'),
    col('l.authorId') == col('r.authorId'),
    'inner',
)

In [36]:
# Print the first two rows of the Spark self-join result.
joined_spark.show(n=2)

+--------+------------+--------------------+---------+--------------+--------+------------+--------------------+---------+--------------+
|authorId|inResponseTo|             message|messageId|senderLocation|authorId|inResponseTo|             message|messageId|senderLocation|
+--------+------------+--------------------+---------+--------------+--------+------------+--------------------+---------+--------------+
|       7|          11| like x-phone the...|       15|[44.47, 67.11]|       7|          11| like x-phone the...|       15|[44.47, 67.11]|
|       6|           2| love product-b t...|        5| [34.7, 90.76]|       6|           2| love product-b t...|        5| [34.7, 90.76]|
+--------+------------+--------------------+---------+--------------+--------+------------+--------------------+---------+--------------+
only showing top 2 rows



# Group By

## Pandas Group By

In [17]:
# Grouping is lazy: this displays only the GroupBy object, not any rows.
message_df.groupby(by='authorId')

<pandas.core.groupby.DataFrameGroupBy object at 0x109e329b0>

In [18]:
# Mapping of each authorId to the row labels that belong to its group.
message_df.groupby(by='authorId').groups

{1: Int64Index([1, 3, 7, 9, 10], dtype='int64'),
 2: Int64Index([2, 5], dtype='int64'),
 3: Int64Index([0, 8], dtype='int64'),
 5: Int64Index([6], dtype='int64'),
 6: Int64Index([4], dtype='int64'),
 7: Int64Index([14], dtype='int64'),
 9: Int64Index([13], dtype='int64'),
 10: Int64Index([11, 12], dtype='int64')}

In [19]:
# Keep the GroupBy object for reuse, then pull out the rows for authorId 1.
grouped_pandas = message_df.groupby(by='authorId')
grouped_pandas.get_group(name=1)

Unnamed: 0,authorId,inResponseTo,message,messageId,senderLocation
1,1,4,dislike x-phone its touch-screen is horrible,2,"[41.66, 80.87]"
3,1,2,can't stand acast the network is horrible:(,4,"[37.73, 97.04]"
7,1,11,like ccast the 3G is awesome:),8,"[40.33, 80.87]"
9,1,12,can't stand product-w the touch-screen is ter...,10,"[42.5, 70.01]"
10,1,1,can't stand acast its plan is terrible,11,"[38.97, 77.49]"


In [20]:
# Per-author count of values in each remaining column.
grouped_pandas.count()

Unnamed: 0_level_0,inResponseTo,message,messageId,senderLocation
authorId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,5,5,5,5
2,2,2,2,2
3,2,2,2,2
5,1,1,1,1
6,1,1,1,1
7,1,1,1,1
9,1,1,1,1
10,2,2,2,2


## PySpark Group By

In [21]:
# Group the Spark messages frame by author; displaying the result shows only
# the GroupedData object, since nothing has been aggregated yet.
grouped_spark = message_spark.groupBy('authorId')
grouped_spark

<pyspark.sql.group.GroupedData at 0x109e70048>

In [22]:
# Count the messages per authorId and print the resulting table.
grouped_spark.count().show()

+--------+-----+
|authorId|count|
+--------+-----+
|       7|    1|
|       6|    1|
|       9|    1|
|       5|    1|
|       1|    5|
|      10|    2|
|       3|    2|
|       2|    2|
+--------+-----+



## AFrame Group By

In [23]:
# Group the AFrame messages dataset by author; displaying the result shows
# only the group-by object itself.
grouped_af = message_af.groupby('authorId')
grouped_af

<aframe.groupby.AFrameGroupBy at 0x109e70828>

In [24]:
# Show the query string that AFrame built for the group-by.
grouped_af.query

'SELECT * FROM TinySocial.GleambookMessages t GROUP BY t.authorId AS grp_id GROUP AS grps(t AS grp);'

In [25]:
# Fetch the records belonging to the group with authorId 1.
grouped_af.get_group(1)

Unnamed: 0,authorId,inResponseTo,message,messageId,senderLocation
0,1,4,dislike x-phone its touch-screen is horrible,2,"[41.66, 80.87]"
1,1,2,can't stand acast the network is horrible:(,4,"[37.73, 97.04]"
2,1,11,like ccast the 3G is awesome:),8,"[40.33, 80.87]"
3,1,12,can't stand product-w the touch-screen is ter...,10,"[42.5, 70.01]"
4,1,1,can't stand acast its plan is terrible,11,"[38.97, 77.49]"


In [26]:
# Number of messages per authorId.
grouped_af.count()

Unnamed: 0,authorId,count
0,1,5
1,2,2
2,3,2
3,5,1
4,6,1
5,7,1
6,9,1
7,10,2
