In [1]:
import aframe as af
import pandas as pd
import findspark
findspark.init()
import pyspark
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark import SparkContext
# NOTE(review): wildcard import. The PySpark cells below rely on it for
# `length` and `explode_outer`; consider importing those names explicitly.
from pyspark.sql.functions import *
import numpy as np

In [2]:
# GleambookMessages: AFrame handle plus a pandas copy of the same dataset.
message_af = af.AFrame(
    dataverse='TinySocial',
    dataset='GleambookMessages',
)
message_df = message_af.toPandas()

# GleambookUsers: AFrame handle plus a pandas copy of the same dataset.
user_af = af.AFrame(
    dataverse='TinySocial',
    dataset='GleambookUsers',
)
user_df = user_af.toPandas()

In [3]:
# Reuse the running SparkContext when this cell is executed again:
# constructing SparkContext(...) a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
# The configuration is only applied when a new context has to be created.
sc = SparkContext.getOrCreate(pyspark.SparkConf().setAppName("app"))
# SQLContext wraps the context and is used below to build Spark DataFrames.
sqlContext = SQLContext(sc)

In [4]:
# Convert the pandas copies into Spark DataFrames, reusing the pandas column
# names as the Spark schema.
message_spark = sqlContext.createDataFrame(
    data=message_df,
    schema=list(message_df.columns),
)
# Missing values in the user table are replaced by the literal string 'None'
# before conversion (presumably so Spark can infer the column types -- TODO confirm).
user_spark = sqlContext.createDataFrame(
    data=user_df.fillna('None'),
    schema=list(user_df.columns),
)

## Pandas DataFrame Functions

In [5]:
# Preview the first rows of the pandas copy of the GleambookUsers dataset.
user_df.head()

Unnamed: 0,alias,employment,friendIds,gender,id,name,nickname,userSince
0,Margarita,"[{'organizationName': 'Codetechno', 'startDate...","[2, 3, 6, 10]",F,1,MargaritaStoddard,Mags,2012-08-20T10:10:00.000Z
1,Isbel,"[{'organizationName': 'Hexviafind', 'startDate...","[1, 4]",,2,IsbelDull,Izzy,2011-01-22T10:10:00.000Z
2,Emory,"[{'organizationName': 'geomedia', 'startDate':...","[1, 5, 8, 9]",,3,EmoryUnk,,2012-07-10T10:10:00.000Z
3,Nicholas,"[{'organizationName': 'Zamcorporation', 'start...",[2],,4,NicholasStroh,,2010-12-27T10:10:00.000Z
4,Von,"[{'organizationName': 'Kongreen', 'startDate':...","[3, 6, 10]",,5,VonKemble,,2010-01-05T10:10:00.000Z


### Elementwise Functions

In [6]:
def f(x):
    """Return the number of characters in ``str(x)``.

    Written as a ``def`` instead of a lambda bound to a name so that the
    function carries a real name (``f``) in tracebacks and reprs.

    Parameters
    ----------
    x : object
        Any value; it is converted with ``str`` before measuring.

    Returns
    -------
    int
        ``len(str(x))``.
    """
    return len(str(x))
# Series.map calls f once per value, so the result is the character count of
# each user's name.
user_df['name'].map(f)

0    17
1     9
2     8
3    13
4     9
5    11
6    14
7    12
8    14
9     9
Name: name, dtype: int64

In [7]:
# Same elementwise mapping, this time over the alias column.
user_df['alias'].map(f)

0    9
1    5
2    5
3    8
4    3
5    6
6    7
7    4
8    7
9    4
Name: alias, dtype: int64

### Tablewise Functions

In [8]:
# DataFrame.apply hands f each column as a whole Series (default axis=0), so
# every number below is the length of that column's printed representation,
# not a per-element length.
user_df.apply(f)

alias         176
employment    581
friendIds     220
gender        117
id            103
name          255
nickname      139
userSince     330
dtype: int64

## AFrame Functions

### Elementwise Functions

In [9]:
# Map the function named 'length' over the name column of the AFrame; the
# function is referenced by its name as a string rather than as a Python callable.
name_len = user_af['name'].map('length')

In [10]:
# Inspect the query text that AFrame built for the mapped column.
name_len.query

'select value length(t.name) from TinySocial.GleambookUsers t;'

In [11]:
# Retrieve the results for name_len; they are displayed below as a
# single-column table with one length per user.
name_len.collect()

Unnamed: 0,0
0,17
1,9
2,8
3,13
4,9
5,11
6,14
7,12
8,14
9,9


### Functions with Arguments

In [12]:
# Extra positional arguments to map() are forwarded to the named function:
# here 'Suzan' becomes the second argument of contains().
name_contain = user_af['name'].map('contains', 'Suzan')

In [13]:
# Inspect the generated query text to see where the extra argument was placed.
name_contain.query

'select value contains(t.name, "Suzan") from TinySocial.GleambookUsers t;'

In [14]:
# Retrieve the results for name_contain: one boolean per user.
name_contain.collect()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
5,False
6,True
7,False
8,False
9,False


### Tablewise Functions

In [15]:
# apply() passes the whole record, rather than a single field, to the named
# function (compare the generated query shown in the next cell).
fields = message_af.apply('get_object_fields')

In [16]:
# Inspect the query text that AFrame built for the record-level function call.
fields.query

'select value get_object_fields(t) from TinySocial.GleambookMessages t;'

In [17]:
# Show only the first result row: one entry (field name and type) per field of
# a message record.
fields.head(1)

Unnamed: 0,0,1,2,3,4
0,"{'field-name': 'messageId', 'field-type': 'big...","{'field-name': 'authorId', 'field-type': 'bigi...","{'field-name': 'inResponseTo', 'field-type': '...","{'field-name': 'senderLocation', 'field-type':...","{'field-name': 'message', 'field-type': 'strin..."


## PySpark Functions

In [18]:
# Spark counterpart of the name-length example. `length` is the column function
# brought in by the pyspark.sql.functions wildcard import, not Python's len().
user_spark.select(length(user_spark['name'])).show()

+------------+
|length(name)|
+------------+
|          17|
|           9|
|           8|
|          13|
|           9|
|          11|
|          14|
|          12|
|          14|
|           9|
+------------+



In [20]:
# Flatten the friendIds arrays into one output row per friend id; show() prints
# only the first 20 rows.
user_spark.select(explode_outer('friendIds')).show()
# Last expression of the cell: displays the class of user_spark.
# NOTE(review): looks like a leftover inspection step -- confirm it is still wanted.
type(user_spark)

+---+
|col|
+---+
|  2|
|  3|
|  6|
| 10|
|  1|
|  4|
|  1|
|  5|
|  8|
|  9|
|  2|
|  3|
|  6|
| 10|
|  1|
|  3|
|  7|
|  6|
|  3|
|  3|
+---+
only showing top 20 rows



pyspark.sql.dataframe.DataFrame