In [1]:
import pandas as pd
import findspark
findspark.init()
import pyspark
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.sql.functions import *

In [2]:
# Reuse the running SparkContext when this cell is executed again in the same
# kernel; calling the SparkContext constructor a second time raises
# "ValueError: Cannot run multiple SparkContexts at once".
# Note: the app name only takes effect when a new context is actually created.
sc = SparkContext.getOrCreate(pyspark.SparkConf().setAppName("app"))

In [3]:
# SQL entry point bound to the SparkContext created above; used below to
# build DataFrames.
sqlContext = SQLContext(sparkContext=sc)

In [4]:
# Sample records shared by the Spark and pandas examples: (name, age).
data = [
    ("Amy", 25),
    ("Joe", 30),
    ("John", 22),
    ("Jane", 20),
    ("Tim", 26),
]

In [5]:
# Distribute the local list of (name, age) tuples as a Spark RDD.
rdd = sc.parallelize(data)

In [6]:
# Build a Spark DataFrame from the RDD of tuples, naming the two tuple fields.
pySparkDF = sqlContext.createDataFrame(rdd, schema=["name", "age"])

In [25]:
# pandas counterpart of the Spark DataFrame, built from the same local records.
pandasDF = pd.DataFrame(data=data, columns=["name", "age"])

In [30]:
# Self-join demo: give the same DataFrame two aliases so that each side of the
# join can be addressed unambiguously.
spark_df1 = pySparkDF.alias("spark_df1")
spark_df2 = pySparkDF.alias("spark_df2")

# Qualify the join keys through the alias names. `spark_df1.name` and
# `spark_df2.name` both resolve to the same underlying column of pySparkDF,
# so comparing them yields a trivially true predicate that only works because
# Spark special-cases it (and logs a warning). col("<alias>.<column>") states
# the intended left/right columns explicitly.
joined_df = spark_df1.join(
    spark_df2,
    col("spark_df1.name") == col("spark_df2.name"),
    "inner",
)

In [12]:
# Print the joined rows; each side of the self-join contributes its own
# `name` and `age` column, so both names appear twice in the header.
joined_df.show()

+----+---+----+---+
|name|age|name|age|
+----+---+----+---+
| Joe| 30| Joe| 30|
| Amy| 25| Amy| 25|
|John| 22|John| 22|
| Tim| 26| Tim| 26|
|Jane| 20|Jane| 20|
+----+---+----+---+



In [None]:
# Recomputes the same inner self-join as the earlier cell. The join keys are
# qualified through the alias names ("spark_df1" / "spark_df2") because
# `spark_df1.name == spark_df2.name` compares one underlying column with
# itself, which is a trivially true predicate.
joined_df = spark_df1.join(
    spark_df2,
    col("spark_df1.name") == col("spark_df2.name"),
    "inner",
)

In [13]:
# Work on independent copies: plain assignment would make pandas_df1,
# pandas_df2 and pandasDF three names for one object, so an in-place change
# to any of them would silently affect the others.
pandas_df1 = pandasDF.copy()
pandas_df2 = pandasDF.copy()

# DataFrame.join without `on` aligns the two frames on their index; rsuffix
# disambiguates the overlapping column names coming from the right frame.
pandas_df1.join(pandas_df2, rsuffix="_df2")

Unnamed: 0,name,age,name_df2,age_df2
0,Amy,25,Amy,25
1,Joe,30,Joe,30
2,John,22,John,22
3,Jane,20,Jane,20
4,Tim,26,Tim,26


In [14]:
# Fetch only the first three rows as a list of Row objects. take(3) limits the
# result on the Spark side, whereas collect()[:3] first transfers every row
# to the driver and only then slices the list.
spark_df1.take(3)

[Row(name='Amy', age=25), Row(name='Joe', age=30), Row(name='John', age=22)]

In [15]:
# First two rows of the pandas frame.
pandas_df1.head(2)

Unnamed: 0,name,age
0,Amy,25
1,Joe,30


In [16]:
# Stack the rows of both aliases. union() does not remove duplicates, so every
# record shows up twice in the result.
df_3 = spark_df1.union(spark_df2)
df_3.show()

+----+---+
|name|age|
+----+---+
| Amy| 25|
| Joe| 30|
|John| 22|
|Jane| 20|
| Tim| 26|
| Amy| 25|
| Joe| 30|
|John| 22|
|Jane| 20|
| Tim| 26|
+----+---+



In [17]:
# pandas counterpart of the Spark union: append the rows of the second frame
# to the first. The original index labels are kept, so 0-4 appear twice.
pd.concat([pandas_df1, pandas_df2])

Unnamed: 0,name,age
0,Amy,25
1,Joe,30
2,John,22
3,Jane,20
4,Tim,26
0,Amy,25
1,Joe,30
2,John,22
3,Jane,20
4,Tim,26


In [18]:
# First two rows, returned to the driver as a list of Row objects.
spark_df1.take(2)

[Row(name='Amy', age=25), Row(name='Joe', age=30)]

In [19]:
# Same people as before, each with a list of friends: (name, age, friends).
nested_data = [
    ("Amy", 25, ["Joe", "Jane"]),
    ("Joe", 30, ["Amy"]),
    ("John", 22, ["Tim"]),
    ("Jane", 20, ["Amy", "Tim"]),
    ("Tim", 26, ["Jane", "John"]),
]


In [20]:
# Distribute the nested records, then wrap them in a DataFrame whose third
# column holds the list of friends.
nested_rdd = sc.parallelize(nested_data)
spark_nested_df = sqlContext.createDataFrame(
    nested_rdd,
    schema=["name", "age", "friends"],
)

In [21]:
# Display the nested frame; `friends` holds a list of names per row.
spark_nested_df.show()

+----+---+------------+
|name|age|     friends|
+----+---+------------+
| Amy| 25| [Joe, Jane]|
| Joe| 30|       [Amy]|
|John| 22|       [Tim]|
|Jane| 20|  [Amy, Tim]|
| Tim| 26|[Jane, John]|
+----+---+------------+



In [22]:
# explode() emits one output row per element of the `friends` list; the
# element is placed in the new `friend` column.
explodeDF = spark_nested_df.withColumn(
    "friend",
    explode(spark_nested_df["friends"]),
)

# Show the result with the original list column, then again without it.
explodeDF.show()
explodeDF.drop("friends").show()

+----+---+------------+------+
|name|age|     friends|friend|
+----+---+------------+------+
| Amy| 25| [Joe, Jane]|   Joe|
| Amy| 25| [Joe, Jane]|  Jane|
| Joe| 30|       [Amy]|   Amy|
|John| 22|       [Tim]|   Tim|
|Jane| 20|  [Amy, Tim]|   Amy|
|Jane| 20|  [Amy, Tim]|   Tim|
| Tim| 26|[Jane, John]|  Jane|
| Tim| 26|[Jane, John]|  John|
+----+---+------------+------+

+----+---+------+
|name|age|friend|
+----+---+------+
| Amy| 25|   Joe|
| Amy| 25|  Jane|
| Joe| 30|   Amy|
|John| 22|   Tim|
|Jane| 20|   Amy|
|Jane| 20|   Tim|
| Tim| 26|  Jane|
| Tim| 26|  John|
+----+---+------+



In [23]:
# Grouping alone computes nothing: it returns a GroupedData object that still
# needs an aggregation (see the next cell).
spark_df1.groupBy("age")

<pyspark.sql.group.GroupedData at 0x111e35dd8>

In [24]:
# Number of rows per distinct age, printed as a table.
(
    spark_df1
    .groupBy("age")
    .count()
    .show()
)

+---+-----+
|age|count|
+---+-----+
| 26|    1|
| 22|    1|
| 25|    1|
| 30|    1|
| 20|    1|
+---+-----+

