In [3]:
# Display the SparkContext handle. It is not created in any cell shown here,
# so it is presumably injected by the kernel / PySpark shell startup — TODO confirm.
sc

#### Topics Covered

* **sc: Spark Context**
    * parallelize
    * sqlContext=SQLContext(sc)
* **rdd: resilient distributed datasets**
    * count
    * first
    * take
    * collect
    * toDF
    * map
* **df: DataFrames**
    * show
    * collect
    * rdd
    * select
    * withColumn
    * withColumnRenamed
    * toPandas
* **sqlContext**
    * range
    * createDataFrame

In [4]:
# Distribute a plain Python list as an RDD; the elements here are of mixed types.
mixed_values = [1, 'One', True]
pyRDD = sc.parallelize(mixed_values)
pyRDD

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195

In [5]:
# Run the basic RDD actions one by one, then print every result on a single line.
element_count = pyRDD.count()    # number of elements
head_element = pyRDD.first()     # first element only
leading_pair = pyRDD.take(2)     # first two elements, as a list
all_elements = pyRDD.collect()   # every element, as a Python list
print(element_count, head_element, leading_pair, all_elements)

3 1 [1, 'One'] [1, 'One', True]


In [None]:
# An RDD of bare scalars has no row structure, so toDF() cannot infer a schema.
# The TypeError is the point of this cell: catch it and print it so the
# demonstration is visible without aborting a "Restart & Run All".
try:
    pyDF = pyRDD.toDF()
except TypeError as schema_error:
    print(type(schema_error).__name__ + ':', schema_error)

In [6]:
# An RDD whose elements are row-like (here, equal-length lists) converts to a DataFrame.
# No column names are supplied, so the columns come out as _1, _2, _3.
structured_rows = [
    [0, 'Zero', False],
    [1, 'One', True],
]
structured_rdd = sc.parallelize(structured_rows)
df = structured_rdd.toDF()
df

DataFrame[_1: bigint, _2: string, _3: boolean]

In [7]:
df.show()

+---+----+-----+
| _1|  _2|   _3|
+---+----+-----+
|  0|Zero|false|
|  1| One| true|
+---+----+-----+



In [8]:
from pyspark.sql import Row

# Building the RDD from Row objects gives the DataFrame named columns (i, j, k)
# instead of positional ones.
named_rows = [
    Row(i=0, j='Zero', k=False),
    Row(i=1, j='One', k=True),
]
df = sc.parallelize(named_rows).toDF()
df.show()

+---+----+-----+
|  i|   j|    k|
+---+----+-----+
|  0|Zero|false|
|  1| One| true|
+---+----+-----+



In [9]:
# Import SQLContext explicitly instead of relying on the PySpark shell startup
# having placed it in the namespace; the import is harmless if it already exists.
from pyspark.sql import SQLContext

# Wrap the existing SparkContext in a SQLContext, used below to build DataFrames.
sqlContext=SQLContext(sc)
sqlContext

<pyspark.sql.context.SQLContext at 0x7f5b87f9eef0>

In [10]:
# range(5) yields a single-column DataFrame named "id" holding 0 through 4.
df=sqlContext.range(5)
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [11]:
# Build a DataFrame directly from a local list of tuples plus a list of column names.
column_names = ['Name', 'Score']
data = [
    ('Sachin', 90),
    ('Ganguly', 100),
    ('Dravid', 80),
]
df = sqlContext.createDataFrame(data, column_names)
df.show()

+-------+-----+
|   Name|Score|
+-------+-----+
| Sachin|   90|
|Ganguly|  100|
| Dravid|   80|
+-------+-----+



In [12]:
# Build a DataFrame from an RDD of Row objects; column l holds a list in each row,
# and the two lists have different lengths.
rdd = sc.parallelize([
    Row(i=0, j='Zero', k=False, l=[1, 3, 5, 7, 9]),
    Row(i=1, j='One', k=True, l=[2, 4, 6, 8]),
])
df = sqlContext.createDataFrame(rdd)
df.show()

+---+----+-----+---------------+
|  i|   j|    k|              l|
+---+----+-----+---------------+
|  0|Zero|false|[1, 3, 5, 7, 9]|
|  1| One| true|   [2, 4, 6, 8]|
+---+----+-----+---------------+



In [13]:
# Collect the rows locally, then pick the first row's fourth field (column l).
collected_rows = df.collect()
first_row = collected_rows[0]
list_cell = first_row[3]
list_cell

[1, 3, 5, 7, 9]

In [14]:
# Mutate the collected list in place. Note: re-running this cell appends another 11.
list_cell.append(11)
list_cell

[1, 3, 5, 7, 9, 11]

In [15]:
# The DataFrame is unaffected by mutating list_cell: collect() returned a local
# copy of the cell, so changing that list does not change the DataFrame.
df.show()

+---+----+-----+---------------+
|  i|   j|    k|              l|
+---+----+-----+---------------+
|  0|Zero|false|[1, 3, 5, 7, 9]|
|  1| One| true|   [2, 4, 6, 8]|
+---+----+-----+---------------+



In [16]:
# RDD equivalent of the DataFrame: df.rdd exposes its rows as Row objects.
df.rdd.collect()

[Row(i=0, j='Zero', k=False, l=[1, 3, 5, 7, 9]),
 Row(i=1, j='One', k=True, l=[2, 4, 6, 8])]

In [17]:
# Project two fields (j and k) out of every Row at the RDD level.
def pick_j_and_k(row):
    """Return the (j, k) pair of a Row."""
    return (row.j, row.k)

df.rdd.map(pick_j_and_k).collect()

[('Zero', False), ('One', True)]

In [18]:
# Select specific columns from the DataFrame by name; only i and l are kept.
df.select('i','l').show()

+---+---------------+
|  i|              l|
+---+---------------+
|  0|[1, 3, 5, 7, 9]|
|  1|   [2, 4, 6, 8]|
+---+---------------+



In [19]:
# Apply a per-row operation to column l at the RDD level.
# l is a Python list, so `* 3` repeats the list three times rather than scaling its elements.
def repeat_l_three_times(row):
    """Return the row's l list concatenated with itself three times."""
    return row.l * 3

df.rdd.map(repeat_l_three_times).collect()

[[1, 3, 5, 7, 9, 1, 3, 5, 7, 9, 1, 3, 5, 7, 9],
 [2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8]]

In [20]:
# Derive a new column from k on the DataFrame, then rename that column.
k_only = df.select('k')
with_negation = k_only.withColumn("l", df.k == False)  # true exactly where k is false
toggled = with_negation.withColumnRenamed("l", "toggle")
toggled.show()

+-----+------+
|    k|toggle|
+-----+------+
|false|  true|
| true| false|
+-----+------+



In [21]:
# Rename a column at selection time with Column.alias().
# NOTE(review): "Booleam" looks like a typo for "Boolean" — confirm before changing,
# since the alias is the column name shown in the output.
df.select(df.k.alias("Booleam")).show()

+-------+
|Booleam|
+-------+
|  false|
|   true|
+-------+



In [22]:
import pandas
# Spark DataFrames are built on top of RDDs and distributed across multiple nodes in a Spark cluster;
# a pandas DataFrame lives in memory on a single machine.

In [23]:
# Convert the Spark DataFrame into a pandas DataFrame (held in local memory).
pandas_df=df.toPandas()
pandas_df

Unnamed: 0,i,j,k,l
0,0,Zero,False,"[1, 3, 5, 7, 9]"
1,1,One,True,"[2, 4, 6, 8]"


In [24]:
# Round trip: build a Spark DataFrame back from the pandas DataFrame.
spark_df=sqlContext.createDataFrame(pandas_df)
spark_df.show()

+---+----+-----+---------------+
|  i|   j|    k|              l|
+---+----+-----+---------------+
|  0|Zero|false|[1, 3, 5, 7, 9]|
|  1| One| true|   [2, 4, 6, 8]|
+---+----+-----+---------------+

