### Evaluate Spark context

In [1]:
sc

#### To make any python list to RDD 

In [3]:
simple_RDD = sc.parallelize([1, "Rizvi", 20.5])
simple_RDD

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:184

In [4]:
print(simple_RDD.count())
print(simple_RDD.first())
print(simple_RDD.take(2))
print(simple_RDD.collect())

3
1
[1, 'Rizvi']
[1, 'Rizvi', 20.5]


#### To make any RDD to DataFrame 

In [None]:
# You can NOT convert simple_RDD to DataFrame as this is a one-dymentional array and has no associated schema.
simple_RDD.toDF()

In [6]:
mareix_rdd = sc.parallelize([[1,"Riz", 5.6],[2,"Rid", 10.6]])
mareix_rdd.collect()

[[1, 'Riz', 5.6], [2, 'Rid', 10.6]]

In [7]:
mareix_df = mareix_rdd.toDF()
mareix_df

DataFrame[_1: bigint, _2: string, _3: double]

#### Introducing Row object for DataFrame

In [8]:
from pyspark.sql.types import Row
from datetime import datetime

In [9]:
mareix_rdd = sc.parallelize([Row(Nr=1,Name="Riz",Score=5.6),
                             Row(Nr=2,Name="Rid",Score=10.6)])
mareix_rdd.collect()

[Row(Name='Riz', Nr=1, Score=5.6), Row(Name='Rid', Nr=2, Score=10.6)]

In [10]:
mareix_df = mareix_rdd.toDF()

In [11]:
mareix_df.show()

+----+---+-----+
|Name| Nr|Score|
+----+---+-----+
| Riz|  1|  5.6|
| Rid|  2| 10.6|
+----+---+-----+



#### DataFrame with complex data type

In [12]:
complex_rdd = sc.parallelize([Row(col_list=[1,4,8],
                                  col_dict={"Riz":300, "Rid":600},
                                  col_row=Row(a=5.6, b=4.7, c="beta"),
                                  col_time=datetime(2015,12,31,12,59,59)
                                 ),
                             Row(col_list=[1,4,8],
                                  col_dict={"Riz":300, "Rid":700},
                                  col_row=Row(a=5.6, b=4.7, c="alpha"),
                                  col_time=datetime(2017,12,31,12,59,59))
                             ])

In [13]:
complex_df = complex_rdd.toDF()
complex_df.show()

+--------------------+---------+-----------------+-------------------+
|            col_dict| col_list|          col_row|           col_time|
+--------------------+---------+-----------------+-------------------+
|[Rid -> 600, Riz ...|[1, 4, 8]| [5.6, 4.7, beta]|2015-12-31 12:59:59|
|[Rid -> 700, Riz ...|[1, 4, 8]|[5.6, 4.7, alpha]|2017-12-31 12:59:59|
+--------------------+---------+-----------------+-------------------+



### SQL context

In [14]:
sql_cx = SQLContext(sc) 

In [15]:
df = sql_cx.range(5)

In [16]:
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



#### Create dataframe with toupls 

In [17]:
df = sql_cx.createDataFrame([("Riz",200),("Rid", 500)])
df.show()

+---+---+
| _1| _2|
+---+---+
|Riz|200|
|Rid|500|
+---+---+



#### Adding columns


In [18]:
df = sql_cx.createDataFrame([("Riz",200),("Rid", 500)], ["Names", "Scores"])
df.show()

+-----+------+
|Names|Scores|
+-----+------+
|  Riz|   200|
|  Rid|   500|
+-----+------+



### Printing entire RDD
###### If attempting to print out the elements of an RDD using rdd.foreach(println) or rdd.map(println) 
- On a single machine, this will generate the expected output and print all the RDD’s elements. 
- However, in cluster mode, the output to stdout being called by the executors is now writing to the executor’s stdout instead, not the one on the driver, so stdout on the driver won’t show these! 
- To print all elements on the driver, one can use the collect() method to first bring the RDD to the driver node thus: rdd.collect().foreach(println). This can cause the driver to run out of memory, though, because collect() fetches the entire RDD to a single machine; if you only need to print a few elements of the RDD, a safer approach is to use the take(): rdd.take(100).foreach(println).

In [19]:
data_rdd = sc.parallelize([Row(1,"Riz"),
                           Row(2,"Rid")])
data_rdd.count()

2

In [53]:
# This print will fail : Row objects dont have any column defines
print(data_rdd.collect())

TypeError: sequence item 0: expected str instance, int found

#### Define colulmn with lambda and map() function


In [55]:
column_names = Row("ID", "NAME")
posts = data_rdd.map(lambda r: column_names(*r))
posts

PythonRDD[107] at RDD at PythonRDD.scala:49

In [56]:
print(posts.collect())

[Row(ID=1, NAME='Riz'), Row(ID=2, NAME='Rid')]


In [58]:
stark_df = sql_cx.createDataFrame(posts)
stark_df.show()

+---+----+
| ID|NAME|
+---+----+
|  1| Riz|
|  2| Rid|
+---+----+



#### Interchangable between Spark DataFrame and Pandas DataFrame

In [59]:
import pandas as pd

In [60]:
pandas_df = stark_df.toPandas()
pandas_df

Unnamed: 0,ID,NAME
0,1,Riz
1,2,Rid


In [61]:
sql_cx.createDataFrame(pandas_df).show()


+---+----+
| ID|NAME|
+---+----+
|  1| Riz|
|  2| Rid|
+---+----+

