In [10]:
# Display the SparkContext bound to `sc`. It is not created in any visible cell,
# so it is presumably pre-defined by the notebook's PySpark environment — confirm.
sc

In [11]:
# Display the object bound to `spark` (presumably the SparkSession supplied by
# the PySpark environment; it is not created in any visible cell — confirm).
spark

In [12]:
# Any Python list can be turned into an RDD with parallelize().
# The elements here are bare scalars of mixed type (int, str, int).
simple_data = sc.parallelize([1, 'Alice', 50])
# The repr shows the RDD handle only, not its contents.
simple_data

ParallelCollectionRDD[8] at parallelize at PythonRDD.scala:194

In [13]:
# collect() returns every element of the RDD as a Python list.
simple_data.collect()

[1, 'Alice', 50]

In [14]:
# first() returns only the first element of the RDD.
simple_data.first()

1

In [15]:
# take(2) returns the first two elements as a list.
simple_data.take(2)

[1, 'Alice']

In [16]:
from pyspark.sql.types import Row
from datetime import datetime

In [17]:
# Convert an RDD to a DataFrame.
# simple_data holds bare scalars rather than row-like records, so Spark has no
# schema to infer and toDF() raises TypeError. The error is the point of this
# cell, so it is caught and printed; an uncaught exception would abort
# "Restart Kernel -> Run All" before the remaining cells execute.
try:
    df = simple_data.toDF()
except TypeError as exc:
    print("TypeError:", exc)

TypeError: Can not infer schema for type: <class 'int'>

In [18]:
# An RDD whose elements are lists: each inner list is one record
# with three positional fields.
records = sc.parallelize([[1, 'Alice', 50],[2, 'Bob', 100]])
records

ParallelCollectionRDD[16] at parallelize at PythonRDD.scala:194

In [19]:
# Fetch the first two records to inspect their structure.
records.take(2)

[[1, 'Alice', 50], [2, 'Bob', 100]]

In [20]:
# The bare name shows the RDD handle again; no data is fetched by this cell.
records

ParallelCollectionRDD[16] at parallelize at PythonRDD.scala:194

In [21]:
# count() returns the number of elements (records) in the RDD.
records.count()

2

In [22]:
# toDF() works here because every element is a list (one row per list).
# No schema is passed, so column types are inferred from the items in the lists.
records_data_Frame = records.toDF()

In [23]:
# The DataFrame repr lists the column names together with their inferred types.
records_data_Frame

DataFrame[_1: bigint, _2: string, _3: bigint]

In [24]:
# No column names were supplied, so positional names (_1, _2, _3) are generated.
records_data_Frame.show()

+---+-----+---+
| _1|   _2| _3|
+---+-----+---+
|  1|Alice| 50|
|  2|  Bob|100|
+---+-----+---+



In [25]:
# Build an RDD of Row objects so that every field carries a name
# (id, name, score) instead of only a position.
data = sc.parallelize([
    Row(id=1, name='Alice', score=50),
    Row(id=2, name='Bob', score=40),
    Row(id=3, name='Charlee', score=75),
])
# Only the last expression of a cell is displayed. The former data.take(2)
# call ran a Spark job whose result was discarded, so it has been removed.
data.collect()

[Row(id=1, name='Alice', score=50),
 Row(id=2, name='Bob', score=40),
 Row(id=3, name='Charlee', score=75)]

In [26]:
# Convert the RDD of Row objects to a DataFrame.
# NOTE: this rebinds `records`, which an earlier cell bound to an RDD of lists.
records = data.toDF()

In [27]:
# Print the DataFrame; the header shows the column names it ended up with.
records.show()

+---+-------+-----+
| id|   name|score|
+---+-------+-----+
|  1|  Alice|   50|
|  2|    Bob|   40|
|  3|Charlee|   75|
+---+-------+-----+



In [28]:
# Rows may hold nested values, not just scalars. Every Row below carries
# a list, a dictionary, a nested Row and a datetime object.
complex_data = sc.parallelize([
    Row(
        col_list=[1, 2, 3, 4],
        col_dict={"k1": 0},
        col_row=Row(a=10, b=20, c=30),
        col_time=datetime(2019, 3, 2, 14, 1, 5),
    ),
    Row(
        col_list=[1, 2, 3, 4, 5],
        col_dict={"k1": 0, "k2": 1},
        col_row=Row(a=10, b=20, c=30),
        col_time=datetime(2019, 3, 2, 14, 1, 5),
    ),
    Row(
        col_list=[1, 2, 3, 4, 5],
        col_dict={"k1": 0, "k2": 1, "k3": 2},
        col_row=Row(a=10, b=20, c=30),
        col_time=datetime(2019, 3, 2, 14, 1, 5),
    ),
])
# Convert to a DataFrame and print it.
df = complex_data.toDF()
df.show()

+--------------------+---------------+------------+-------------------+
|            col_dict|       col_list|     col_row|           col_time|
+--------------------+---------------+------------+-------------------+
|           [k1 -> 0]|   [1, 2, 3, 4]|[10, 20, 30]|2019-03-02 14:01:05|
|  [k1 -> 0, k2 -> 1]|[1, 2, 3, 4, 5]|[10, 20, 30]|2019-03-02 14:01:05|
|[k3 -> 2, k1 -> 0...|[1, 2, 3, 4, 5]|[10, 20, 30]|2019-03-02 14:01:05|
+--------------------+---------------+------------+-------------------+



In [29]:
# Create a SQLContext on top of the existing SparkContext.
# SQLContext is imported explicitly here: none of the visible cells import it,
# so the cell would otherwise depend on the environment having pre-loaded the name.
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)
sqlContext

<pyspark.sql.context.SQLContext at 0x2e97099ac18>

In [30]:
# range(5) builds a one-column DataFrame; the printed table shows a column
# named "id" with the values 0 through 4.
# NOTE: this rebinds `df`, which an earlier cell also assigned.
df = sqlContext.range(5)
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [31]:
# Plain Python data (a list of [name, score] lists) that a later cell turns
# into a DataFrame. Nothing Spark-related happens in this cell.
# NOTE: this rebinds `data`, which an earlier cell bound to an RDD of Rows.
data = [
    ['Alice', 40],
    ['Bob', 50],
    ['Charlie', 60],
]

In [32]:
# Build a DataFrame straight from the local list of [name, score] records,
# supplying the column names explicitly, and print it.
sqlContext.createDataFrame(
    data,
    schema=['Name', 'Score'],
).show()

+-------+-----+
|   Name|Score|
+-------+-----+
|  Alice|   40|
|    Bob|   50|
|Charlie|   60|
+-------+-----+



In [41]:
# Build a DataFrame with nested column types directly from a local list of
# Rows (no intermediate RDD). Each Row holds a list, a dict, a nested Row
# and a datetime.
complex_data_for_sql = [
    Row(
        col_list=[1, 2, 3, 4],
        col_dict={"k1": 0},
        col_row=Row(a=10, b=20, c=30),
        col_time=datetime(2019, 3, 2, 14, 1, 5),
    ),
    Row(
        col_list=[4, 5, 3, 4, 5],
        col_dict={"k1": 0, "k2": 1},
        col_row=Row(a=10, b=20, c=30),
        col_time=datetime(2019, 3, 2, 14, 1, 5),
    ),
    Row(
        col_list=[7, 8, 3, 4, 5],
        col_dict={"k1": 0, "k2": 1, "k3": 2},
        col_row=Row(a=10, b=20, c=30),
        col_time=datetime(2019, 3, 2, 14, 1, 5),
    ),
]
complex_dataframe = sqlContext.createDataFrame(complex_data_for_sql)
complex_dataframe.show()

+--------------------+---------------+------------+-------------------+
|            col_dict|       col_list|     col_row|           col_time|
+--------------------+---------------+------------+-------------------+
|           [k1 -> 0]|   [1, 2, 3, 4]|[10, 20, 30]|2019-03-02 14:01:05|
|  [k1 -> 0, k2 -> 1]|[4, 5, 3, 4, 5]|[10, 20, 30]|2019-03-02 14:01:05|
|[k3 -> 2, k1 -> 0...|[7, 8, 3, 4, 5]|[10, 20, 30]|2019-03-02 14:01:05|
+--------------------+---------------+------------+-------------------+



In [42]:
# Select several columns by name; the result contains only the named
# columns, in the order they are listed.
complex_dataframe.select('col_dict', 'col_list').show()

+--------------------+---------------+
|            col_dict|       col_list|
+--------------------+---------------+
|           [k1 -> 0]|   [1, 2, 3, 4]|
|  [k1 -> 0, k2 -> 1]|[4, 5, 3, 4, 5]|
|[k3 -> 2, k1 -> 0...|[7, 8, 3, 4, 5]|
+--------------------+---------------+



In [36]:
# Read a single cell: row 0, column "col_list".
# The column is addressed by name rather than by position because the
# positional order of Row fields depends on the Spark version: fields given as
# keyword arguments were sorted alphabetically before Spark 3.0 and keep their
# declared order from 3.0 on, so index 1 is not guaranteed to be col_list.
# collect() returns a local copy of the data, so the append below changes only
# that copy.
listitem = complex_dataframe.collect()[0]["col_list"]
listitem.append(100)
listitem

[1, 2, 3, 4, 100]

In [37]:
# The collected list was modified in the previous cell, but that change does
# not reach the DataFrame: RDDs/DataFrames are immutable, and the collected
# value was only a copy. The original cell is therefore unchanged.
complex_dataframe.show()

+--------------------+---------------+------------+-------------------+
|            col_dict|       col_list|     col_row|           col_time|
+--------------------+---------------+------------+-------------------+
|           [k1 -> 0]|   [1, 2, 3, 4]|[10, 20, 30]|2019-03-02 14:01:05|
|  [k1 -> 0, k2 -> 1]|[1, 2, 3, 4, 5]|[10, 20, 30]|2019-03-02 14:01:05|
|[k3 -> 2, k1 -> 0...|[1, 2, 3, 4, 5]|[10, 20, 30]|2019-03-02 14:01:05|
+--------------------+---------------+------------+-------------------+



In [45]:
# Add a computed column: keep col_list and col_row, then append col_sum,
# the sum of the first two elements of col_list.
# NOTE: complex_dataframe is rebound to this narrower result; the right-hand
# side is evaluated against the DataFrame as it was before the assignment.
complex_dataframe = (
    complex_dataframe
    .select("col_list", "col_row")
    .withColumn(
        "col_sum",
        complex_dataframe["col_list"][0] + complex_dataframe["col_list"][1],
    )
)
complex_dataframe.show()

+---------------+------------+-------+
|       col_list|     col_row|col_sum|
+---------------+------------+-------+
|   [1, 2, 3, 4]|[10, 20, 30]|      3|
|[4, 5, 3, 4, 5]|[10, 20, 30]|      9|
|[7, 8, 3, 4, 5]|[10, 20, 30]|     15|
+---------------+------------+-------+



In [47]:
# Rename a column: col_sum becomes sum_of_2_elements_of_list.
# The result is bound back to the same name and printed.
complex_dataframe = complex_dataframe.withColumnRenamed(
    'col_sum',
    'sum_of_2_elements_of_list',
)
complex_dataframe.show()

+---------------+------------+-------------------------+
|       col_list|     col_row|sum_of_2_elements_of_list|
+---------------+------------+-------------------------+
|   [1, 2, 3, 4]|[10, 20, 30]|                        3|
|[4, 5, 3, 4, 5]|[10, 20, 30]|                        9|
|[7, 8, 3, 4, 5]|[10, 20, 30]|                       15|
+---------------+------------+-------------------------+

