# Creating DataFrames

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

In [2]:
myspark=SparkSession.builder.appName("Spark_DF_Operations").master("yarn")\
    .config("spark.executor.memory","4g")\
    .enableHiveSupport().getOrCreate()

## 1. Using List of Tuples [("Naresh",25)] or List of Lists [["Naresh",25]] or List of Dict [{"name":"naresh", "age":25}]


### Using List of Tuples

In [3]:
l = [('Naresh',25),('Ravi',22),('Bhanu',20),('Akash',26)]
print "\nType of Record is : ", type(('Naresh',25)) , "\n"
df = myspark.createDataFrame(l)
df.show()


Type of Record is :  <type 'tuple'> 

+------+---+
|    _1| _2|
+------+---+
|Naresh| 25|
|  Ravi| 22|
| Bhanu| 20|
| Akash| 26|
+------+---+



### Using List of Lists

In [4]:
l = [["naresh","jangra",33,170.5],["ravi","verma",30,150.5]]
print "\nType of Record is : ", type(["naresh","jangra",33,170.5]) , "\n"
myspark.createDataFrame(l).show()


Type of Record is :  <type 'list'> 

+------+------+---+-----+
|    _1|    _2| _3|   _4|
+------+------+---+-----+
|naresh|jangra| 33|170.5|
|  ravi| verma| 30|150.5|
+------+------+---+-----+



### Using List of Dicts

In [5]:
# l = [{"fname":"naresh", "lname":"jangra", "age": 33, "height": 170.5},{"fname":"ravi", "lname":"verma", "age": 53, "height": 150.6}]
# print "\nType of Record is : ", type({"fname":"naresh", "lname":"jangra", "age": 33, "height": 170.5}) , "\n"
# myspark.createDataFrame(l).show()

#1. Column must have the same type of data. For eq. for first record height is DoubleType. It should be Double for all records.
# Having int height (150) will cause DF creation failure saying: 
# TypeError: Can not merge type <class 'pyspark.sql.types.DoubleType'> and <class 'pyspark.sql.types.LongType'>

#2. /opt/cloudera/parcels/SPARK2-2.1.0.cloudera1-1.cdh5.7.0.p0.120904/lib/spark2/python/pyspark/sql/session.py:320: 
# UserWarning: inferring schema from dict is deprecated,please use pyspark.sql.Row instead
#  warnings.warn("inferring schema from dict is deprecated,"

Error if we have the string ("Naresh") instead of Tuple ("Naresh",25)

## 2. Using List of pyspark.sql.Row() Objects


A row in DataFrame. The fields in it can be accessed: like attributes (row.key) , like dictionary values (row[key])
Row can be used to create a row object by using named arguments

In [6]:
from pyspark.sql import Row
row = Row(name="Naresh", age=34)
print "\nName is : " , row.name
print "\nAge is : " ,row["age"]


Name is :  Naresh

Age is :  34


Row also can be used to create another Row like class, then it could be used to create Row objects, such as


In [7]:
Person = Row("name", "age")
Person("Naresh", 34)

Row(name='Naresh', age=34)

### Create the Dataframe from List of Row()

In [8]:
rowlist = [Row(name = "naresh", age = 33),Row(name = "ravi", age = 44)]
df = myspark.createDataFrame(rowlist)
df.show()

+---+------+
|age|  name|
+---+------+
| 33|naresh|
| 44|  ravi|
+---+------+



## 3. Using Row and list separately


In [9]:
# How to Access Tupe
tup = ("naresh",22)
print tup[0], tup[1]

naresh 22


In [10]:
l = [('Naresh',25),('Ravi',22),('Bhanu',20),('Akash',26)]
rdd = myspark.sparkContext.parallelize(l)
data = rdd.map(lambda x: Row(name=x[0], age=x[1]))
df = myspark.createDataFrame(data)
df.show()

+---+------+
|age|  name|
+---+------+
| 25|Naresh|
| 22|  Ravi|
| 20| Bhanu|
| 26| Akash|
+---+------+



## 4. Using data files of Different Formats

The data files used below is available in this repo under data folder. LOAD this file in the HDFS at /tmp directory

### Formats: csv, tsv, psv

Below is an example for csv data file. For tsv(Table separated) and psv(pipe separated) data just use option sep=" ".

In [11]:
df = myspark.read.csv("/tmp/sampledata.csv", \
                header=True, inferSchema=True, dateFormat="yyyy-MM-dd", timestampFormat="yyyy-MM-dd HH:mm:ss")

df.show(5,False)

+-------+-------+---+------+---------------------+---------------------+
|fname  |lname  |age|height|dated                |timing               |
+-------+-------+---+------+---------------------+---------------------+
|naresh |jangra |30 |170.5 |2013-10-12 00:00:00.0|2013-10-12 12:35:50.0|
|ravi   |verma  |35 |155.67|2014-10-12 00:00:00.0|2014-10-12 01:55:50.0|
|viren  | nain  |55 |160.0 |2015-10-12 00:00:00.0|2015-10-12 09:15:50.0|
|bhanu  | pratap|11 |180.8 |2016-10-12 00:00:00.0|2016-10-12 10:05:50.0|
+-------+-------+---+------+---------------------+---------------------+



OR

In [12]:
df = myspark.read.format("csv").option("header",True).option("inferSchema",True)\
    .option("dateFormat","yyyy-MM-dd").option("timestampFormat","yyyy-MM-dd HH:mm:ss")\
    .option("path","/tmp/sampledata.csv").load()
    
df.show(10,False)

+-------+-------+---+------+---------------------+---------------------+
|fname  |lname  |age|height|dated                |timing               |
+-------+-------+---+------+---------------------+---------------------+
|naresh |jangra |30 |170.5 |2013-10-12 00:00:00.0|2013-10-12 12:35:50.0|
|ravi   |verma  |35 |155.67|2014-10-12 00:00:00.0|2014-10-12 01:55:50.0|
|viren  | nain  |55 |160.0 |2015-10-12 00:00:00.0|2015-10-12 09:15:50.0|
|bhanu  | pratap|11 |180.8 |2016-10-12 00:00:00.0|2016-10-12 10:05:50.0|
+-------+-------+---+------+---------------------+---------------------+



OR

In [13]:
df = myspark.read.format("csv").options(header=True, inferSchema=True, sep=",",
    dateFormat="yyyy-MM-dd", timestampFormat="yyyy-MM-dd HH:mm:ss", ignoreLeadingWhiteSpace=True,\
    ignoreTrailingWhiteSpace=True, path="/tmp/sampledata.csv").load()

df.show(10,False)

+------+------+---+------+---------------------+---------------------+
|fname |lname |age|height|dated                |timing               |
+------+------+---+------+---------------------+---------------------+
|naresh|jangra|30 |170.5 |2013-10-12 00:00:00.0|2013-10-12 12:35:50.0|
|ravi  |verma |35 |155.67|2014-10-12 00:00:00.0|2014-10-12 01:55:50.0|
|viren |nain  |55 |160.0 |2015-10-12 00:00:00.0|2015-10-12 09:15:50.0|
|bhanu |pratap|11 |180.8 |2016-10-12 00:00:00.0|2016-10-12 10:05:50.0|
+------+------+---+------+---------------------+---------------------+



Schema inferred using "inferSchema=True"

In [14]:
df.printSchema()

root
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- height: double (nullable = true)
 |-- dated: timestamp (nullable = true)
 |-- timing: timestamp (nullable = true)



### Formats: text
    
    You can simply use spark.read.csv() and just set the sep " ".

In [15]:
# Removing the Header First
file = myspark.sparkContext.textFile("/tmp/sampledata.txt")
fileheader = file.first()
header = myspark.sparkContext.parallelize([fileheader])
rdd = file.subtract(header)

# Loading the data in required format. You can always change cast the Datatype later
line = rdd.map(lambda x: x.split(" ")).map(lambda t: (t[0],t[1],int(t[2]),float(t[3]),t[4],t[5]))
df = myspark.createDataFrame(line,["name","gender","age","height","dated","timing"])
df.show()

+------+------+---+------+----------+----------+
|  name|gender|age|height|     dated|    timing|
+------+------+---+------+----------+----------+
|  ravi| verma| 35|155.67|2014-10-12|2014-10-12|
| bhanu|pratap| 11| 180.8|2016-10-12|2016-10-12|
| viren|  nain| 55| 160.0|2015-10-12|2015-10-12|
|naresh|jangra| 30| 170.5|2013-10-12|2013-10-12|
+------+------+---+------+----------+----------+



### Format: ORC and Parquet

Saving the previous df as parquet and orc formats in HDFS

In [16]:
df.write.save(path = "/tmp/sampledata_parq", format="parquet", mode="overwrite")
df.write.save(path = "/tmp/sampledata_orc", format="orc", mode="overwrite")

Reading the Parquet Data as DataFrame

In [17]:
df = myspark.read.parquet("/tmp/sampledata_parq")
df.show(10,False)

+------+------+---+------+----------+----------+
|name  |gender|age|height|dated     |timing    |
+------+------+---+------+----------+----------+
|viren |nain  |55 |160.0 |2015-10-12|2015-10-12|
|naresh|jangra|30 |170.5 |2013-10-12|2013-10-12|
|bhanu |pratap|11 |180.8 |2016-10-12|2016-10-12|
|ravi  |verma |35 |155.67|2014-10-12|2014-10-12|
+------+------+---+------+----------+----------+



Schema information will be taken from Parquet. You can cast to required Datatype.

In [18]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: long (nullable = true)
 |-- height: double (nullable = true)
 |-- dated: string (nullable = true)
 |-- timing: string (nullable = true)



Reading the Orc Data as DataFrame

In [19]:
df = myspark.read.orc("/tmp/sampledata_orc")
df.show(10,False)

+------+------+---+------+----------+----------+
|name  |gender|age|height|dated     |timing    |
+------+------+---+------+----------+----------+
|viren |nain  |55 |160.0 |2015-10-12|2015-10-12|
|naresh|jangra|30 |170.5 |2013-10-12|2013-10-12|
|bhanu |pratap|11 |180.8 |2016-10-12|2016-10-12|
|ravi  |verma |35 |155.67|2014-10-12|2014-10-12|
+------+------+---+------+----------+----------+



Schema information will be taken from Orc. You can cast to required Datatype.

In [20]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: long (nullable = true)
 |-- height: double (nullable = true)
 |-- dated: string (nullable = true)
 |-- timing: string (nullable = true)



### Format: json

In [21]:
df = myspark.read.json("/tmp/sampledata.json")
df.show()

+---+----------+------+------+------+-------------------+
|age|     dated| fname|height| lname|             timing|
+---+----------+------+------+------+-------------------+
| 30|2013-10-12|naresh| 170.5|jangra|2013-10-12 12:35:50|
| 35|2014-10-12|  ravi|155.67| verma|2014-10-12 01:55:50|
| 55|2015-10-12| viren| 160.0|  nain|2015-10-12 09:15:50|
| 11|2016-10-12| bhanu| 180.8|pratap|2016-10-12 10:05:50|
+---+----------+------+------+------+-------------------+



Or just like csv data reading

In [22]:
df = myspark.read.format("json").options(header=True, inferSchema=True,
    dateFormat="yyyy-MM-dd", timestampFormat="yyyy-MM-dd HH:mm:ss", ignoreLeadingWhiteSpace=True,\
    ignoreTrailingWhiteSpace=True, path="/tmp/sampledata.json").load()

df.show(10,False)

+---+----------+------+------+------+-------------------+
|age|dated     |fname |height|lname |timing             |
+---+----------+------+------+------+-------------------+
|30 |2013-10-12|naresh|170.5 |jangra|2013-10-12 12:35:50|
|35 |2014-10-12|ravi  |155.67|verma |2014-10-12 01:55:50|
|55 |2015-10-12|viren |160.0 |nain  |2015-10-12 09:15:50|
|11 |2016-10-12|bhanu |180.8 |pratap|2016-10-12 10:05:50|
+---+----------+------+------+------+-------------------+



Schema information will be inferred(as much possible). You can cast to required Datatype.

In [23]:
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- dated: string (nullable = true)
 |-- fname: string (nullable = true)
 |-- height: double (nullable = true)
 |-- lname: string (nullable = true)
 |-- timing: string (nullable = true)



## 4. Using Hive Tables
    
Simply read any existing table using myspark.sql



# Applying SCHEMA on the Data


When we use pyspark.sql.Row(), or read data from any data file source, we can always get/infer the Schema(). 

Sometime, we may need to apply Schema separately.

## 1. Schema by passing a List of Col Names.

When schema is a list of column names, the type of each column will be inferred from data.

Let us apply this on the same df we generated in the very first step

In [24]:
#data = [["naresh","jangra",33,170.5],["Ravi","verma",20,150.5]]
#or
data = [("naresh","jangra",33,170.5),("Ravi","verma",20,150.5)]
type(("naresh","jangra",33,170.5))

tuple

In [25]:
df = myspark.createDataFrame(data,["fname","lname","age","height"])
df.show()

+------+------+---+------+
| fname| lname|age|height|
+------+------+---+------+
|naresh|jangra| 33| 170.5|
|  Ravi| verma| 20| 150.5|
+------+------+---+------+



In [26]:
df.printSchema()

root
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- age: long (nullable = true)
 |-- height: double (nullable = true)



## 2. Passing Scehma using pyspark.sql.types.StructType()

In [27]:
data = [["naresh","jangra",33,170.5],["Ravi","verma",20,150.5]]

In [28]:
#from pyspark.sql import types as T
schema = T.StructType([
                T.StructField("fname", T.StringType(), True),
                T.StructField("lname", T.StringType(), True),
                T.StructField("age", T.IntegerType(), True),
                T.StructField("height", T.DoubleType(), True)
            ])

In [29]:
df = myspark.createDataFrame(data,schema)
df.show()

+------+------+---+------+
| fname| lname|age|height|
+------+------+---+------+
|naresh|jangra| 33| 170.5|
|  Ravi| verma| 20| 150.5|
+------+------+---+------+



In [30]:
df.printSchema()

root
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- height: double (nullable = true)



## 3. Inferring Scehma from Data File (with Seperators like "," , "|", "\t", " " etc)

    We have already done this.

In [31]:
df = myspark.read.format("csv").options(header=True, inferSchema=True, sep=" ",
    dateFormat="yyyy-MM-dd", timestampFormat="yyyy-MM-dd HH:mm:ss", ignoreLeadingWhiteSpace=True,\
    ignoreTrailingWhiteSpace=True, path="/tmp/sampledata.txt").load()

df.show(10,False)

+------+------+---+------+---------------------+---------------------+
|fname |lname |age|height|dated                |timing               |
+------+------+---+------+---------------------+---------------------+
|naresh|jangra|30 |170.5 |2013-10-12 00:00:00.0|2013-10-12 00:00:00.0|
|ravi  |verma |35 |155.67|2014-10-12 00:00:00.0|2014-10-12 00:00:00.0|
|viren |nain  |55 |160.0 |2015-10-12 00:00:00.0|2015-10-12 00:00:00.0|
|bhanu |pratap|11 |180.8 |2016-10-12 00:00:00.0|2016-10-12 00:00:00.0|
+------+------+---+------+---------------------+---------------------+



In [32]:
df.printSchema()

root
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- height: double (nullable = true)
 |-- dated: timestamp (nullable = true)
 |-- timing: timestamp (nullable = true)



You can always do the formating (Changing format of timing Column) and casting (from timestamp to DateType)

In [33]:
df = df.withColumn("dated", F.to_date("dated").cast(T.DateType()))\
 .withColumn("timing", F.from_unixtime(F.unix_timestamp("timing"),"yyyy-MM-dd HH:mm:ss"))

In [34]:
df.show(5,False)

+------+------+---+------+----------+-------------------+
|fname |lname |age|height|dated     |timing             |
+------+------+---+------+----------+-------------------+
|naresh|jangra|30 |170.5 |2013-10-12|2013-10-12 00:00:00|
|ravi  |verma |35 |155.67|2014-10-12|2014-10-12 00:00:00|
|viren |nain  |55 |160.0 |2015-10-12|2015-10-12 00:00:00|
|bhanu |pratap|11 |180.8 |2016-10-12|2016-10-12 00:00:00|
+------+------+---+------+----------+-------------------+



# What's Next

1) To Download this Single Notebook, go to Click this file in my Github Account, Copy the URL and paste in http://nbviewer.jupyter.org/. Download button will be in top right corner.

2) Open your Juypter Notebook home page and upload using "upload" Button.

3) Continue Learning from the next Notebook Spark_03_Saving_DataFrames.ipynb