In [1]:
# Import Required modules

from __future__ import print_function
# NOTE: findspark.init() runs before `import pyspark` here; keep this order if
# the imports are ever re-sorted (presumably it is what makes pyspark
# importable in this environment -- confirm against the findspark docs).
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Obtain the notebook's SparkSession, registered under the app name "View Dataframes".
spark = (
    SparkSession.builder
    .appName("View Dataframes")
    .getOrCreate()
)

## Create DataFrame

In [3]:
from datetime import date, datetime

# Three sample rows; the tuple positions line up with col1..col5 in the schema below.
data = [
    (1, 1.0, "abc", date(2020, 1, 1), datetime(2020, 1, 1, 12, 0)),
    (2, 2.0, "xyz", date(2020, 2, 1), datetime(2020, 1, 2, 12, 0)),
    (3, 3.0, "a12", date(2020, 3, 1), datetime(2020, 1, 3, 12, 0)),
]

# Column names and types are spelled out in a schema string instead of being inferred.
df = spark.createDataFrame(
    data,
    schema="col1 long, col2 double, col3 string, col4 date, col5 timestamp",
)

In [4]:
# A bare `df` only shows the column names and types (see output), not the rows.
df

DataFrame[col1: bigint, col2: double, col3: string, col4: date, col5: timestamp]

## View DataFrame

In [5]:
# show() prints the rows as a text table.
df.show()

+----+----+----+----------+-------------------+
|col1|col2|col3|      col4|               col5|
+----+----+----+----------+-------------------+
|   1| 1.0| abc|2020-01-01|2020-01-01 12:00:00|
|   2| 2.0| xyz|2020-02-01|2020-01-02 12:00:00|
|   3| 3.0| a12|2020-03-01|2020-01-03 12:00:00|
+----+----+----+----------+-------------------+



In [6]:
# Print just the first row; truncate=False asks show() not to shorten cell values.
df.show(n=1, truncate=False)

+----+----+----+----------+-------------------+
|col1|col2|col3|col4      |col5               |
+----+----+----+----------+-------------------+
|1   |1.0 |abc |2020-01-01|2020-01-01 12:00:00|
+----+----+----+----------+-------------------+
only showing top 1 row



### Enable the 'spark.sql.repl.eagerEval.enabled' configuration to eagerly evaluate PySpark DataFrames in notebooks such as Jupyter.

### The number of rows to show can be controlled via the 'spark.sql.repl.eagerEval.maxNumRows' configuration.

In [7]:
# With eager evaluation on, a bare `df` in a later cell renders its rows
# instead of the schema-only repr.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

In [8]:
# Eager evaluation is enabled at this point, so the bare expression displays the rows.
df

col1,col2,col3,col4,col5
1,1.0,abc,2020-01-01,2020-01-01 12:00:00
2,2.0,xyz,2020-02-01,2020-01-02 12:00:00
3,3.0,a12,2020-03-01,2020-01-03 12:00:00


In [10]:
# Vertical layout prints one "column | value" line per field, which helps when
# a row is too wide to read as a single horizontal line.
df.show(n=1, vertical=True)

-RECORD 0-------------------
 col1 | 1                   
 col2 | 1.0                 
 col3 | abc                 
 col4 | 2020-01-01          
 col5 | 2020-01-01 12:00:00 
only showing top 1 row



## View Schema

In [11]:
# Print the schema as a tree: one line per column with its type and nullability.
df.printSchema()

root
 |-- col1: long (nullable = true)
 |-- col2: double (nullable = true)
 |-- col3: string (nullable = true)
 |-- col4: date (nullable = true)
 |-- col5: timestamp (nullable = true)



In [12]:
# List of (column name, type string) pairs. Note that col1 is reported as
# 'bigint' here, whereas printSchema() prints the same column as 'long'.
df.dtypes

[('col1', 'bigint'),
 ('col2', 'double'),
 ('col3', 'string'),
 ('col4', 'date'),
 ('col5', 'timestamp')]

In [13]:
# Column names only, as a plain Python list.
df.columns

['col1', 'col2', 'col3', 'col4', 'col5']

## View Summary of the DataFrame

In [14]:
# describe() produces a summary table (count, mean, stddev, min, max).
# In the output below only col1-col3 are summarised; the date and timestamp
# columns (col4, col5) do not appear.
df.describe().show()

+-------+----+----+----+
|summary|col1|col2|col3|
+-------+----+----+----+
|  count|   3|   3|   3|
|   mean| 2.0| 2.0|null|
| stddev| 1.0| 1.0|null|
|    min|   1| 1.0| a12|
|    max|   3| 3.0| xyz|
+-------+----+----+----+



## Convert DataFrame to Pandas

In [15]:
# Convert to a pandas DataFrame. NOTE: per the PySpark docs this loads every
# row into the driver's memory -- fine for these 3 rows, but limit or sample
# first on large data.
df.toPandas()

Unnamed: 0,col1,col2,col3,col4,col5
0,1,1.0,abc,2020-01-01,2020-01-01 12:00:00
1,2,2.0,xyz,2020-02-01,2020-01-02 12:00:00
2,3,3.0,a12,2020-03-01,2020-01-03 12:00:00


## Selecting and Accessing Data

In [21]:
# Attribute access yields a Column object (see output), not the column's values.
df.col3

Column<'col3'>

In [22]:
# Confirm the class of the object returned by attribute access.
type(df.col3)

pyspark.sql.column.Column

In [24]:
# Project two columns by passing Column objects; df["name"] is the item-access
# spelling of df.name.
df.select(df["col1"], df["col3"]).show()

+----+----+
|col1|col3|
+----+----+
|   1| abc|
|   2| xyz|
|   3| a12|
+----+----+



DataFrame.select() takes Column instances (or column names) and returns another DataFrame.

In [26]:
# select() also accepts plain column-name strings.
df.select("col1", "col2").show()

+----+----+
|col1|col2|
+----+----+
|   1| 1.0|
|   2| 2.0|
|   3| 3.0|
+----+----+



In [27]:
# Keep the wanted column names in a list so the selection can be reused;
# select() accepts the list directly as well as unpacked names.
cols = ["col1", "col3"]
df.select(cols).show()

+----+----+
|col1|col3|
+----+----+
|   1| abc|
|   2| xyz|
|   3| a12|
+----+----+

