In [1]:
# Get (or create) the SparkSession; the `spark` handle is used by the cells below.
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()


In [2]:
# Create a DataFrame from a list of Row objects; column names come from the
# Row field names and the column types are inferred from the Python values.
from datetime import datetime, date
import pandas as pd # type: ignore
from pyspark.sql import Row

# The three rows deliberately match the data used by the tuple-based and
# pandas-based constructions further down, so all three approaches yield the
# same DataFrame.
df = spark.createDataFrame([
    Row(a=1, b=2., c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
    Row(a=2, b=3., c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
    Row(a=3, b=4., c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0))
])
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [3]:
# Print the schema as a tree: one line per column with its type and nullability.
df.printSchema()

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [4]:
# Create a DataFrame from plain tuples, passing an explicit schema string of
# "name type" pairs rather than relying on type inference.
df = spark.createDataFrame(
    [
        (1, 2.0, "string1", date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
        (2, 3.0, "string2", date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
        (3, 4.0, "string3", date(2000, 3, 1), datetime(2000, 1, 3, 12, 0)),
    ],
    schema="a long, b double, c string, d date, e timestamp",
)
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [5]:
# Build the same data as a pandas DataFrame (one list per column), then hand it
# to Spark to obtain a PySpark DataFrame.
pandas_df = pd.DataFrame(
    {
        "a": [1, 2, 3],
        "b": [2.0, 3.0, 4.0],
        "c": ["string1", "string2", "string3"],
        "d": [date(2000, 1, 1), date(2000, 2, 1), date(2000, 3, 1)],
        "e": [
            datetime(2000, 1, 1, 12, 0),
            datetime(2000, 1, 2, 12, 0),
            datetime(2000, 1, 3, 12, 0),
        ],
    }
)
df = spark.createDataFrame(pandas_df)
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [6]:
# The DataFrames created above share the same schema; display the contents and
# schema of the most recently created one.
df.show()
df.printSchema()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



# Viewing the data

In [7]:
# DataFrame.show(n) prints the top n rows as a text table; here only the first row.
df.show(1)

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
+---+---+-------+----------+-------------------+
only showing top 1 row



In [8]:
# Alternatively, enable the spark.sql.repl.eagerEval.enabled configuration so
# that a bare DataFrame expression is evaluated eagerly and rendered as a table
# in notebooks such as Jupyter. The number of rows shown can be controlled via
# the spark.sql.repl.eagerEval.maxNumRows configuration.

spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
df


a,b,c,d,e
1,2.0,string1,2000-01-01,2000-01-01 12:00:00
2,3.0,string2,2000-02-01,2000-01-02 12:00:00
3,4.0,string3,2000-03-01,2000-01-03 12:00:00


In [9]:
# Rows can also be shown vertically, one "column | value" line per field.
# This is useful when rows are too wide to display horizontally.
df.show(1, vertical=True)

-RECORD 0------------------
 a   | 1                   
 b   | 2.0                 
 c   | string1             
 d   | 2000-01-01          
 e   | 2000-01-01 12:00:00 
only showing top 1 row



In [10]:
# List the DataFrame's column names.
df.columns
# df.printSchema() would additionally show each column's type and nullability.

['a', 'b', 'c', 'd', 'e']

In [11]:
# Show summary statistics (count, mean, stddev, min, max) for the DataFrame.
# In the output below only columns a, b and c are summarized.
df.describe().show()

+-------+---+---+-------+
|summary|  a|  b|      c|
+-------+---+---+-------+
|  count|  3|  3|      3|
|   mean|2.0|3.0|   null|
| stddev|1.0|1.0|   null|
|    min|  1|2.0|string1|
|    max|  3|4.0|string3|
+-------+---+---+-------+



In [12]:
# collect() vs tail() vs take():
# collect() returns all rows as a Python list of Row objects, whereas tail(n)
# and take(n) in the following cells return only the last / first n rows.

df.collect()

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0)),
 Row(a=2, b=3.0, c='string2', d=datetime.date(2000, 2, 1), e=datetime.datetime(2000, 1, 2, 12, 0)),
 Row(a=3, b=4.0, c='string3', d=datetime.date(2000, 3, 1), e=datetime.datetime(2000, 1, 3, 12, 0))]

In [13]:
# tail(1) returns a list holding only the last row.
df.tail(1)

[Row(a=3, b=4.0, c='string3', d=datetime.date(2000, 3, 1), e=datetime.datetime(2000, 1, 3, 12, 0))]

In [14]:
# take(1) returns a list holding only the first row.
df.take(1)

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0))]

In [15]:
# Convert a Spark DataFrame to a pandas DataFrame with toPandas().
# spark.sql.execution.arrow.pyspark.enabled asks Spark to use Arrow for the
# conversion; as the warning in the output shows, PyArrow is not installed in
# this environment, so Spark falls back to the non-Arrow path.
# NOTE: this rebinds `df` to a smaller three-column frame (a, b, c); the cells
# that follow operate on this frame.
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', True)
df = spark.createDataFrame([
    Row(a=1, b=2., c='string1'),
    Row(a=2, b=3., c='string2'),
    Row(a=4, b=5., c='string3')
])
df.toPandas()

  PyArrow >= 1.0.0 must be installed; however, it was not found.
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


Unnamed: 0,a,b,c
0,1,2.0,string1
1,2,3.0,string2
2,4,5.0,string3


In [16]:
# Select a single column (c) and display it.
df.select(df.c).show()

+-------+
|      c|
+-------+
|string1|
|string2|
|string3|
+-------+



In [17]:
from pyspark.sql import Column
from pyspark.sql.functions import upper

# A column reference, a function applied to a column, and a null test are all
# Column expressions. Assert it so the check is actually enforced: a bare
# comparison in the middle of a cell is evaluated and then silently discarded.
assert all(
    isinstance(expr, Column)
    for expr in (df.c, upper(df.c), df.c.isNull())
)

# withColumn() returns a new DataFrame with the extra column appended;
# `df` itself is left unchanged.
df.withColumn('upper_c', upper(df.c)).show()

+---+---+-------+-------+
|  a|  b|      c|upper_c|
+---+---+-------+-------+
|  1|2.0|string1|STRING1|
|  2|3.0|string2|STRING2|
|  4|5.0|string3|STRING3|
+---+---+-------+-------+



In [18]:
# To select a subset of rows, use DataFrame.filter() with a Column condition;
# here only rows whose column a equals 1 are kept.
df.filter(df.a == 1).show()

+---+---+-------+
|  a|  b|      c|
+---+---+-------+
|  1|2.0|string1|
+---+---+-------+

