## Import Required modules

In [1]:
# Keeps print() as a function if this ever runs on Python 2 (no effect on Python 3).
from __future__ import print_function
# findspark.init() is called before `import pyspark` on purpose — presumably it
# locates the local Spark install and makes pyspark importable; keep this order.
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

## Initialize SparkSession

In [2]:
# getOrCreate() hands back an already-running session when there is one,
# otherwise it starts a new session under the application name given here.
spark = (
    SparkSession.builder
    .appName("Create Dataframes")
    .getOrCreate()
)

## Create DataFrame with a list of tuples and an explicit schema

In [11]:
from datetime import date, datetime

# Column names and Spark SQL types, written as a DDL-style schema string.
schema_ddl = 'col1 long, col2 double, col3 string, col4 date, col5 timestamp'

# One tuple per row, values given in the same order as the schema columns.
data = [
    (1, 1.0, 'abc', date(2020, 1, 1), datetime(2020, 1, 1, 12, 0)),
    (2, 2.0, 'xyz', date(2020, 2, 1), datetime(2020, 1, 2, 12, 0)),
    (3, 3.0, 'a12', date(2020, 3, 1), datetime(2020, 1, 3, 12, 0)),
]

df = spark.createDataFrame(data, schema=schema_ddl)

In [12]:
# Print the column names, types and nullability to confirm the explicit schema was applied.
df.printSchema()

root
 |-- col1: long (nullable = true)
 |-- col2: double (nullable = true)
 |-- col3: string (nullable = true)
 |-- col4: date (nullable = true)
 |-- col5: timestamp (nullable = true)



In [13]:
# Print the rows of df as a text table using the default display settings.
df.show()

+----+----+----+----------+-------------------+
|col1|col2|col3|      col4|               col5|
+----+----+----+----------+-------------------+
|   1| 1.0| abc|2020-01-01|2020-01-01 12:00:00|
|   2| 2.0| xyz|2020-02-01|2020-01-02 12:00:00|
|   3| 3.0| a12|2020-03-01|2020-01-03 12:00:00|
+----+----+----+----------+-------------------+



## Create Dataframe with list of Rows

pyspark.sql.Row : A row of data in a DataFrame.

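- A `Row` object can be created by passing named arguments. In Spark versions before 3.0 the fields are sorted by name; from Spark 3.0 onward they keep the order in which they were passed (check which applies to the installed Spark version).
- A named argument may not be omitted to indicate that a value is `None` or missing; in that case it must be set to `None` explicitly.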

In [15]:
from pyspark.sql import Row

# Every Row names its own fields, so no schema argument is passed below;
# the column names come from the keyword names used here.
rows = [
    Row(col1=1, col2=1.0, col3='abc', col4=date(2020, 1, 1), col5=datetime(2020, 1, 1, 12, 0)),
    Row(col1=2, col2=2.0, col3='xyz', col4=date(2020, 2, 1), col5=datetime(2020, 1, 2, 12, 0)),
    Row(col1=3, col2=3.0, col3='a12', col4=date(2020, 3, 1), col5=datetime(2020, 1, 3, 12, 0)),
]

df = spark.createDataFrame(rows)

In [16]:
# Print the schema of the Row-based DataFrame; no schema was passed when it was created.
df.printSchema()

root
 |-- col1: long (nullable = true)
 |-- col2: double (nullable = true)
 |-- col3: string (nullable = true)
 |-- col4: date (nullable = true)
 |-- col5: timestamp (nullable = true)



In [17]:
# truncate=False prints each cell value in full instead of cutting long values short.
df.show(truncate=False)

+----+----+----+----------+-------------------+
|col1|col2|col3|col4      |col5               |
+----+----+----+----------+-------------------+
|1   |1.0 |abc |2020-01-01|2020-01-01 12:00:00|
|2   |2.0 |xyz |2020-02-01|2020-01-02 12:00:00|
|3   |3.0 |a12 |2020-03-01|2020-01-03 12:00:00|
+----+----+----+----------+-------------------+



## Create DataFrame from a pandas DataFrame

In [20]:
# As recorded in this notebook, this cell stopped at the import with
# ModuleNotFoundError because pandas was not installed yet; the same code is
# run again further down once pandas has been installed.
import pandas as pd

# Column name -> list of values; the dict's key order sets the column order.
columns = {
    'col1': [1, 2, 3],
    'col2': [1.0, 2.0, 4.0],
    'col3': ['abc', 'xyz', 'a12'],
    'col4': [date(2020, 1, 1), date(2020, 2, 1), date(2020, 3, 1)],
    'col5': [datetime(2020, 1, 1, 12, 0), datetime(2020, 1, 2, 12, 0), datetime(2020, 1, 3, 12, 0)],
}
pd_df = pd.DataFrame(columns)

# The pandas frame is handed to Spark as-is; no schema argument is given.
df = spark.createDataFrame(pd_df)

ModuleNotFoundError: No module named 'pandas'

In [21]:
!pip install pandas

Collecting pandas
  Downloading pandas-1.1.5-cp36-cp36m-win_amd64.whl (8.7 MB)
Collecting pytz>=2017.2
  Downloading pytz-2021.1-py2.py3-none-any.whl (510 kB)
Collecting numpy>=1.15.4
  Downloading numpy-1.19.5-cp36-cp36m-win_amd64.whl (13.2 MB)
Installing collected packages: pytz, numpy, pandas
Successfully installed numpy-1.19.5 pandas-1.1.5 pytz-2021.1


In [22]:
import pandas as pd

# Column name -> list of values; the dict's key order sets the column order.
# Note: col2 ends in 4.0 here, unlike the 3.0 used in the earlier examples.
columns = {
    'col1': [1, 2, 3],
    'col2': [1.0, 2.0, 4.0],
    'col3': ['abc', 'xyz', 'a12'],
    'col4': [date(2020, 1, 1), date(2020, 2, 1), date(2020, 3, 1)],
    'col5': [datetime(2020, 1, 1, 12, 0), datetime(2020, 1, 2, 12, 0), datetime(2020, 1, 3, 12, 0)],
}
pd_df = pd.DataFrame(columns)

# The pandas frame is handed to Spark as-is; no schema argument is given.
df3 = spark.createDataFrame(pd_df)

In [23]:
# Print the schema Spark assigned to the DataFrame built from the pandas frame.
df3.printSchema()

root
 |-- col1: long (nullable = true)
 |-- col2: double (nullable = true)
 |-- col3: string (nullable = true)
 |-- col4: date (nullable = true)
 |-- col5: timestamp (nullable = true)



In [24]:
# truncate=False prints each cell value in full instead of cutting long values short.
df3.show(truncate=False)

+----+----+----+----------+-------------------+
|col1|col2|col3|col4      |col5               |
+----+----+----+----------+-------------------+
|1   |1.0 |abc |2020-01-01|2020-01-01 12:00:00|
|2   |2.0 |xyz |2020-02-01|2020-01-02 12:00:00|
|3   |4.0 |a12 |2020-03-01|2020-01-03 12:00:00|
+----+----+----+----------+-------------------+

