# PySpark - DataFrames

In [2]:
'''You can manually create a PySpark DataFrame using:
1. toDF() - Converting from RDDs to DataFrame
2. createDataFrame() - Creating a DataFrame from collections like list, seq, etc.
3. Data sources like TXT, CSV, JSON, ORC, Avro, Parquet, XML formats by reading from HDFS, S3, DBFS, Azure Blob file systems, etc.
'''

'You can manually create a PySpark DataFrame using:\n1. toDF() - Converting from RDDs to DataFrame\n2. createDataFrame() - Creating a DataFrame from collections like list, seq, etc.\n3. Data sources like TXT, CSV, JSON, ORC, Avro, Parquet, XML formats by reading from HDFS, S3, DBFS, Azure Blob file systems, etc.\n'

In [3]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
# Build the SparkSession used by every example below, or reuse one that is
# already active. "local[1]" is the master URL passed to the builder.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('Pyspark-Examples')
    .getOrCreate()
)

In [4]:
# Column names applied to the DataFrames in the sections below.
columns = ["language", "users_count"]

# Sample rows as (language, users_count) tuples; the counts are strings.
data = [
    ("Java", "20000"),
    ("Python", "100000"),
    ("Scala", "3000"),
]

# Turn the local list into an RDD through the session's SparkContext.
rdd = spark.sparkContext.parallelize(data)

# 1. Using toDF() function

In [5]:
# Convert the RDD to a DataFrame without supplying column names; the schema
# printed below therefore shows the default names _1 and _2.
dfFromRDD1 = rdd.toDF()
dfFromRDD1.printSchema()


root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)



In [6]:
# Same conversion as before, but with explicit column names: the printed
# schema now shows language / users_count instead of _1 / _2.
columns = ["language", "users_count"]
dfFromRDD1 = rdd.toDF(schema=columns)
dfFromRDD1.printSchema()


root
 |-- language: string (nullable = true)
 |-- users_count: string (nullable = true)



# 2. Using createDataFrame() from SparkSession

In [15]:

# Create the DataFrame from the RDD once; toDF(...) then returns a copy
# with the requested column names.
inferred_df = spark.createDataFrame(rdd)

# Variant 1: unpack the shared `columns` list into positional names.
dfFromRDD2 = inferred_df.toDF(*columns)
dfFromRDD2.printSchema()

# Variant 2: spell the new column names out directly.
dfFromRDD3 = inferred_df.toDF('lang', 'cnt')
dfFromRDD3.printSchema()

root
 |-- language: string (nullable = true)
 |-- users_count: string (nullable = true)

root
 |-- lang: string (nullable = true)
 |-- cnt: string (nullable = true)



# 3. Create DataFrame with schema

In [13]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
# Sample rows in schema order:
# (firstname, middlename, lastname, id, gender, salary).
data2 = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

# Explicit schema: every column is a nullable string except salary, which is
# a nullable integer. Line continuations are implicit inside the brackets.
schema = StructType([
    StructField("firstname", StringType(), nullable=True),
    StructField("middlename", StringType(), nullable=True),
    StructField("lastname", StringType(), nullable=True),
    StructField("id", StringType(), nullable=True),
    StructField("gender", StringType(), nullable=True),
    StructField("salary", IntegerType(), nullable=True),
])

# Build the DataFrame from the rows and the schema, then print the schema
# and the rows without truncating cell values.
df = spark.createDataFrame(data2, schema)
df.printSchema()
df.show(truncate=False)


root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|James    |          |Smith   |36636|M     |3000  |
|Michael  |Rose      |        |40288|M     |4000  |
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Jen      |Mary      |Brown   |     |F     |-1    |
+---------+----------+--------+-----+------+------+



# 4. Create DataFrame from Data sources

In [None]:

# Format-specific readers. The calls stay commented out; the paths are
# presumably placeholders -- point them at real files before enabling.
#df2 = spark.read.csv("/src/resources/file.csv")
#df3 = spark.read.text("/src/resources/file.txt")
#df4 = spark.read.json("/src/resources/file.json")

# Alternatively, name the source in format() and call load().
# File formats such as orc, csv, avro and parquet take a path; kafka and jdbc
# sources are configured through option() calls instead of a file path.
# Avro needs the external spark-avro module, and load() expects an .avro data
# file (an .avsc file only holds the Avro schema).
#df5 = spark.read.format('avro').load('/src/resources/file.avro')


# 5. Create Empty DataFrame

In [16]:

# Create an empty DataFrame with no columns: an empty row list combined with
# an empty StructType schema.
df3 = spark.createDataFrame(data=[], schema=StructType([]))

# With no fields defined, printSchema() prints only the "root" line.
df3.printSchema()


root

