# Converting Data to a Spark DataFrame and to a Pandas DataFrame

In [1]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType

# Start (or reuse) a local Spark session for this walkthrough.
spark = (
    SparkSession.builder
    .appName("test")
    .master("local")
    .getOrCreate()
)

# Sample input: a plain Python list of (department name, department id) tuples.
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
print("Type of Dept:", type(dept))

# Step 1: hand the list to Spark as an RDD.
rdd = spark.sparkContext.parallelize(dept)
print("Type of RDD:", type(rdd))

# Step 2: RDD -> Spark DataFrame with toDF(), once without and once with
# explicit column names. Neither frame is displayed in this cell.
deptColumns = ["dept_name", "dept_id"]
df = rdd.toDF()
df2 = rdd.toDF(deptColumns)

# Step 3: the same conversion through createDataFrame(), passing only the
# column names so that Spark works out the column types itself.
deptDF = spark.createDataFrame(rdd, schema=deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)

# Step 4: createDataFrame() again, this time with an explicit schema that
# declares both columns as nullable strings (so dept_id is read as a string).
deptSchema = StructType(
    [StructField(column, StringType(), True) for column in ("dept_name", "dept_id")]
)

deptDF1 = spark.createDataFrame(rdd, schema=deptSchema)
deptDF1.printSchema()
deptDF1.show(truncate=False)

# Step 5: bring the Spark DataFrame back to the driver as a pandas DataFrame.
# No Arrow setting is configured in this cell.
all_columns = deptDF1.select("*")
result_pdf = all_columns.toPandas()
print(result_pdf)
print(type(result_pdf))

Type of Dept: <class 'list'>
Type of RDD: <class 'pyspark.rdd.RDD'>
root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: string (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+

   dept_name dept_id
0    Finance      10
1  Marketing      20
2      Sales      30
3         IT      40
<class 'pandas.core.frame.DataFrame'>


# Pandas_DataFrame to Spark_DataFrame

In [35]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
import numpy as np
import pandas as pd

# Create (or reuse) a local Spark session for the pandas <-> Spark round trip.
spark = SparkSession.builder.appName("SparkDFtoPandasDF").master("local").getOrCreate()

# Enable Arrow-based columnar data transfers between Spark and pandas.
# NOTE(review): newer Spark releases appear to name this key
# "spark.sql.execution.arrow.pyspark.enabled" -- confirm against the Spark
# version in use before changing it.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

# Generate a 10x3 pandas DataFrame of random floats.
pdf = pd.DataFrame(np.random.rand(10, 3))

# Create a Spark DataFrame from the pandas DataFrame.
sdf = spark.createDataFrame(pdf)
# show() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
sdf.show()
print('TYpe of df:', type(sdf))

# Convert the Spark DataFrame back to a pandas DataFrame. All columns are
# wanted, so no explicit select("*") is needed before toPandas().
result_pdf = sdf.toPandas()
print(result_pdf)
print('TYpe of result_pdf:', type(result_pdf))

+-------------------+-------------------+-------------------+
|                  0|                  1|                  2|
+-------------------+-------------------+-------------------+
| 0.3808938518514766|0.24926725694913243| 0.7876664096198096|
|   0.67340984507219|0.09393267818780915|0.17317978907063503|
| 0.6122485744219693| 0.7089185452832107| 0.6977537865821086|
| 0.9649373422850355|0.02949278375756803| 0.7009717218583225|
|0.21491857683971338| 0.1947534238002514| 0.6050142280044305|
| 0.3963480711496761|0.25348755637772313| 0.2932977515942553|
|  0.311324988383484| 0.7909107193493403|0.39680161417902027|
|0.05436951351086594| 0.8068657712936341| 0.8645587328424736|
| 0.5371481810262208|0.02548339847711878| 0.4803846945606256|
| 0.5635897908186178|0.05685372945364697|0.07251783812343349|
+-------------------+-------------------+-------------------+

None
TYpe of df: <class 'pyspark.sql.dataframe.DataFrame'>
          0         1         2
0  0.380894  0.249267  0.787666
1  0.673

In [38]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

# Create (or reuse) a local Spark session for reading the sample CSV.
spark = (
    SparkSession.builder
    .master('local')
    .appName('DataFrame_Hdfs')
    .getOrCreate()
)

# Read the CSV with the first row as the header and let Spark infer the
# column types.
# NOTE(review): this is an absolute Windows path, so the cell only runs on a
# machine where that file exists.
data_df = spark.read.csv(
    'C:\\Project\\Files\\Input\\csv\\Sample.csv',
    inferSchema=True,
    header=True,
)

# Pull every column into a pandas DataFrame, then print the frame and its type.
pdf = data_df.select("*").toPandas()
print(pdf)
print(type(pdf))

    OrderDate   Region       Rep     Item  Units  UnitCost     Total
0    1/6/2018     East     Jones   Pencil     95      1.99    189.05
1   1/23/2018  Central    Kivell   Binder     50     19.99     999.5
2    2/9/2018  Central   Jardine   Pencil     36      4.99    179.64
3   2/26/2018  Central      Gill      Pen     27     19.99    539.73
4   3/15/2018     West   Sorvino   Pencil     56      2.99    167.44
5    4/1/2018     East     Jones   Binder     60      4.99     299.4
6   4/18/2018  Central   Andrews   Pencil     75      1.99    149.25
7    5/5/2018  Central   Jardine   Pencil     90      4.99     449.1
8   5/22/2018     West  Thompson   Pencil     32      1.99     63.68
9    6/8/2018     East     Jones   Binder     60      8.99     539.4
10  6/25/2018  Central    Morgan   Pencil     90      4.99     449.1
11  7/12/2018     East    Howard   Binder     29      1.99     57.71
12  7/29/2018     East    Parent   Binder     81     19.99  1,619.19
13  8/15/2018     East     Jones  