# Converting Data to a Spark DataFrame and then to a Pandas DataFrame

In [1]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType

# Build (or reuse) a SparkSession that runs against a local master.
spark = (
    SparkSession.builder
    .appName('test')
    .master("local")
    .getOrCreate()
)

# Source data: a plain Python list of (dept_name, dept_id) tuples.
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
print('Type of Dept:', type(dept))

# Distribute the list as an RDD.
rdd = spark.sparkContext.parallelize(dept)
print('Type of RDD:', type(rdd))

# RDD -> Spark DataFrame without supplying column names.
# (Inspect with df.printSchema() / df.show(truncate=False) when needed.)
df = rdd.toDF()

# RDD -> Spark DataFrame with explicit column names.
# (Inspect with df2.printSchema() / df2.show(truncate=False) when needed.)
deptColumns = ["dept_name", "dept_id"]
df2 = rdd.toDF(deptColumns)

# Same conversion through createDataFrame(), passing only the column names
# as the schema; the column types are not declared here.
deptDF = spark.createDataFrame(
    rdd,
    schema=deptColumns,
)
deptDF.printSchema()
deptDF.show(truncate=False)

# createDataFrame() with an explicit StructType: every column in deptColumns
# is declared as a nullable string, so dept_id is declared as a string here
# even though the source tuples hold integers.
deptSchema = StructType(
    [StructField(column_name, StringType(), True) for column_name in deptColumns]
)

deptDF1 = spark.createDataFrame(
    rdd,
    schema=deptSchema,
)
deptDF1.printSchema()
deptDF1.show(truncate=False)

# Bring the Spark DataFrame back as a pandas DataFrame.
# Note: no Arrow setting is applied anywhere in this cell.
result_pdf = deptDF1.select("*").toPandas()
print(result_pdf)
print(type(result_pdf))

Type of Dept: <class 'list'>
Type of RDD: <class 'pyspark.rdd.RDD'>
root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: string (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+

   dept_name dept_id
0    Finance      10
1  Marketing      20
2      Sales      30
3         IT      40
<class 'pandas.core.frame.DataFrame'>


# Converting a Pandas DataFrame to a Spark DataFrame and back using Arrow

In [23]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
import numpy as np
import pandas as pd

# Obtain a local SparkSession for the pandas <-> Spark round trip.
spark = (
    SparkSession.builder
    .appName("SparkDFtoPandasDF")
    .master("local")
    .getOrCreate()
)

# Switch on Arrow-based columnar transfers for the conversions below.
# NOTE(review): newer Spark releases may expect the key
# "spark.sql.execution.arrow.pyspark.enabled" instead -- confirm against the
# Spark version this notebook runs on.
spark.conf.set(
    "spark.sql.execution.arrow.enabled",
    "true",
)

# Sample input: a 10 x 3 pandas DataFrame of random floats.
# No seed is set, so the values differ from run to run.
pdf = pd.DataFrame(
    np.random.rand(10, 3)
)
print(pdf)

# pandas -> Spark
df = spark.createDataFrame(pdf)
print('TYpe of df:', type(df))

# Spark -> pandas
result_pdf = (
    df
    .select("*")
    .toPandas()
)
print(result_pdf)
print('TYpe of result_pdf:', type(result_pdf))

          0         1         2
0  0.359982  0.413753  0.879132
1  0.863343  0.795376  0.388219
2  0.086843  0.809729  0.011066
3  0.656084  0.345846  0.514123
4  0.161523  0.639518  0.679849
5  0.236000  0.312934  0.322354
6  0.793412  0.624603  0.760949
7  0.202881  0.473225  0.555773
8  0.554594  0.288513  0.623083
9  0.313588  0.143480  0.267848
TYpe of df: <class 'pyspark.sql.dataframe.DataFrame'>
          0         1         2
0  0.359982  0.413753  0.879132
1  0.863343  0.795376  0.388219
2  0.086843  0.809729  0.011066
3  0.656084  0.345846  0.514123
4  0.161523  0.639518  0.679849
5  0.236000  0.312934  0.322354
6  0.793412  0.624603  0.760949
7  0.202881  0.473225  0.555773
8  0.554594  0.288513  0.623083
9  0.313588  0.143480  0.267848
TYpe of result_pdf: <class 'pandas.core.frame.DataFrame'>


In [None]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

# Obtain a local SparkSession for reading the sample CSV.
spark = SparkSession.builder.master('local').appName('DataFrame_Hdfs').getOrCreate()

# Read the CSV, treating the first row as a header and letting Spark infer
# the column types.
# NOTE(review): this is an absolute, machine-specific path; consider deriving
# it from a configurable data directory.
data_df = spark.read.csv('C:\\Project\\Files\\Input\\csv\\Sample.csv', inferSchema=True, header=True)

# Alternative reader-options form of the same read:
# data_df = spark.read.format('csv').options(inferSchema=True, header=True,delimiter=',').csv('C:\\Project\\Files\\Input\\csv\\Sample.csv')

# show() and printSchema() write to stdout themselves and return None, so they
# are called directly; wrapping them in print() produced a stray "None" line
# and a "Schema: None" line after the schema tree.
data_df.show()

print('Schema:')
data_df.printSchema()
# print('The partition size:', data_df.rdd.getNumPartitions())

# print('Default Parallelism:', spark.sparkContext.defaultParallelism)