## Creating a DataFrame using spark.createDataFrame

In [1]:
# findspark.init() locates the local Spark installation and makes pyspark importable; run it before importing pyspark
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark import SparkContext
# Start a Spark session named "practice", or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("practice")
    .getOrCreate()
)

# One sample record; each tuple position lines up with a name in `columns`.
data = [
    ('James', '', 'Smith', '1991-04-01', 'M', 3000),
]

columns = [
    "firstName",
    "middleName",
    "lastName",
    "dob",
    "gender",
    "salary",
]

# Only column names are supplied, so Spark infers each column's type from the data.
df = spark.createDataFrame(data, columns)
df.printSchema()
df.show()

root
 |-- firstName: string (nullable = true)
 |-- middleName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+---------+----------+--------+----------+------+------+
|firstName|middleName|lastName|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
+---------+----------+--------+----------+------+------+



## Creating a DataFrame using spark.read.csv

In [2]:
# Load the CSV, treating the first line as the header and letting Spark
# infer each column's type from the file contents.
df2 = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('new_sales_data.csv')
)

# Show the rows, then the inferred schema.
df2.show()
df2.printSchema()

# Last expression of the cell: the rows as a list of Row objects on the driver.
df2.collect()

+----------+-------+------+--------+
|      Date|Product|Amount|Customer|
+----------+-------+------+--------+
|2022-01-01| Apples|   100|    John|
|2022-01-01|Oranges|   150|   Alice|
|2022-01-02|Bananas|    80|     Bob|
|2022-01-02| Apples|   120|     Eve|
+----------+-------+------+--------+

root
 |-- Date: date (nullable = true)
 |-- Product: string (nullable = true)
 |-- Amount: integer (nullable = true)
 |-- Customer: string (nullable = true)



[Row(Date=datetime.date(2022, 1, 1), Product='Apples', Amount=100, Customer='John'),
 Row(Date=datetime.date(2022, 1, 1), Product='Oranges', Amount=150, Customer='Alice'),
 Row(Date=datetime.date(2022, 1, 2), Product='Bananas', Amount=80, Customer='Bob'),
 Row(Date=datetime.date(2022, 1, 2), Product='Apples', Amount=120, Customer='Eve')]

## Creating an RDD using sparkContext.parallelize() and converting it to a DataFrame

In [3]:
# (language, cost) pairs used as the sample input.
dataList = [
    ('Java', 1000),
    ('Python', 20000),
    ('Scala', 24004),
]

# parallelize() distributes the list as an RDD; toDF() then names the columns.
# Note: despite its name, `rdd1` holds the resulting DataFrame, not the RDD.
rdd1 = (
    spark.sparkContext
    .parallelize(dataList)
    .toDF(['Language', 'Cost'])
)

In [4]:
# Print the contents of `rdd1` as a text table.
rdd1.show()

+--------+-----+
|Language| Cost|
+--------+-----+
|    Java| 1000|
|  Python|20000|
|   Scala|24004|
+--------+-----+



In [5]:
# Bring every row back to the driver; the cell displays them as a list of Row objects.
rdd1.collect()

[Row(Language='Java', Cost=1000),
 Row(Language='Python', Cost=20000),
 Row(Language='Scala', Cost=24004)]

## Reading a text file into a DataFrame using spark.read.text

In [6]:
# Read salary.txt as a DataFrame with a single `value` column
# (one row per line of the file), then pull the rows back to the driver.
lines = spark.read.text("salary.txt")
llist = lines.collect()

# Print each collected Row on its own line.
for line in llist:
    print(line)

Row(value="'Name','Amount'")
Row(value="'James',2000")
Row(value="'David',3000")
Row(value="'Ron',9000")
