In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that the rest of the notebook relies on
spark = (
    SparkSession.builder
    .appName("PySpark Test")
    .getOrCreate()
)

# Three (name, age) records plus the column labels to attach to them
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Cathy", 35),
]
columns = ["Name", "Age"]
df = spark.createDataFrame(data, schema=columns)

# Render the rows as a text table
df.show()


+-----+---+
| Name|Age|
+-----+---+
|Alice| 25|
|  Bob| 30|
|Cathy| 35|
+-----+---+



## Setting up another DataFrame

In [2]:
# A single-argument range is the exclusive upper bound: ids 0 through 4
df_range_1 = spark.range(5)
df_range_1.show(n=5, truncate=False)

+---+
|id |
+---+
|0  |
|1  |
|2  |
|3  |
|4  |
+---+



In [3]:
# Every second id starting at 1 and stopping before 10, shown untruncated
df_range_2 = spark.range(1, 10, 2)
df_range_2.show(n=10, truncate=False)

+---+
|id |
+---+
|1  |
|3  |
|5  |
|7  |
|9  |
+---+



In [4]:
# Raw user rows as [id, name]; the last row has no name, so it shows up as NULL
_data = [
    ["1", "Sam"],
    ["2", "Liam"],
    ["3", "Josh"],
    ["4", None],
]

# Column labels, in the same order as the values inside each row
_cols = ["id", "name"]

# Build the DataFrame; only column names are given, so Spark infers the types
df_users = spark.createDataFrame(_data, _cols)

# Inspect the inferred schema first, then the rows themselves
df_users.printSchema()
df_users.show(truncate=False)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)

+---+----+
|id |name|
+---+----+
|1  |Sam |
|2  |Liam|
|3  |Josh|
|4  |NULL|
+---+----+



### Converting the data into an RDD and then into a DataFrame

In [5]:
# Turn the local list of rows into an RDD
_data_rdd = spark.sparkContext.parallelize(_data)
# NOTE(review): this is not the last expression of the cell, so the notebook
# never displays its result — wrap it in print()/display() if it should be shown
_data_rdd.collect()

# Last expression of the cell, so this is the value displayed below:
# the number of partitions the RDD was split into
_data_rdd.getNumPartitions()

16

In [6]:
# Attach the column names in _cols to the RDD to get a DataFrame back
df_users_new = _data_rdd.toDF(schema=_cols)
df_users_new.show()

+---+----+
| id|name|
+---+----+
|  1| Sam|
|  2|Liam|
|  3|Josh|
|  4|NULL|
+---+----+



### Creating a schema from a DDL-formatted string

In [7]:
# NOTE(review): the leading underscore suggests this is a private helper of
# pyspark.sql.types — presumably not a stable public API; confirm before relying on it
from pyspark.sql.types import _parse_datatype_string
# Describe the schema as a string of comma-separated "<column> <type>" pairs
_schema_str = "id int, name string"
# Parse the string into a schema object (a StructType, per the printed output)
_schema = _parse_datatype_string(_schema_str)
print(_schema)

StructType([StructField('id', IntegerType(), True), StructField('name', StringType(), True)])


In [8]:
# "not null" marks id as non-nullable; columns without it stay nullable
# NOTE(review): `name` is declared as double here — presumably only to show a
# different type; confirm this is intentional
_schema_str2 = "id int not null, name double,subjects string"
_schema2 = _parse_datatype_string(_schema_str2)

In [9]:
# Print the parsed schema to check each field's type and nullability flag
print(_schema2)

StructType([StructField('id', IntegerType(), False), StructField('name', DoubleType(), True), StructField('subjects', StringType(), True)])


### Parsing map and array types into a schema

In [10]:
# Complex types use angle brackets: map<key type, value type> and array<element type>
_schema_str3 = "id int, name map<string, string>, subject array<string>"
# Parse the string; the printed result shows MapType and ArrayType fields
_schema3 = _parse_datatype_string(_schema_str3)
print(_schema3)

StructType([StructField('id', IntegerType(), True), StructField('name', MapType(StringType(), StringType(), True), True), StructField('subject', ArrayType(StringType(), True), True)])
